1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
97 ip_hdr(skb)->saddr,
98 tcp_hdr(skb)->dest,
99 tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
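/* Note: sysctl_tcp_tw_reuse is tri-state: 0 disables TIME-WAIT reuse,
 * 1 enables it for any outgoing connection (when timestamps make it safe),
 * and 2 restricts reuse to loopback traffic, which is what the check
 * below enforces.
 */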
114 if (reuse == 2) {
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
118 */
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 loopback = true;
129 } else
130 #endif
131 {
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
134 loopback = true;
135 }
136 if (!loopback)
137 reuse = 0;
138 }
139
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144 Actually, the idea is close to VJ's: only the timestamp cache is
145 held not per host but per port pair, and the TW bucket is used as the
146 state holder.
147 
148 If the TW bucket has already been destroyed we fall back to VJ's scheme
149 and use the initial timestamp retrieved from the peer table.
150 */
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
157 * process.
158 *
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
164 */
165 if (likely(!tp->repair)) {
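/* Pick a write_seq well past anything the old incarnation could have
 * sent (old snd_nxt plus a maximal window), so stray segments from the
 * previous connection cannot be mistaken for ours.  write_seq == 0 is
 * reserved to mean "no sequence chosen yet", hence the bump to 1.
 */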
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168 if (!seq)
169 seq = 1;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 }
174 sock_hold(sktw);
175 return 1;
176 }
177
178 return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 int addr_len)
184 {
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent BPF program called below from accessing bytes that are out
187 * of the bound specified by user in addr_len.
188 */
189 if (addr_len < sizeof(struct sockaddr_in))
190 return -EINVAL;
191
192 sock_owned_by_me(sk);
193
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
205 struct flowi4 *fl4;
206 struct rtable *rt;
207 int err;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211 if (addr_len < sizeof(struct sockaddr_in))
212 return -EINVAL;
213
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
216
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
220 if (inet_opt && inet_opt->opt.srr) {
221 if (!daddr)
222 return -EINVAL;
223 nexthop = inet_opt->opt.faddr;
224 }
225
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 IPPROTO_TCP,
232 orig_sport, orig_dport, sk);
233 if (IS_ERR(rt)) {
234 err = PTR_ERR(rt);
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 return err;
238 }
239
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 ip_rt_put(rt);
242 return -ENETUNREACH;
243 }
244
245 if (!inet_opt || !inet_opt->opt.srr)
246 daddr = fl4->daddr;
247
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
258 }
259
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
262
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
264 if (inet_opt)
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269 /* Socket identity is still unknown (sport may be zero).
270 * However we set the state to SYN-SENT and, without releasing the socket
271 * lock, select a source port, enter ourselves into the hash tables and
272 * complete initialization after this.
273 */
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
276 if (err)
277 goto failure;
278
279 sk_set_txhash(sk);
280
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
283 if (IS_ERR(rt)) {
284 err = PTR_ERR(rt);
285 rt = NULL;
286 goto failure;
287 }
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
291 rt = NULL;
292
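/* Unless the connection is being repaired, derive the initial sequence
 * number and the timestamp offset from the connection tuple and a
 * boot-time secret (in the spirit of RFC 6528), so that they are hard
 * for off-path attackers to predict.
 */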
293 if (likely(!tp->repair)) {
294 if (!tp->write_seq)
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
297 inet->inet_daddr,
298 inet->inet_sport,
299 usin->sin_port));
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 inet->inet_saddr,
302 inet->inet_daddr);
303 }
304
305 inet->inet_id = prandom_u32();
306
307 if (tcp_fastopen_defer_connect(sk, &err))
308 return err;
309 if (err)
310 goto failure;
311
312 err = tcp_connect(sk);
313
314 if (err)
315 goto failure;
316
317 return 0;
318
319 failure:
320 /*
321 * This unhashes the socket and releases the local port,
322 * if necessary.
323 */
324 tcp_set_state(sk, TCP_CLOSE);
325 ip_rt_put(rt);
326 sk->sk_route_caps = 0;
327 inet->inet_dport = 0;
328 return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
332 /*
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
336 */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
341 u32 mtu;
342
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 return;
345 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 dst = inet_csk_update_pmtu(sk, mtu);
347 if (!dst)
348 return;
349
350 /* Something is about to go wrong... Remember the soft error
351 * for the case this connection is not able to recover.
352 */
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
355
356 mtu = dst_mtu(dst);
357
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 ip_sk_accept_pmtu(sk) &&
360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
362
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
366 * discovery.
367 */
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377 if (dst)
378 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
387
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
390 */
391 if (seq != tcp_rsk(req)->snt_isn) {
392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 } else if (abort) {
394 /*
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
399 */
400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 tcp_listendrop(req->rsk_listener);
402 }
403 reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
412 struct sk_buff *skb;
413 s32 remaining;
414 u32 delta_us;
415
416 if (sock_owned_by_user(sk))
417 return;
418
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
420 !icsk->icsk_backoff)
421 return;
422
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
425 return;
426
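/* Undo one step of exponential backoff, recompute the RTO with the
 * remaining backoff applied, and re-arm the retransmit timer for
 * whatever is left of that RTO measured from the oldest unacked skb;
 * if it has already expired, retransmit immediately.
 */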
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435 if (remaining > 0) {
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
438 } else {
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
441 */
442 tcp_retransmit_timer(sk);
443 }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
454 *
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
460 *
461 */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 struct tcp_sock *tp;
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
471 struct sock *sk;
472 struct request_sock *fastopen;
473 u32 seq, snd_una;
474 int err;
475 struct net *net = dev_net(skb->dev);
476
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
479 inet_iif(skb), 0);
480 if (!sk) {
481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 return -ENOENT;
483 }
484 if (sk->sk_state == TCP_TIME_WAIT) {
485 inet_twsk_put(inet_twsk(sk));
486 return 0;
487 }
488 seq = ntohl(th->seq);
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
495 return 0;
496 }
497
498 bh_lock_sock(sk);
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
501 * We do take care of PMTU discovery (RFC1191) special case :
502 * we can receive locally generated ICMP messages while socket is held.
503 */
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 }
508 if (sk->sk_state == TCP_CLOSE)
509 goto out;
510
511 if (static_branch_unlikely(&ip4_min_ttl)) {
512 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
513 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
514 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
515 goto out;
516 }
517 }
518
519 tp = tcp_sk(sk);
520 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
521 fastopen = rcu_dereference(tp->fastopen_rsk);
522 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
523 if (sk->sk_state != TCP_LISTEN &&
524 !between(seq, snd_una, tp->snd_nxt)) {
525 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
526 goto out;
527 }
528
529 switch (type) {
530 case ICMP_REDIRECT:
531 if (!sock_owned_by_user(sk))
532 do_redirect(skb, sk);
533 goto out;
534 case ICMP_SOURCE_QUENCH:
535 /* Just silently ignore these. */
536 goto out;
537 case ICMP_PARAMETERPROB:
538 err = EPROTO;
539 break;
540 case ICMP_DEST_UNREACH:
541 if (code > NR_ICMP_UNREACH)
542 goto out;
543
544 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
545 /* We are not interested in TCP_LISTEN and open_requests
546 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
547 * they should go through unfragmented).
548 */
549 if (sk->sk_state == TCP_LISTEN)
550 goto out;
551
552 WRITE_ONCE(tp->mtu_info, info);
553 if (!sock_owned_by_user(sk)) {
554 tcp_v4_mtu_reduced(sk);
555 } else {
556 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
557 sock_hold(sk);
558 }
559 goto out;
560 }
561
562 err = icmp_err_convert[code].errno;
563 /* check if this ICMP message allows revert of backoff.
564 * (see RFC 6069)
565 */
566 if (!fastopen &&
567 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
568 tcp_ld_RTO_revert(sk, seq);
569 break;
570 case ICMP_TIME_EXCEEDED:
571 err = EHOSTUNREACH;
572 break;
573 default:
574 goto out;
575 }
576
577 switch (sk->sk_state) {
578 case TCP_SYN_SENT:
579 case TCP_SYN_RECV:
580 /* Only in fast or simultaneous open. If a fast open socket is
581 * already accepted it is treated as a connected one below.
582 */
583 if (fastopen && !fastopen->sk)
584 break;
585
586 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
587
588 if (!sock_owned_by_user(sk)) {
589 sk->sk_err = err;
590
591 sk_error_report(sk);
592
593 tcp_done(sk);
594 } else {
595 sk->sk_err_soft = err;
596 }
597 goto out;
598 }
599
600 /* If we've already connected we will keep trying
601 * until we time out, or the user gives up.
602 *
603 * rfc1122 4.2.3.9 allows us to consider as hard errors
604 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
605 * but it is obsoleted by pmtu discovery).
606 *
607 * Note that in the modern internet, where routing is unreliable
608 * and broken firewalls sit in every dark corner sending random
609 * errors ordered by their masters, even these two messages finally lose
610 * their original sense (even Linux sends invalid PORT_UNREACHs).
611 *
612 * Now we are in compliance with RFCs.
613 * --ANK (980905)
614 */
615
616 inet = inet_sk(sk);
617 if (!sock_owned_by_user(sk) && inet->recverr) {
618 sk->sk_err = err;
619 sk_error_report(sk);
620 } else { /* Only an error on timeout */
621 sk->sk_err_soft = err;
622 }
623
624 out:
625 bh_unlock_sock(sk);
626 sock_put(sk);
627 return 0;
628 }
629
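/* Prepare a partial checksum: store the complemented pseudo-header sum
 * in th->check and record, via csum_start/csum_offset, where the final
 * TCP checksum must be folded in, so the NIC or the software fallback
 * can finish it over the TCP header and payload.
 */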
630 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
631 {
632 struct tcphdr *th = tcp_hdr(skb);
633
634 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
635 skb->csum_start = skb_transport_header(skb) - skb->head;
636 skb->csum_offset = offsetof(struct tcphdr, check);
637 }
638
639 /* This routine computes an IPv4 TCP checksum. */
640 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
641 {
642 const struct inet_sock *inet = inet_sk(sk);
643
644 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
645 }
646 EXPORT_SYMBOL(tcp_v4_send_check);
647
648 /*
649 * This routine will send an RST to the other tcp.
650 *
651 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
652 * for the reset?
653 * Answer: if a packet caused an RST, it is not for a socket
654 * existing in our system; if it is matched to a socket,
655 * it is just a duplicate segment or a bug in the other side's TCP.
656 * So we build the reply based only on the parameters
657 * that arrived with the segment.
658 * Exception: precedence violation. We do not implement it in any case.
659 */
660
661 #ifdef CONFIG_TCP_MD5SIG
662 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
663 #else
664 #define OPTION_BYTES sizeof(__be32)
665 #endif
666
667 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
668 {
669 const struct tcphdr *th = tcp_hdr(skb);
670 struct {
671 struct tcphdr th;
672 __be32 opt[OPTION_BYTES / sizeof(__be32)];
673 } rep;
674 struct ip_reply_arg arg;
675 #ifdef CONFIG_TCP_MD5SIG
676 struct tcp_md5sig_key *key = NULL;
677 const __u8 *hash_location = NULL;
678 unsigned char newhash[16];
679 int genhash;
680 struct sock *sk1 = NULL;
681 #endif
682 u64 transmit_time = 0;
683 struct sock *ctl_sk;
684 struct net *net;
685
686 /* Never send a reset in response to a reset. */
687 if (th->rst)
688 return;
689
690 /* If sk is not NULL, it means we did a successful lookup and the incoming
691 * route had to be correct. prequeue might have dropped our dst.
692 */
693 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
694 return;
695
696 /* Swap the send and the receive. */
697 memset(&rep, 0, sizeof(rep));
698 rep.th.dest = th->source;
699 rep.th.source = th->dest;
700 rep.th.doff = sizeof(struct tcphdr) / 4;
701 rep.th.rst = 1;
702
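/* Per RFC 793 reset generation: if the offending segment carried an ACK,
 * the RST takes its sequence number from that ACK and needs no ACK of
 * its own; otherwise the RST keeps seq 0 and ACKs everything the segment
 * occupied (SYN and FIN each count for one sequence number).
 */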
703 if (th->ack) {
704 rep.th.seq = th->ack_seq;
705 } else {
706 rep.th.ack = 1;
707 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
708 skb->len - (th->doff << 2));
709 }
710
711 memset(&arg, 0, sizeof(arg));
712 arg.iov[0].iov_base = (unsigned char *)&rep;
713 arg.iov[0].iov_len = sizeof(rep.th);
714
715 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
716 #ifdef CONFIG_TCP_MD5SIG
717 rcu_read_lock();
718 hash_location = tcp_parse_md5sig_option(th);
719 if (sk && sk_fullsock(sk)) {
720 const union tcp_md5_addr *addr;
721 int l3index;
722
723 /* sdif set, means packet ingressed via a device
724 * in an L3 domain and inet_iif is set to it.
725 */
726 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
727 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
728 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
729 } else if (hash_location) {
730 const union tcp_md5_addr *addr;
731 int sdif = tcp_v4_sdif(skb);
732 int dif = inet_iif(skb);
733 int l3index;
734
735 /*
736 * The active side is lost. Try to find the listening socket through the
737 * source port, and then find the md5 key through the listening socket.
738 * We do not loosen security here:
739 * the incoming packet is checked against the md5 hash of the found key;
740 * no RST is generated if the md5 hash doesn't match.
741 */
742 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
743 ip_hdr(skb)->saddr,
744 th->source, ip_hdr(skb)->daddr,
745 ntohs(th->source), dif, sdif);
746 /* don't send rst if it can't find key */
747 if (!sk1)
748 goto out;
749
750 /* sdif set, means packet ingressed via a device
751 * in an L3 domain and dif is set to it.
752 */
753 l3index = sdif ? dif : 0;
754 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
755 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
756 if (!key)
757 goto out;
758
759
760 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
761 if (genhash || memcmp(hash_location, newhash, 16) != 0)
762 goto out;
763
764 }
765
766 if (key) {
767 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
768 (TCPOPT_NOP << 16) |
769 (TCPOPT_MD5SIG << 8) |
770 TCPOLEN_MD5SIG);
771 /* Update length and the length the header thinks exists */
772 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
773 rep.th.doff = arg.iov[0].iov_len / 4;
774
775 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
776 key, ip_hdr(skb)->saddr,
777 ip_hdr(skb)->daddr, &rep.th);
778 }
779 #endif
780 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
781 if (rep.opt[0] == 0) {
782 __be32 mrst = mptcp_reset_option(skb);
783
784 if (mrst) {
785 rep.opt[0] = mrst;
786 arg.iov[0].iov_len += sizeof(mrst);
787 rep.th.doff = arg.iov[0].iov_len / 4;
788 }
789 }
790
791 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792 ip_hdr(skb)->saddr, /* XXX */
793 arg.iov[0].iov_len, IPPROTO_TCP, 0);
794 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
796
797 /* When the socket is gone, all binding information is lost, so
798 * routing might fail in this case. No choice here: if we force the
799 * input interface, we will misroute in case of an asymmetric route.
800 */
801 if (sk) {
802 arg.bound_dev_if = sk->sk_bound_dev_if;
803 if (sk_fullsock(sk))
804 trace_tcp_send_reset(sk, skb);
805 }
806
807 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
808 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
809
810 arg.tos = ip_hdr(skb)->tos;
811 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
812 local_bh_disable();
813 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
814 if (sk) {
815 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
816 inet_twsk(sk)->tw_mark : sk->sk_mark;
817 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_priority : sk->sk_priority;
819 transmit_time = tcp_transmit_time(sk);
820 }
821 ip_send_unicast_reply(ctl_sk,
822 skb, &TCP_SKB_CB(skb)->header.h4.opt,
823 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
824 &arg, arg.iov[0].iov_len,
825 transmit_time);
826
827 ctl_sk->sk_mark = 0;
828 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
829 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
830 local_bh_enable();
831
832 #ifdef CONFIG_TCP_MD5SIG
833 out:
834 rcu_read_unlock();
835 #endif
836 }
837
838 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
839 outside socket context, is certainly ugly. What can I do?
840 */
841
842 static void tcp_v4_send_ack(const struct sock *sk,
843 struct sk_buff *skb, u32 seq, u32 ack,
844 u32 win, u32 tsval, u32 tsecr, int oif,
845 struct tcp_md5sig_key *key,
846 int reply_flags, u8 tos)
847 {
848 const struct tcphdr *th = tcp_hdr(skb);
849 struct {
850 struct tcphdr th;
851 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
852 #ifdef CONFIG_TCP_MD5SIG
853 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
854 #endif
855 ];
856 } rep;
857 struct net *net = sock_net(sk);
858 struct ip_reply_arg arg;
859 struct sock *ctl_sk;
860 u64 transmit_time;
861
862 memset(&rep.th, 0, sizeof(struct tcphdr));
863 memset(&arg, 0, sizeof(arg));
864
865 arg.iov[0].iov_base = (unsigned char *)&rep;
866 arg.iov[0].iov_len = sizeof(rep.th);
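/* Build the TCP timestamp option by hand: two NOPs for 32-bit alignment,
 * then kind 8 / length 10 followed by TSval and TSecr.
 */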
867 if (tsecr) {
868 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
869 (TCPOPT_TIMESTAMP << 8) |
870 TCPOLEN_TIMESTAMP);
871 rep.opt[1] = htonl(tsval);
872 rep.opt[2] = htonl(tsecr);
873 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
874 }
875
876 /* Swap the send and the receive. */
877 rep.th.dest = th->source;
878 rep.th.source = th->dest;
879 rep.th.doff = arg.iov[0].iov_len / 4;
880 rep.th.seq = htonl(seq);
881 rep.th.ack_seq = htonl(ack);
882 rep.th.ack = 1;
883 rep.th.window = htons(win);
884
885 #ifdef CONFIG_TCP_MD5SIG
886 if (key) {
887 int offset = (tsecr) ? 3 : 0;
888
889 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
890 (TCPOPT_NOP << 16) |
891 (TCPOPT_MD5SIG << 8) |
892 TCPOLEN_MD5SIG);
893 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
894 rep.th.doff = arg.iov[0].iov_len/4;
895
896 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
897 key, ip_hdr(skb)->saddr,
898 ip_hdr(skb)->daddr, &rep.th);
899 }
900 #endif
901 arg.flags = reply_flags;
902 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
903 ip_hdr(skb)->saddr, /* XXX */
904 arg.iov[0].iov_len, IPPROTO_TCP, 0);
905 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
906 if (oif)
907 arg.bound_dev_if = oif;
908 arg.tos = tos;
909 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
910 local_bh_disable();
911 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
912 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
913 inet_twsk(sk)->tw_mark : sk->sk_mark;
914 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
915 inet_twsk(sk)->tw_priority : sk->sk_priority;
916 transmit_time = tcp_transmit_time(sk);
917 ip_send_unicast_reply(ctl_sk,
918 skb, &TCP_SKB_CB(skb)->header.h4.opt,
919 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
920 &arg, arg.iov[0].iov_len,
921 transmit_time);
922
923 ctl_sk->sk_mark = 0;
924 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
925 local_bh_enable();
926 }
927
928 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
929 {
930 struct inet_timewait_sock *tw = inet_twsk(sk);
931 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
932
933 tcp_v4_send_ack(sk, skb,
934 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
935 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
936 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
937 tcptw->tw_ts_recent,
938 tw->tw_bound_dev_if,
939 tcp_twsk_md5_key(tcptw),
940 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
941 tw->tw_tos
942 );
943
944 inet_twsk_put(tw);
945 }
946
947 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
948 struct request_sock *req)
949 {
950 const union tcp_md5_addr *addr;
951 int l3index;
952
953 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
954 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
955 */
956 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
957 tcp_sk(sk)->snd_nxt;
958
959 /* RFC 7323 2.3
960 * The window field (SEG.WND) of every outgoing segment, with the
961 * exception of <SYN> segments, MUST be right-shifted by
962 * Rcv.Wind.Shift bits:
963 */
964 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
965 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
966 tcp_v4_send_ack(sk, skb, seq,
967 tcp_rsk(req)->rcv_nxt,
968 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
969 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
970 req->ts_recent,
971 0,
972 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
973 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
974 ip_hdr(skb)->tos);
975 }
976
977 /*
978 * Send a SYN-ACK after having received a SYN.
979 * This still operates on a request_sock only, not on a big
980 * socket.
981 */
982 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
983 struct flowi *fl,
984 struct request_sock *req,
985 struct tcp_fastopen_cookie *foc,
986 enum tcp_synack_type synack_type,
987 struct sk_buff *syn_skb)
988 {
989 const struct inet_request_sock *ireq = inet_rsk(req);
990 struct flowi4 fl4;
991 int err = -1;
992 struct sk_buff *skb;
993 u8 tos;
994
995 /* First, grab a route. */
996 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
997 return -1;
998
999 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1000
1001 if (skb) {
1002 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1003
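/* If tcp_reflect_tos is enabled, echo the DSCP of the incoming SYN
 * while keeping the listener's own ECN bits; otherwise use the
 * listener's TOS unchanged.
 */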
1004 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1005 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1006 (inet_sk(sk)->tos & INET_ECN_MASK) :
1007 inet_sk(sk)->tos;
1008
1009 if (!INET_ECN_is_capable(tos) &&
1010 tcp_bpf_ca_needs_ecn((struct sock *)req))
1011 tos |= INET_ECN_ECT_0;
1012
1013 rcu_read_lock();
1014 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1015 ireq->ir_rmt_addr,
1016 rcu_dereference(ireq->ireq_opt),
1017 tos);
1018 rcu_read_unlock();
1019 err = net_xmit_eval(err);
1020 }
1021
1022 return err;
1023 }
1024
1025 /*
1026 * IPv4 request_sock destructor.
1027 */
1028 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1029 {
1030 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1031 }
1032
1033 #ifdef CONFIG_TCP_MD5SIG
1034 /*
1035 * RFC2385 MD5 checksumming requires a mapping of
1036 * IP address->MD5 Key.
1037 * We need to maintain these in the sk structure.
1038 */
1039
1040 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1041 EXPORT_SYMBOL(tcp_md5_needed);
1042
1043 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1044 {
1045 if (!old)
1046 return true;
1047
1048 /* l3index always overrides non-l3index */
1049 if (old->l3index && new->l3index == 0)
1050 return false;
1051 if (old->l3index == 0 && new->l3index)
1052 return true;
1053
1054 return old->prefixlen < new->prefixlen;
1055 }
1056
1057 /* Find the Key structure for an address. */
1058 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1059 const union tcp_md5_addr *addr,
1060 int family)
1061 {
1062 const struct tcp_sock *tp = tcp_sk(sk);
1063 struct tcp_md5sig_key *key;
1064 const struct tcp_md5sig_info *md5sig;
1065 __be32 mask;
1066 struct tcp_md5sig_key *best_match = NULL;
1067 bool match;
1068
1069 /* caller either holds rcu_read_lock() or socket lock */
1070 md5sig = rcu_dereference_check(tp->md5sig_info,
1071 lockdep_sock_is_held(sk));
1072 if (!md5sig)
1073 return NULL;
1074
1075 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1076 lockdep_sock_is_held(sk)) {
1077 if (key->family != family)
1078 continue;
1079 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1080 continue;
1081 if (family == AF_INET) {
1082 mask = inet_make_mask(key->prefixlen);
1083 match = (key->addr.a4.s_addr & mask) ==
1084 (addr->a4.s_addr & mask);
1085 #if IS_ENABLED(CONFIG_IPV6)
1086 } else if (family == AF_INET6) {
1087 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1088 key->prefixlen);
1089 #endif
1090 } else {
1091 match = false;
1092 }
1093
1094 if (match && better_md5_match(best_match, key))
1095 best_match = key;
1096 }
1097 return best_match;
1098 }
1099 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1100
1101 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1102 const union tcp_md5_addr *addr,
1103 int family, u8 prefixlen,
1104 int l3index, u8 flags)
1105 {
1106 const struct tcp_sock *tp = tcp_sk(sk);
1107 struct tcp_md5sig_key *key;
1108 unsigned int size = sizeof(struct in_addr);
1109 const struct tcp_md5sig_info *md5sig;
1110
1111 /* caller either holds rcu_read_lock() or socket lock */
1112 md5sig = rcu_dereference_check(tp->md5sig_info,
1113 lockdep_sock_is_held(sk));
1114 if (!md5sig)
1115 return NULL;
1116 #if IS_ENABLED(CONFIG_IPV6)
1117 if (family == AF_INET6)
1118 size = sizeof(struct in6_addr);
1119 #endif
1120 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1121 lockdep_sock_is_held(sk)) {
1122 if (key->family != family)
1123 continue;
1124 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1125 continue;
1126 if (key->l3index != l3index)
1127 continue;
1128 if (!memcmp(&key->addr, addr, size) &&
1129 key->prefixlen == prefixlen)
1130 return key;
1131 }
1132 return NULL;
1133 }
1134
1135 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1136 const struct sock *addr_sk)
1137 {
1138 const union tcp_md5_addr *addr;
1139 int l3index;
1140
1141 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1142 addr_sk->sk_bound_dev_if);
1143 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1144 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1145 }
1146 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1147
1148 /* This can be called on a newly created socket, from other files */
1149 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1150 int family, u8 prefixlen, int l3index, u8 flags,
1151 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1152 {
1153 /* Add Key to the list */
1154 struct tcp_md5sig_key *key;
1155 struct tcp_sock *tp = tcp_sk(sk);
1156 struct tcp_md5sig_info *md5sig;
1157
1158 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1159 if (key) {
1160 /* Pre-existing entry - just update that one.
1161 * Note that the key might be used concurrently.
1162 * data_race() is telling kcsan that we do not care of
1163 * key mismatches, since changing MD5 key on live flows
1164 * can lead to packet drops.
1165 */
1166 data_race(memcpy(key->key, newkey, newkeylen));
1167
1168 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1169 * Also note that a reader could catch new key->keylen value
1170 * but old key->key[], this is the reason we use __GFP_ZERO
1171 * at sock_kmalloc() time below these lines.
1172 */
1173 WRITE_ONCE(key->keylen, newkeylen);
1174
1175 return 0;
1176 }
1177
1178 md5sig = rcu_dereference_protected(tp->md5sig_info,
1179 lockdep_sock_is_held(sk));
1180 if (!md5sig) {
1181 md5sig = kmalloc(sizeof(*md5sig), gfp);
1182 if (!md5sig)
1183 return -ENOMEM;
1184
1185 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1186 INIT_HLIST_HEAD(&md5sig->head);
1187 rcu_assign_pointer(tp->md5sig_info, md5sig);
1188 }
1189
1190 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1191 if (!key)
1192 return -ENOMEM;
1193 if (!tcp_alloc_md5sig_pool()) {
1194 sock_kfree_s(sk, key, sizeof(*key));
1195 return -ENOMEM;
1196 }
1197
1198 memcpy(key->key, newkey, newkeylen);
1199 key->keylen = newkeylen;
1200 key->family = family;
1201 key->prefixlen = prefixlen;
1202 key->l3index = l3index;
1203 key->flags = flags;
1204 memcpy(&key->addr, addr,
1205 (family == AF_INET6) ? sizeof(struct in6_addr) :
1206 sizeof(struct in_addr));
1207 hlist_add_head_rcu(&key->node, &md5sig->head);
1208 return 0;
1209 }
1210 EXPORT_SYMBOL(tcp_md5_do_add);
1211
1212 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1213 u8 prefixlen, int l3index, u8 flags)
1214 {
1215 struct tcp_md5sig_key *key;
1216
1217 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1218 if (!key)
1219 return -ENOENT;
1220 hlist_del_rcu(&key->node);
1221 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1222 kfree_rcu(key, rcu);
1223 return 0;
1224 }
1225 EXPORT_SYMBOL(tcp_md5_do_del);
1226
1227 static void tcp_clear_md5_list(struct sock *sk)
1228 {
1229 struct tcp_sock *tp = tcp_sk(sk);
1230 struct tcp_md5sig_key *key;
1231 struct hlist_node *n;
1232 struct tcp_md5sig_info *md5sig;
1233
1234 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1235
1236 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1237 hlist_del_rcu(&key->node);
1238 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1239 kfree_rcu(key, rcu);
1240 }
1241 }
1242
1243 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1244 sockptr_t optval, int optlen)
1245 {
1246 struct tcp_md5sig cmd;
1247 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1248 const union tcp_md5_addr *addr;
1249 u8 prefixlen = 32;
1250 int l3index = 0;
1251 u8 flags;
1252
1253 if (optlen < sizeof(cmd))
1254 return -EINVAL;
1255
1256 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1257 return -EFAULT;
1258
1259 if (sin->sin_family != AF_INET)
1260 return -EINVAL;
1261
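/* TCP_MD5SIG_EXT extends plain TCP_MD5SIG with optional extras carried
 * in tcpm_flags: a prefix length for address-range keys and an ifindex
 * to scope the key to an L3 master (VRF) device.
 */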
1262 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1263
1264 if (optname == TCP_MD5SIG_EXT &&
1265 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1266 prefixlen = cmd.tcpm_prefixlen;
1267 if (prefixlen > 32)
1268 return -EINVAL;
1269 }
1270
1271 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1272 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1273 struct net_device *dev;
1274
1275 rcu_read_lock();
1276 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1277 if (dev && netif_is_l3_master(dev))
1278 l3index = dev->ifindex;
1279
1280 rcu_read_unlock();
1281
1282 /* ok to reference set/not set outside of rcu;
1283 * right now device MUST be an L3 master
1284 */
1285 if (!dev || !l3index)
1286 return -EINVAL;
1287 }
1288
1289 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1290
1291 if (!cmd.tcpm_keylen)
1292 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1293
1294 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1295 return -EINVAL;
1296
1297 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1298 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1299 }
1300
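/* RFC 2385: the digest covers the IPv4 pseudo-header, the TCP header
 * with its checksum zeroed (options excluded), the segment payload and
 * the key.  This helper feeds the first two pieces into the hash.
 */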
1301 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1302 __be32 daddr, __be32 saddr,
1303 const struct tcphdr *th, int nbytes)
1304 {
1305 struct tcp4_pseudohdr *bp;
1306 struct scatterlist sg;
1307 struct tcphdr *_th;
1308
1309 bp = hp->scratch;
1310 bp->saddr = saddr;
1311 bp->daddr = daddr;
1312 bp->pad = 0;
1313 bp->protocol = IPPROTO_TCP;
1314 bp->len = cpu_to_be16(nbytes);
1315
1316 _th = (struct tcphdr *)(bp + 1);
1317 memcpy(_th, th, sizeof(*th));
1318 _th->check = 0;
1319
1320 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1321 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1322 sizeof(*bp) + sizeof(*th));
1323 return crypto_ahash_update(hp->md5_req);
1324 }
1325
1326 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1327 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1328 {
1329 struct tcp_md5sig_pool *hp;
1330 struct ahash_request *req;
1331
1332 hp = tcp_get_md5sig_pool();
1333 if (!hp)
1334 goto clear_hash_noput;
1335 req = hp->md5_req;
1336
1337 if (crypto_ahash_init(req))
1338 goto clear_hash;
1339 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1340 goto clear_hash;
1341 if (tcp_md5_hash_key(hp, key))
1342 goto clear_hash;
1343 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1344 if (crypto_ahash_final(req))
1345 goto clear_hash;
1346
1347 tcp_put_md5sig_pool();
1348 return 0;
1349
1350 clear_hash:
1351 tcp_put_md5sig_pool();
1352 clear_hash_noput:
1353 memset(md5_hash, 0, 16);
1354 return 1;
1355 }
1356
1357 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1358 const struct sock *sk,
1359 const struct sk_buff *skb)
1360 {
1361 struct tcp_md5sig_pool *hp;
1362 struct ahash_request *req;
1363 const struct tcphdr *th = tcp_hdr(skb);
1364 __be32 saddr, daddr;
1365
1366 if (sk) { /* valid for establish/request sockets */
1367 saddr = sk->sk_rcv_saddr;
1368 daddr = sk->sk_daddr;
1369 } else {
1370 const struct iphdr *iph = ip_hdr(skb);
1371 saddr = iph->saddr;
1372 daddr = iph->daddr;
1373 }
1374
1375 hp = tcp_get_md5sig_pool();
1376 if (!hp)
1377 goto clear_hash_noput;
1378 req = hp->md5_req;
1379
1380 if (crypto_ahash_init(req))
1381 goto clear_hash;
1382
1383 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1384 goto clear_hash;
1385 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1386 goto clear_hash;
1387 if (tcp_md5_hash_key(hp, key))
1388 goto clear_hash;
1389 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1390 if (crypto_ahash_final(req))
1391 goto clear_hash;
1392
1393 tcp_put_md5sig_pool();
1394 return 0;
1395
1396 clear_hash:
1397 tcp_put_md5sig_pool();
1398 clear_hash_noput:
1399 memset(md5_hash, 0, 16);
1400 return 1;
1401 }
1402 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1403
1404 #endif
1405
1406 /* Called with rcu_read_lock() */
1407 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1408 const struct sk_buff *skb,
1409 int dif, int sdif)
1410 {
1411 #ifdef CONFIG_TCP_MD5SIG
1412 /*
1413 * This gets called for each TCP segment that arrives
1414 * so we want to be efficient.
1415 * We have 3 drop cases:
1416 * o No MD5 hash and one expected.
1417 * o MD5 hash and we're not expecting one.
1418 * o MD5 hash and it's wrong.
1419 */
1420 const __u8 *hash_location = NULL;
1421 struct tcp_md5sig_key *hash_expected;
1422 const struct iphdr *iph = ip_hdr(skb);
1423 const struct tcphdr *th = tcp_hdr(skb);
1424 const union tcp_md5_addr *addr;
1425 unsigned char newhash[16];
1426 int genhash, l3index;
1427
1428 /* sdif set, means packet ingressed via a device
1429 * in an L3 domain and dif is set to the l3mdev
1430 */
1431 l3index = sdif ? dif : 0;
1432
1433 addr = (union tcp_md5_addr *)&iph->saddr;
1434 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1435 hash_location = tcp_parse_md5sig_option(th);
1436
1437 /* We've parsed the options - do we have a hash? */
1438 if (!hash_expected && !hash_location)
1439 return false;
1440
1441 if (hash_expected && !hash_location) {
1442 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1443 return true;
1444 }
1445
1446 if (!hash_expected && hash_location) {
1447 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1448 return true;
1449 }
1450
1451 /* Okay, so this is hash_expected and hash_location -
1452 * so we need to calculate the checksum.
1453 */
1454 genhash = tcp_v4_md5_hash_skb(newhash,
1455 hash_expected,
1456 NULL, skb);
1457
1458 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1459 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1460 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1461 &iph->saddr, ntohs(th->source),
1462 &iph->daddr, ntohs(th->dest),
1463 genhash ? " tcp_v4_calc_md5_hash failed"
1464 : "", l3index);
1465 return true;
1466 }
1467 return false;
1468 #endif
1469 return false;
1470 }
1471
1472 static void tcp_v4_init_req(struct request_sock *req,
1473 const struct sock *sk_listener,
1474 struct sk_buff *skb)
1475 {
1476 struct inet_request_sock *ireq = inet_rsk(req);
1477 struct net *net = sock_net(sk_listener);
1478
1479 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1480 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1481 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1482 }
1483
1484 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1485 struct sk_buff *skb,
1486 struct flowi *fl,
1487 struct request_sock *req)
1488 {
1489 tcp_v4_init_req(req, sk, skb);
1490
1491 if (security_inet_conn_request(sk, skb, req))
1492 return NULL;
1493
1494 return inet_csk_route_req(sk, &fl->u.ip4, req);
1495 }
1496
1497 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1498 .family = PF_INET,
1499 .obj_size = sizeof(struct tcp_request_sock),
1500 .rtx_syn_ack = tcp_rtx_synack,
1501 .send_ack = tcp_v4_reqsk_send_ack,
1502 .destructor = tcp_v4_reqsk_destructor,
1503 .send_reset = tcp_v4_send_reset,
1504 .syn_ack_timeout = tcp_syn_ack_timeout,
1505 };
1506
1507 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1508 .mss_clamp = TCP_MSS_DEFAULT,
1509 #ifdef CONFIG_TCP_MD5SIG
1510 .req_md5_lookup = tcp_v4_md5_lookup,
1511 .calc_md5_hash = tcp_v4_md5_hash_skb,
1512 #endif
1513 #ifdef CONFIG_SYN_COOKIES
1514 .cookie_init_seq = cookie_v4_init_sequence,
1515 #endif
1516 .route_req = tcp_v4_route_req,
1517 .init_seq = tcp_v4_init_seq,
1518 .init_ts_off = tcp_v4_init_ts_off,
1519 .send_synack = tcp_v4_send_synack,
1520 };
1521
1522 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1523 {
1524 /* Never answer SYNs sent to broadcast or multicast */
1525 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1526 goto drop;
1527
1528 return tcp_conn_request(&tcp_request_sock_ops,
1529 &tcp_request_sock_ipv4_ops, sk, skb);
1530
1531 drop:
1532 tcp_listendrop(sk);
1533 return 0;
1534 }
1535 EXPORT_SYMBOL(tcp_v4_conn_request);
1536
1537
1538 /*
1539 * The three way handshake has completed - we got a valid synack -
1540 * now create the new socket.
1541 */
1542 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1543 struct request_sock *req,
1544 struct dst_entry *dst,
1545 struct request_sock *req_unhash,
1546 bool *own_req)
1547 {
1548 struct inet_request_sock *ireq;
1549 bool found_dup_sk = false;
1550 struct inet_sock *newinet;
1551 struct tcp_sock *newtp;
1552 struct sock *newsk;
1553 #ifdef CONFIG_TCP_MD5SIG
1554 const union tcp_md5_addr *addr;
1555 struct tcp_md5sig_key *key;
1556 int l3index;
1557 #endif
1558 struct ip_options_rcu *inet_opt;
1559
1560 if (sk_acceptq_is_full(sk))
1561 goto exit_overflow;
1562
1563 newsk = tcp_create_openreq_child(sk, req, skb);
1564 if (!newsk)
1565 goto exit_nonewsk;
1566
1567 newsk->sk_gso_type = SKB_GSO_TCPV4;
1568 inet_sk_rx_dst_set(newsk, skb);
1569
1570 newtp = tcp_sk(newsk);
1571 newinet = inet_sk(newsk);
1572 ireq = inet_rsk(req);
1573 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1574 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1575 newsk->sk_bound_dev_if = ireq->ir_iif;
1576 newinet->inet_saddr = ireq->ir_loc_addr;
1577 inet_opt = rcu_dereference(ireq->ireq_opt);
1578 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1579 newinet->mc_index = inet_iif(skb);
1580 newinet->mc_ttl = ip_hdr(skb)->ttl;
1581 newinet->rcv_tos = ip_hdr(skb)->tos;
1582 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1583 if (inet_opt)
1584 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1585 newinet->inet_id = prandom_u32();
1586
1587 /* Set ToS of the new socket based upon the value of incoming SYN.
1588 * ECT bits are set later in tcp_init_transfer().
1589 */
1590 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1591 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1592
1593 if (!dst) {
1594 dst = inet_csk_route_child_sock(sk, newsk, req);
1595 if (!dst)
1596 goto put_and_exit;
1597 } else {
1598 /* syncookie case : see end of cookie_v4_check() */
1599 }
1600 sk_setup_caps(newsk, dst);
1601
1602 tcp_ca_openreq_child(newsk, dst);
1603
1604 tcp_sync_mss(newsk, dst_mtu(dst));
1605 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1606
1607 tcp_initialize_rcv_mss(newsk);
1608
1609 #ifdef CONFIG_TCP_MD5SIG
1610 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1611 /* Copy over the MD5 key from the original socket */
1612 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1613 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1614 if (key) {
1615 /*
1616 * We're using one, so create a matching key
1617 * on the newsk structure. If we fail to get
1618 * memory, then we end up not copying the key
1619 * across. Shucks.
1620 */
1621 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1622 key->key, key->keylen, GFP_ATOMIC);
1623 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1624 }
1625 #endif
1626
1627 if (__inet_inherit_port(sk, newsk) < 0)
1628 goto put_and_exit;
1629 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1630 &found_dup_sk);
1631 if (likely(*own_req)) {
1632 tcp_move_syn(newtp, req);
1633 ireq->ireq_opt = NULL;
1634 } else {
1635 newinet->inet_opt = NULL;
1636
1637 if (!req_unhash && found_dup_sk) {
1638 /* This code path should only be executed in the
1639 * syncookie case
1640 */
1641 bh_unlock_sock(newsk);
1642 sock_put(newsk);
1643 newsk = NULL;
1644 }
1645 }
1646 return newsk;
1647
1648 exit_overflow:
1649 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1650 exit_nonewsk:
1651 dst_release(dst);
1652 exit:
1653 tcp_listendrop(sk);
1654 return NULL;
1655 put_and_exit:
1656 newinet->inet_opt = NULL;
1657 inet_csk_prepare_forced_close(newsk);
1658 tcp_done(newsk);
1659 goto exit;
1660 }
1661 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1662
1663 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1664 {
1665 #ifdef CONFIG_SYN_COOKIES
1666 const struct tcphdr *th = tcp_hdr(skb);
1667
1668 if (!th->syn)
1669 sk = cookie_v4_check(sk, skb);
1670 #endif
1671 return sk;
1672 }
1673
1674 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1675 struct tcphdr *th, u32 *cookie)
1676 {
1677 u16 mss = 0;
1678 #ifdef CONFIG_SYN_COOKIES
1679 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1680 &tcp_request_sock_ipv4_ops, sk, th);
1681 if (mss) {
1682 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1683 tcp_synq_overflow(sk);
1684 }
1685 #endif
1686 return mss;
1687 }
1688
1689 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1690 u32));
1691 /* The socket must have its spinlock held when we get
1692 * here, unless it is a TCP_LISTEN socket.
1693 *
1694 * We have a potential double-lock case here, so even when
1695 * doing backlog processing we use the BH locking scheme.
1696 * This is because we cannot sleep with the original spinlock
1697 * held.
1698 */
1699 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1700 {
1701 struct sock *rsk;
1702
1703 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1704 struct dst_entry *dst;
1705
1706 dst = rcu_dereference_protected(sk->sk_rx_dst,
1707 lockdep_sock_is_held(sk));
1708
1709 sock_rps_save_rxhash(sk, skb);
1710 sk_mark_napi_id(sk, skb);
1711 if (dst) {
1712 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1713 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1714 dst, 0)) {
1715 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1716 dst_release(dst);
1717 }
1718 }
1719 tcp_rcv_established(sk, skb);
1720 return 0;
1721 }
1722
1723 if (tcp_checksum_complete(skb))
1724 goto csum_err;
1725
1726 if (sk->sk_state == TCP_LISTEN) {
1727 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1728
1729 if (!nsk)
1730 goto discard;
1731 if (nsk != sk) {
1732 if (tcp_child_process(sk, nsk, skb)) {
1733 rsk = nsk;
1734 goto reset;
1735 }
1736 return 0;
1737 }
1738 } else
1739 sock_rps_save_rxhash(sk, skb);
1740
1741 if (tcp_rcv_state_process(sk, skb)) {
1742 rsk = sk;
1743 goto reset;
1744 }
1745 return 0;
1746
1747 reset:
1748 tcp_v4_send_reset(rsk, skb);
1749 discard:
1750 kfree_skb(skb);
1751 /* Be careful here. If this function gets more complicated and
1752 * gcc suffers from register pressure on the x86, sk (in %ebx)
1753 * might be destroyed here. This current version compiles correctly,
1754 * but you have been warned.
1755 */
1756 return 0;
1757
1758 csum_err:
1759 trace_tcp_bad_csum(skb);
1760 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1761 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1762 goto discard;
1763 }
1764 EXPORT_SYMBOL(tcp_v4_do_rcv);
1765
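/* Early demux: called from the IP receive path, before routing, to look
 * up an established socket by the 4-tuple.  On a hit we attach the
 * socket and, when still valid, its cached rx dst to the skb so the
 * normal receive path can skip a route lookup.
 */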
1766 int tcp_v4_early_demux(struct sk_buff *skb)
1767 {
1768 const struct iphdr *iph;
1769 const struct tcphdr *th;
1770 struct sock *sk;
1771
1772 if (skb->pkt_type != PACKET_HOST)
1773 return 0;
1774
1775 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1776 return 0;
1777
1778 iph = ip_hdr(skb);
1779 th = tcp_hdr(skb);
1780
1781 if (th->doff < sizeof(struct tcphdr) / 4)
1782 return 0;
1783
1784 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1785 iph->saddr, th->source,
1786 iph->daddr, ntohs(th->dest),
1787 skb->skb_iif, inet_sdif(skb));
1788 if (sk) {
1789 skb->sk = sk;
1790 skb->destructor = sock_edemux;
1791 if (sk_fullsock(sk)) {
1792 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1793
1794 if (dst)
1795 dst = dst_check(dst, 0);
1796 if (dst &&
1797 sk->sk_rx_dst_ifindex == skb->skb_iif)
1798 skb_dst_set_noref(skb, dst);
1799 }
1800 }
1801 return 0;
1802 }
1803
1804 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1805 {
1806 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1807 u32 tail_gso_size, tail_gso_segs;
1808 struct skb_shared_info *shinfo;
1809 const struct tcphdr *th;
1810 struct tcphdr *thtail;
1811 struct sk_buff *tail;
1812 unsigned int hdrlen;
1813 bool fragstolen;
1814 u32 gso_segs;
1815 u32 gso_size;
1816 int delta;
1817
1818 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1819 * we can fix skb->truesize to its real value to avoid future drops.
1820 * This is valid because skb is not yet charged to the socket.
1821 * It has been noticed that pure SACK packets were sometimes dropped
1822 * (if cooked by drivers without the copybreak feature).
1823 */
1824 skb_condense(skb);
1825
1826 skb_dst_drop(skb);
1827
1828 if (unlikely(tcp_checksum_complete(skb))) {
1829 bh_unlock_sock(sk);
1830 trace_tcp_bad_csum(skb);
1831 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1832 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1833 return true;
1834 }
1835
1836 /* Attempt coalescing to last skb in backlog, even if we are
1837 * above the limits.
1838 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1839 */
1840 th = (const struct tcphdr *)skb->data;
1841 hdrlen = th->doff * 4;
1842
1843 tail = sk->sk_backlog.tail;
1844 if (!tail)
1845 goto no_coalesce;
1846 thtail = (struct tcphdr *)tail->data;
1847
1848 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1849 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1850 ((TCP_SKB_CB(tail)->tcp_flags |
1851 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1852 !((TCP_SKB_CB(tail)->tcp_flags &
1853 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1854 ((TCP_SKB_CB(tail)->tcp_flags ^
1855 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1856 #ifdef CONFIG_TLS_DEVICE
1857 tail->decrypted != skb->decrypted ||
1858 #endif
1859 thtail->doff != th->doff ||
1860 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1861 goto no_coalesce;
1862
1863 __skb_pull(skb, hdrlen);
1864
1865 shinfo = skb_shinfo(skb);
1866 gso_size = shinfo->gso_size ?: skb->len;
1867 gso_segs = shinfo->gso_segs ?: 1;
1868
1869 shinfo = skb_shinfo(tail);
1870 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1871 tail_gso_segs = shinfo->gso_segs ?: 1;
1872
1873 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1874 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1875
1876 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1877 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1878 thtail->window = th->window;
1879 }
1880
1881 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1882 * thtail->fin, so that the fast path in tcp_rcv_established()
1883 * is not entered if we append a packet with a FIN.
1884 * SYN, RST, URG are not present.
1885 * ACK is set on both packets.
1886 * PSH : the TCP stack does not really care,
1887 * at least for 'GRO' packets.
1888 */
1889 thtail->fin |= th->fin;
1890 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1891
1892 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1893 TCP_SKB_CB(tail)->has_rxtstamp = true;
1894 tail->tstamp = skb->tstamp;
1895 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1896 }
1897
1898 /* Not as strict as GRO. We only need to carry mss max value */
1899 shinfo->gso_size = max(gso_size, tail_gso_size);
1900 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1901
1902 sk->sk_backlog.len += delta;
1903 __NET_INC_STATS(sock_net(sk),
1904 LINUX_MIB_TCPBACKLOGCOALESCE);
1905 kfree_skb_partial(skb, fragstolen);
1906 return false;
1907 }
1908 __skb_push(skb, hdrlen);
1909
1910 no_coalesce:
1911 /* Only the socket owner can try to collapse/prune rx queues
1912 * to reduce memory overhead, so add a little headroom here.
1913 * Only a few socket backlogs are likely to be non-empty concurrently.
1914 */
1915 limit += 64*1024;
1916
1917 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1918 bh_unlock_sock(sk);
1919 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1920 return true;
1921 }
1922 return false;
1923 }
1924 EXPORT_SYMBOL(tcp_add_backlog);
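
/* Backlog budget sketch (illustrative numbers only): with sk_rcvbuf and
 * sk_sndbuf both at, say, 256 KB, the threshold used above is roughly
 *
 *	limit = 262144 + 262144 + 64 * 1024 = 589824 bytes of truesize,
 *
 * and a packet is accounted as LINUX_MIB_TCPBACKLOGDROP only after
 * coalescing with the backlog tail failed and sk_add_backlog() would
 * exceed that budget.
 */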
1925
1926 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1927 {
1928 struct tcphdr *th = (struct tcphdr *)skb->data;
1929
1930 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1931 }
1932 EXPORT_SYMBOL(tcp_filter);
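
/* Descriptive note: sk_filter_trim_cap() may trim the payload, but the
 * th->doff * 4 cap keeps at least the full TCP header in place; the header
 * pointers are therefore reloaded after each tcp_filter() call in
 * tcp_v4_rcv() below rather than reused.
 */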
1933
1934 static void tcp_v4_restore_cb(struct sk_buff *skb)
1935 {
1936 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1937 sizeof(struct inet_skb_parm));
1938 }
1939
1940 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1941 const struct tcphdr *th)
1942 {
1943 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1944 * barrier() makes sure the compiler won't play fool^Waliasing games.
1945 */
1946 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1947 sizeof(struct inet_skb_parm));
1948 barrier();
1949
1950 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1951 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1952 skb->len - th->doff * 4);
1953 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1954 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1955 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1956 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1957 TCP_SKB_CB(skb)->sacked = 0;
1958 TCP_SKB_CB(skb)->has_rxtstamp =
1959 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1960 }
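
/* Worked example (illustrative): for a segment carrying 100 bytes of payload
 * with only ACK set, the code above yields end_seq = seq + 0 + 0 + 100,
 * while a bare SYN or FIN consumes one extra sequence number
 * (end_seq = seq + 1).
 */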
1961
1962 /*
1963 * From tcp_input.c
1964 */
1965
1966 int tcp_v4_rcv(struct sk_buff *skb)
1967 {
1968 struct net *net = dev_net(skb->dev);
1969 int sdif = inet_sdif(skb);
1970 int dif = inet_iif(skb);
1971 const struct iphdr *iph;
1972 const struct tcphdr *th;
1973 bool refcounted;
1974 struct sock *sk;
1975 int ret;
1976
1977 if (skb->pkt_type != PACKET_HOST)
1978 goto discard_it;
1979
1980 /* Count it even if it's bad */
1981 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1982
1983 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1984 goto discard_it;
1985
1986 th = (const struct tcphdr *)skb->data;
1987
1988 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1989 goto bad_packet;
1990 if (!pskb_may_pull(skb, th->doff * 4))
1991 goto discard_it;
1992
1993 /* An explanation is required here, I think.
1994 * Packet length and doff are validated by header prediction,
1995 * provided the case of th->doff == 0 is eliminated.
1996 * So, we defer the checks. */
1997
1998 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1999 goto csum_error;
2000
2001 th = (const struct tcphdr *)skb->data;
2002 iph = ip_hdr(skb);
2003 lookup:
2004 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2005 th->dest, sdif, &refcounted);
2006 if (!sk)
2007 goto no_tcp_socket;
2008
2009 process:
2010 if (sk->sk_state == TCP_TIME_WAIT)
2011 goto do_time_wait;
2012
2013 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2014 struct request_sock *req = inet_reqsk(sk);
2015 bool req_stolen = false;
2016 struct sock *nsk;
2017
2018 sk = req->rsk_listener;
2019 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2020 sk_drops_add(sk, skb);
2021 reqsk_put(req);
2022 goto discard_it;
2023 }
2024 if (tcp_checksum_complete(skb)) {
2025 reqsk_put(req);
2026 goto csum_error;
2027 }
2028 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2029 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2030 if (!nsk) {
2031 inet_csk_reqsk_queue_drop_and_put(sk, req);
2032 goto lookup;
2033 }
2034 sk = nsk;
2035 /* reuseport_migrate_sock() has already held one sk_refcnt
2036 * before returning.
2037 */
2038 } else {
2039 /* We own a reference on the listener, increase it again
2040 * as we might lose it too soon.
2041 */
2042 sock_hold(sk);
2043 }
2044 refcounted = true;
2045 nsk = NULL;
2046 if (!tcp_filter(sk, skb)) {
2047 th = (const struct tcphdr *)skb->data;
2048 iph = ip_hdr(skb);
2049 tcp_v4_fill_cb(skb, iph, th);
2050 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2051 }
2052 if (!nsk) {
2053 reqsk_put(req);
2054 if (req_stolen) {
2055 /* Another cpu got exclusive access to req
2056 * and created a full blown socket.
2057 * Try to feed this packet to this socket
2058 * instead of discarding it.
2059 */
2060 tcp_v4_restore_cb(skb);
2061 sock_put(sk);
2062 goto lookup;
2063 }
2064 goto discard_and_relse;
2065 }
2066 if (nsk == sk) {
2067 reqsk_put(req);
2068 tcp_v4_restore_cb(skb);
2069 } else if (tcp_child_process(sk, nsk, skb)) {
2070 tcp_v4_send_reset(nsk, skb);
2071 goto discard_and_relse;
2072 } else {
2073 sock_put(sk);
2074 return 0;
2075 }
2076 }
2077
2078 if (static_branch_unlikely(&ip4_min_ttl)) {
2079 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2080 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2081 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2082 goto discard_and_relse;
2083 }
2084 }
2085
2086 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2087 goto discard_and_relse;
2088
2089 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2090 goto discard_and_relse;
2091
2092 nf_reset_ct(skb);
2093
2094 if (tcp_filter(sk, skb))
2095 goto discard_and_relse;
2096 th = (const struct tcphdr *)skb->data;
2097 iph = ip_hdr(skb);
2098 tcp_v4_fill_cb(skb, iph, th);
2099
2100 skb->dev = NULL;
2101
2102 if (sk->sk_state == TCP_LISTEN) {
2103 ret = tcp_v4_do_rcv(sk, skb);
2104 goto put_and_return;
2105 }
2106
2107 sk_incoming_cpu_update(sk);
2108
2109 bh_lock_sock_nested(sk);
2110 tcp_segs_in(tcp_sk(sk), skb);
2111 ret = 0;
2112 if (!sock_owned_by_user(sk)) {
2113 ret = tcp_v4_do_rcv(sk, skb);
2114 } else {
2115 if (tcp_add_backlog(sk, skb))
2116 goto discard_and_relse;
2117 }
2118 bh_unlock_sock(sk);
2119
2120 put_and_return:
2121 if (refcounted)
2122 sock_put(sk);
2123
2124 return ret;
2125
2126 no_tcp_socket:
2127 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2128 goto discard_it;
2129
2130 tcp_v4_fill_cb(skb, iph, th);
2131
2132 if (tcp_checksum_complete(skb)) {
2133 csum_error:
2134 trace_tcp_bad_csum(skb);
2135 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2136 bad_packet:
2137 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2138 } else {
2139 tcp_v4_send_reset(NULL, skb);
2140 }
2141
2142 discard_it:
2143 /* Discard frame. */
2144 kfree_skb(skb);
2145 return 0;
2146
2147 discard_and_relse:
2148 sk_drops_add(sk, skb);
2149 if (refcounted)
2150 sock_put(sk);
2151 goto discard_it;
2152
2153 do_time_wait:
2154 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2155 inet_twsk_put(inet_twsk(sk));
2156 goto discard_it;
2157 }
2158
2159 tcp_v4_fill_cb(skb, iph, th);
2160
2161 if (tcp_checksum_complete(skb)) {
2162 inet_twsk_put(inet_twsk(sk));
2163 goto csum_error;
2164 }
2165 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2166 case TCP_TW_SYN: {
2167 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2168 &tcp_hashinfo, skb,
2169 __tcp_hdrlen(th),
2170 iph->saddr, th->source,
2171 iph->daddr, th->dest,
2172 inet_iif(skb),
2173 sdif);
2174 if (sk2) {
2175 inet_twsk_deschedule_put(inet_twsk(sk));
2176 sk = sk2;
2177 tcp_v4_restore_cb(skb);
2178 refcounted = false;
2179 goto process;
2180 }
2181 }
2182 /* to ACK */
2183 fallthrough;
2184 case TCP_TW_ACK:
2185 tcp_v4_timewait_ack(sk, skb);
2186 break;
2187 case TCP_TW_RST:
2188 tcp_v4_send_reset(sk, skb);
2189 inet_twsk_deschedule_put(inet_twsk(sk));
2190 goto discard_it;
2191 case TCP_TW_SUCCESS:;
2192 }
2193 goto discard_it;
2194 }
2195
2196 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2197 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2198 .twsk_unique = tcp_twsk_unique,
2199 .twsk_destructor= tcp_twsk_destructor,
2200 };
2201
2202 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2203 {
2204 struct dst_entry *dst = skb_dst(skb);
2205
2206 if (dst && dst_hold_safe(dst)) {
2207 rcu_assign_pointer(sk->sk_rx_dst, dst);
2208 sk->sk_rx_dst_ifindex = skb->skb_iif;
2209 }
2210 }
2211 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2212
2213 const struct inet_connection_sock_af_ops ipv4_specific = {
2214 .queue_xmit = ip_queue_xmit,
2215 .send_check = tcp_v4_send_check,
2216 .rebuild_header = inet_sk_rebuild_header,
2217 .sk_rx_dst_set = inet_sk_rx_dst_set,
2218 .conn_request = tcp_v4_conn_request,
2219 .syn_recv_sock = tcp_v4_syn_recv_sock,
2220 .net_header_len = sizeof(struct iphdr),
2221 .setsockopt = ip_setsockopt,
2222 .getsockopt = ip_getsockopt,
2223 .addr2sockaddr = inet_csk_addr2sockaddr,
2224 .sockaddr_len = sizeof(struct sockaddr_in),
2225 .mtu_reduced = tcp_v4_mtu_reduced,
2226 };
2227 EXPORT_SYMBOL(ipv4_specific);
2228
2229 #ifdef CONFIG_TCP_MD5SIG
2230 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2231 .md5_lookup = tcp_v4_md5_lookup,
2232 .calc_md5_hash = tcp_v4_md5_hash_skb,
2233 .md5_parse = tcp_v4_parse_md5_keys,
2234 };
2235 #endif
2236
2237 /* NOTE: A lot of things are set to zero explicitly by the call to
2238 * sk_alloc(), so they need not be done here.
2239 */
2240 static int tcp_v4_init_sock(struct sock *sk)
2241 {
2242 struct inet_connection_sock *icsk = inet_csk(sk);
2243
2244 tcp_init_sock(sk);
2245
2246 icsk->icsk_af_ops = &ipv4_specific;
2247
2248 #ifdef CONFIG_TCP_MD5SIG
2249 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2250 #endif
2251
2252 return 0;
2253 }
2254
2255 void tcp_v4_destroy_sock(struct sock *sk)
2256 {
2257 struct tcp_sock *tp = tcp_sk(sk);
2258
2259 trace_tcp_destroy_sock(sk);
2260
2261 tcp_clear_xmit_timers(sk);
2262
2263 tcp_cleanup_congestion_control(sk);
2264
2265 tcp_cleanup_ulp(sk);
2266
2267 /* Cleanup up the write buffer. */
2268 tcp_write_queue_purge(sk);
2269
2270 /* Check if we want to disable active TFO */
2271 tcp_fastopen_active_disable_ofo_check(sk);
2272
2273 /* Cleans up our, hopefully empty, out_of_order_queue. */
2274 skb_rbtree_purge(&tp->out_of_order_queue);
2275
2276 #ifdef CONFIG_TCP_MD5SIG
2277 /* Clean up the MD5 key list, if any */
2278 if (tp->md5sig_info) {
2279 tcp_clear_md5_list(sk);
2280 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2281 tp->md5sig_info = NULL;
2282 }
2283 #endif
2284
2285 /* Clean up a referenced TCP bind bucket. */
2286 if (inet_csk(sk)->icsk_bind_hash)
2287 inet_put_port(sk);
2288
2289 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2290
2291 /* If socket is aborted during connect operation */
2292 tcp_free_fastopen_req(tp);
2293 tcp_fastopen_destroy_cipher(sk);
2294 tcp_saved_syn_free(tp);
2295
2296 sk_sockets_allocated_dec(sk);
2297 }
2298 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2299
2300 #ifdef CONFIG_PROC_FS
2301 /* Proc filesystem TCP sock list dumping. */
2302
2303 static unsigned short seq_file_family(const struct seq_file *seq);
2304
2305 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2306 {
2307 unsigned short family = seq_file_family(seq);
2308
2309 /* AF_UNSPEC is used as a match all */
2310 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2311 net_eq(sock_net(sk), seq_file_net(seq)));
2312 }
2313
2314 /* Find a non-empty bucket (starting from st->bucket)
2315 * and return the first sk from it.
2316 */
2317 static void *listening_get_first(struct seq_file *seq)
2318 {
2319 struct tcp_iter_state *st = seq->private;
2320
2321 st->offset = 0;
2322 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2323 struct inet_listen_hashbucket *ilb2;
2324 struct inet_connection_sock *icsk;
2325 struct sock *sk;
2326
2327 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2328 if (hlist_empty(&ilb2->head))
2329 continue;
2330
2331 spin_lock(&ilb2->lock);
2332 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2333 sk = (struct sock *)icsk;
2334 if (seq_sk_match(seq, sk))
2335 return sk;
2336 }
2337 spin_unlock(&ilb2->lock);
2338 }
2339
2340 return NULL;
2341 }
2342
2343 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2344 * If "cur" is the last one in the st->bucket,
2345 * call listening_get_first() to return the first sk of the next
2346 * non-empty bucket.
2347 */
2348 static void *listening_get_next(struct seq_file *seq, void *cur)
2349 {
2350 struct tcp_iter_state *st = seq->private;
2351 struct inet_listen_hashbucket *ilb2;
2352 struct inet_connection_sock *icsk;
2353 struct sock *sk = cur;
2354
2355 ++st->num;
2356 ++st->offset;
2357
2358 icsk = inet_csk(sk);
2359 inet_lhash2_for_each_icsk_continue(icsk) {
2360 sk = (struct sock *)icsk;
2361 if (seq_sk_match(seq, sk))
2362 return sk;
2363 }
2364
2365 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2366 spin_unlock(&ilb2->lock);
2367 ++st->bucket;
2368 return listening_get_first(seq);
2369 }
2370
2371 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2372 {
2373 struct tcp_iter_state *st = seq->private;
2374 void *rc;
2375
2376 st->bucket = 0;
2377 st->offset = 0;
2378 rc = listening_get_first(seq);
2379
2380 while (rc && *pos) {
2381 rc = listening_get_next(seq, rc);
2382 --*pos;
2383 }
2384 return rc;
2385 }
2386
2387 static inline bool empty_bucket(const struct tcp_iter_state *st)
2388 {
2389 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2390 }
2391
2392 /*
2393 * Get first established socket starting from bucket given in st->bucket.
2394 * If st->bucket is zero, the very first socket in the hash is returned.
2395 */
2396 static void *established_get_first(struct seq_file *seq)
2397 {
2398 struct tcp_iter_state *st = seq->private;
2399
2400 st->offset = 0;
2401 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2402 struct sock *sk;
2403 struct hlist_nulls_node *node;
2404 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2405
2406 /* Lockless fast path for the common case of empty buckets */
2407 if (empty_bucket(st))
2408 continue;
2409
2410 spin_lock_bh(lock);
2411 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2412 if (seq_sk_match(seq, sk))
2413 return sk;
2414 }
2415 spin_unlock_bh(lock);
2416 }
2417
2418 return NULL;
2419 }
2420
2421 static void *established_get_next(struct seq_file *seq, void *cur)
2422 {
2423 struct sock *sk = cur;
2424 struct hlist_nulls_node *node;
2425 struct tcp_iter_state *st = seq->private;
2426
2427 ++st->num;
2428 ++st->offset;
2429
2430 sk = sk_nulls_next(sk);
2431
2432 sk_nulls_for_each_from(sk, node) {
2433 if (seq_sk_match(seq, sk))
2434 return sk;
2435 }
2436
2437 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2438 ++st->bucket;
2439 return established_get_first(seq);
2440 }
2441
2442 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2443 {
2444 struct tcp_iter_state *st = seq->private;
2445 void *rc;
2446
2447 st->bucket = 0;
2448 rc = established_get_first(seq);
2449
2450 while (rc && pos) {
2451 rc = established_get_next(seq, rc);
2452 --pos;
2453 }
2454 return rc;
2455 }
2456
2457 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2458 {
2459 void *rc;
2460 struct tcp_iter_state *st = seq->private;
2461
2462 st->state = TCP_SEQ_STATE_LISTENING;
2463 rc = listening_get_idx(seq, &pos);
2464
2465 if (!rc) {
2466 st->state = TCP_SEQ_STATE_ESTABLISHED;
2467 rc = established_get_idx(seq, pos);
2468 }
2469
2470 return rc;
2471 }
2472
2473 static void *tcp_seek_last_pos(struct seq_file *seq)
2474 {
2475 struct tcp_iter_state *st = seq->private;
2476 int bucket = st->bucket;
2477 int offset = st->offset;
2478 int orig_num = st->num;
2479 void *rc = NULL;
2480
2481 switch (st->state) {
2482 case TCP_SEQ_STATE_LISTENING:
2483 if (st->bucket > tcp_hashinfo.lhash2_mask)
2484 break;
2485 st->state = TCP_SEQ_STATE_LISTENING;
2486 rc = listening_get_first(seq);
2487 while (offset-- && rc && bucket == st->bucket)
2488 rc = listening_get_next(seq, rc);
2489 if (rc)
2490 break;
2491 st->bucket = 0;
2492 st->state = TCP_SEQ_STATE_ESTABLISHED;
2493 fallthrough;
2494 case TCP_SEQ_STATE_ESTABLISHED:
2495 if (st->bucket > tcp_hashinfo.ehash_mask)
2496 break;
2497 rc = established_get_first(seq);
2498 while (offset-- && rc && bucket == st->bucket)
2499 rc = established_get_next(seq, rc);
2500 }
2501
2502 st->num = orig_num;
2503
2504 return rc;
2505 }
2506
2507 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2508 {
2509 struct tcp_iter_state *st = seq->private;
2510 void *rc;
2511
2512 if (*pos && *pos == st->last_pos) {
2513 rc = tcp_seek_last_pos(seq);
2514 if (rc)
2515 goto out;
2516 }
2517
2518 st->state = TCP_SEQ_STATE_LISTENING;
2519 st->num = 0;
2520 st->bucket = 0;
2521 st->offset = 0;
2522 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2523
2524 out:
2525 st->last_pos = *pos;
2526 return rc;
2527 }
2528 EXPORT_SYMBOL(tcp_seq_start);
2529
2530 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2531 {
2532 struct tcp_iter_state *st = seq->private;
2533 void *rc = NULL;
2534
2535 if (v == SEQ_START_TOKEN) {
2536 rc = tcp_get_idx(seq, 0);
2537 goto out;
2538 }
2539
2540 switch (st->state) {
2541 case TCP_SEQ_STATE_LISTENING:
2542 rc = listening_get_next(seq, v);
2543 if (!rc) {
2544 st->state = TCP_SEQ_STATE_ESTABLISHED;
2545 st->bucket = 0;
2546 st->offset = 0;
2547 rc = established_get_first(seq);
2548 }
2549 break;
2550 case TCP_SEQ_STATE_ESTABLISHED:
2551 rc = established_get_next(seq, v);
2552 break;
2553 }
2554 out:
2555 ++*pos;
2556 st->last_pos = *pos;
2557 return rc;
2558 }
2559 EXPORT_SYMBOL(tcp_seq_next);
2560
2561 void tcp_seq_stop(struct seq_file *seq, void *v)
2562 {
2563 struct tcp_iter_state *st = seq->private;
2564
2565 switch (st->state) {
2566 case TCP_SEQ_STATE_LISTENING:
2567 if (v != SEQ_START_TOKEN)
2568 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2569 break;
2570 case TCP_SEQ_STATE_ESTABLISHED:
2571 if (v)
2572 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2573 break;
2574 }
2575 }
2576 EXPORT_SYMBOL(tcp_seq_stop);
2577
2578 static void get_openreq4(const struct request_sock *req,
2579 struct seq_file *f, int i)
2580 {
2581 const struct inet_request_sock *ireq = inet_rsk(req);
2582 long delta = req->rsk_timer.expires - jiffies;
2583
2584 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2585 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2586 i,
2587 ireq->ir_loc_addr,
2588 ireq->ir_num,
2589 ireq->ir_rmt_addr,
2590 ntohs(ireq->ir_rmt_port),
2591 TCP_SYN_RECV,
2592 0, 0, /* could print option size, but that is af dependent. */
2593 1, /* timers active (only the expire timer) */
2594 jiffies_delta_to_clock_t(delta),
2595 req->num_timeout,
2596 from_kuid_munged(seq_user_ns(f),
2597 sock_i_uid(req->rsk_listener)),
2598 0, /* non standard timer */
2599 0, /* open_requests have no inode */
2600 0,
2601 req);
2602 }
2603
2604 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2605 {
2606 int timer_active;
2607 unsigned long timer_expires;
2608 const struct tcp_sock *tp = tcp_sk(sk);
2609 const struct inet_connection_sock *icsk = inet_csk(sk);
2610 const struct inet_sock *inet = inet_sk(sk);
2611 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2612 __be32 dest = inet->inet_daddr;
2613 __be32 src = inet->inet_rcv_saddr;
2614 __u16 destp = ntohs(inet->inet_dport);
2615 __u16 srcp = ntohs(inet->inet_sport);
2616 int rx_queue;
2617 int state;
2618
2619 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2620 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2621 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2622 timer_active = 1;
2623 timer_expires = icsk->icsk_timeout;
2624 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2625 timer_active = 4;
2626 timer_expires = icsk->icsk_timeout;
2627 } else if (timer_pending(&sk->sk_timer)) {
2628 timer_active = 2;
2629 timer_expires = sk->sk_timer.expires;
2630 } else {
2631 timer_active = 0;
2632 timer_expires = jiffies;
2633 }
2634
2635 state = inet_sk_state_load(sk);
2636 if (state == TCP_LISTEN)
2637 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2638 else
2639 /* Because we don't lock the socket,
2640 * we might find a transient negative value.
2641 */
2642 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2643 READ_ONCE(tp->copied_seq), 0);
2644
2645 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2646 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2647 i, src, srcp, dest, destp, state,
2648 READ_ONCE(tp->write_seq) - tp->snd_una,
2649 rx_queue,
2650 timer_active,
2651 jiffies_delta_to_clock_t(timer_expires - jiffies),
2652 icsk->icsk_retransmits,
2653 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2654 icsk->icsk_probes_out,
2655 sock_i_ino(sk),
2656 refcount_read(&sk->sk_refcnt), sk,
2657 jiffies_to_clock_t(icsk->icsk_rto),
2658 jiffies_to_clock_t(icsk->icsk_ack.ato),
2659 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2660 tp->snd_cwnd,
2661 state == TCP_LISTEN ?
2662 fastopenq->max_qlen :
2663 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2664 }
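
/* Output sketch (illustrative, little-endian host): the "%08X:%04X" address
 * fields above are raw network-order hex, so a socket bound to
 * 127.0.0.1:8080 appears in /proc/net/tcp as "0100007F:1F90". The remaining
 * columns follow the header emitted by tcp4_seq_show() below.
 */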
2665
2666 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2667 struct seq_file *f, int i)
2668 {
2669 long delta = tw->tw_timer.expires - jiffies;
2670 __be32 dest, src;
2671 __u16 destp, srcp;
2672
2673 dest = tw->tw_daddr;
2674 src = tw->tw_rcv_saddr;
2675 destp = ntohs(tw->tw_dport);
2676 srcp = ntohs(tw->tw_sport);
2677
2678 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2679 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2680 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2681 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2682 refcount_read(&tw->tw_refcnt), tw);
2683 }
2684
2685 #define TMPSZ 150
2686
2687 static int tcp4_seq_show(struct seq_file *seq, void *v)
2688 {
2689 struct tcp_iter_state *st;
2690 struct sock *sk = v;
2691
2692 seq_setwidth(seq, TMPSZ - 1);
2693 if (v == SEQ_START_TOKEN) {
2694 seq_puts(seq, " sl local_address rem_address st tx_queue "
2695 "rx_queue tr tm->when retrnsmt uid timeout "
2696 "inode");
2697 goto out;
2698 }
2699 st = seq->private;
2700
2701 if (sk->sk_state == TCP_TIME_WAIT)
2702 get_timewait4_sock(v, seq, st->num);
2703 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2704 get_openreq4(v, seq, st->num);
2705 else
2706 get_tcp4_sock(v, seq, st->num);
2707 out:
2708 seq_pad(seq, '\n');
2709 return 0;
2710 }
2711
2712 #ifdef CONFIG_BPF_SYSCALL
2713 struct bpf_tcp_iter_state {
2714 struct tcp_iter_state state;
2715 unsigned int cur_sk;
2716 unsigned int end_sk;
2717 unsigned int max_sk;
2718 struct sock **batch;
2719 bool st_bucket_done;
2720 };
2721
2722 struct bpf_iter__tcp {
2723 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2724 __bpf_md_ptr(struct sock_common *, sk_common);
2725 uid_t uid __aligned(8);
2726 };
2727
2728 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2729 struct sock_common *sk_common, uid_t uid)
2730 {
2731 struct bpf_iter__tcp ctx;
2732
2733 meta->seq_num--; /* skip SEQ_START_TOKEN */
2734 ctx.meta = meta;
2735 ctx.sk_common = sk_common;
2736 ctx.uid = uid;
2737 return bpf_iter_run_prog(prog, &ctx);
2738 }
2739
2740 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2741 {
2742 while (iter->cur_sk < iter->end_sk)
2743 sock_put(iter->batch[iter->cur_sk++]);
2744 }
2745
2746 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2747 unsigned int new_batch_sz)
2748 {
2749 struct sock **new_batch;
2750
2751 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2752 GFP_USER | __GFP_NOWARN);
2753 if (!new_batch)
2754 return -ENOMEM;
2755
2756 bpf_iter_tcp_put_batch(iter);
2757 kvfree(iter->batch);
2758 iter->batch = new_batch;
2759 iter->max_sk = new_batch_sz;
2760
2761 return 0;
2762 }
2763
2764 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2765 struct sock *start_sk)
2766 {
2767 struct bpf_tcp_iter_state *iter = seq->private;
2768 struct tcp_iter_state *st = &iter->state;
2769 struct inet_connection_sock *icsk;
2770 unsigned int expected = 1;
2771 struct sock *sk;
2772
2773 sock_hold(start_sk);
2774 iter->batch[iter->end_sk++] = start_sk;
2775
2776 icsk = inet_csk(start_sk);
2777 inet_lhash2_for_each_icsk_continue(icsk) {
2778 sk = (struct sock *)icsk;
2779 if (seq_sk_match(seq, sk)) {
2780 if (iter->end_sk < iter->max_sk) {
2781 sock_hold(sk);
2782 iter->batch[iter->end_sk++] = sk;
2783 }
2784 expected++;
2785 }
2786 }
2787 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2788
2789 return expected;
2790 }
2791
2792 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2793 struct sock *start_sk)
2794 {
2795 struct bpf_tcp_iter_state *iter = seq->private;
2796 struct tcp_iter_state *st = &iter->state;
2797 struct hlist_nulls_node *node;
2798 unsigned int expected = 1;
2799 struct sock *sk;
2800
2801 sock_hold(start_sk);
2802 iter->batch[iter->end_sk++] = start_sk;
2803
2804 sk = sk_nulls_next(start_sk);
2805 sk_nulls_for_each_from(sk, node) {
2806 if (seq_sk_match(seq, sk)) {
2807 if (iter->end_sk < iter->max_sk) {
2808 sock_hold(sk);
2809 iter->batch[iter->end_sk++] = sk;
2810 }
2811 expected++;
2812 }
2813 }
2814 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2815
2816 return expected;
2817 }
2818
2819 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2820 {
2821 struct bpf_tcp_iter_state *iter = seq->private;
2822 struct tcp_iter_state *st = &iter->state;
2823 unsigned int expected;
2824 bool resized = false;
2825 struct sock *sk;
2826
2827 /* The st->bucket is done. Directly advance to the next
2828 * bucket instead of having tcp_seek_last_pos() skip the
2829 * entries of the current bucket one by one, only to find
2830 * out that it has to advance to the next bucket anyway.
2831 */
2832 if (iter->st_bucket_done) {
2833 st->offset = 0;
2834 st->bucket++;
2835 if (st->state == TCP_SEQ_STATE_LISTENING &&
2836 st->bucket > tcp_hashinfo.lhash2_mask) {
2837 st->state = TCP_SEQ_STATE_ESTABLISHED;
2838 st->bucket = 0;
2839 }
2840 }
2841
2842 again:
2843 /* Get a new batch */
2844 iter->cur_sk = 0;
2845 iter->end_sk = 0;
2846 iter->st_bucket_done = false;
2847
2848 sk = tcp_seek_last_pos(seq);
2849 if (!sk)
2850 return NULL; /* Done */
2851
2852 if (st->state == TCP_SEQ_STATE_LISTENING)
2853 expected = bpf_iter_tcp_listening_batch(seq, sk);
2854 else
2855 expected = bpf_iter_tcp_established_batch(seq, sk);
2856
2857 if (iter->end_sk == expected) {
2858 iter->st_bucket_done = true;
2859 return sk;
2860 }
2861
2862 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2863 resized = true;
2864 goto again;
2865 }
2866
2867 return sk;
2868 }
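
/* Batching sketch (illustrative): if a bucket holds 24 matching sockets but
 * iter->max_sk is only 16, the pass above batches 16, reports expected == 24,
 * the batch is regrown to 24 * 3 / 2 = 36 entries, and the bucket is walked
 * once more so that it fits in a single batch.
 */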
2869
2870 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2871 {
2872 /* bpf iter does not support lseek, so it always
2873 * continues from where it was stop()-ped.
2874 */
2875 if (*pos)
2876 return bpf_iter_tcp_batch(seq);
2877
2878 return SEQ_START_TOKEN;
2879 }
2880
2881 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2882 {
2883 struct bpf_tcp_iter_state *iter = seq->private;
2884 struct tcp_iter_state *st = &iter->state;
2885 struct sock *sk;
2886
2887 /* Whenever seq_next() is called, the sk at iter->cur_sk has
2888 * already been shown by seq_show(), so advance to the next sk
2889 * in the batch.
2890 */
2891 if (iter->cur_sk < iter->end_sk) {
2892 /* Keeping st->num consistent in tcp_iter_state.
2893 * bpf_iter_tcp does not use st->num.
2894 * meta.seq_num is used instead.
2895 */
2896 st->num++;
2897 /* Move st->offset to the next sk in the bucket such that
2898 * the future start() will resume at st->offset in
2899 * st->bucket. See tcp_seek_last_pos().
2900 */
2901 st->offset++;
2902 sock_put(iter->batch[iter->cur_sk++]);
2903 }
2904
2905 if (iter->cur_sk < iter->end_sk)
2906 sk = iter->batch[iter->cur_sk];
2907 else
2908 sk = bpf_iter_tcp_batch(seq);
2909
2910 ++*pos;
2911 /* Keeping st->last_pos consistent in tcp_iter_state.
2912 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2913 */
2914 st->last_pos = *pos;
2915 return sk;
2916 }
2917
2918 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2919 {
2920 struct bpf_iter_meta meta;
2921 struct bpf_prog *prog;
2922 struct sock *sk = v;
2923 bool slow;
2924 uid_t uid;
2925 int ret;
2926
2927 if (v == SEQ_START_TOKEN)
2928 return 0;
2929
2930 if (sk_fullsock(sk))
2931 slow = lock_sock_fast(sk);
2932
2933 if (unlikely(sk_unhashed(sk))) {
2934 ret = SEQ_SKIP;
2935 goto unlock;
2936 }
2937
2938 if (sk->sk_state == TCP_TIME_WAIT) {
2939 uid = 0;
2940 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2941 const struct request_sock *req = v;
2942
2943 uid = from_kuid_munged(seq_user_ns(seq),
2944 sock_i_uid(req->rsk_listener));
2945 } else {
2946 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2947 }
2948
2949 meta.seq = seq;
2950 prog = bpf_iter_get_info(&meta, false);
2951 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2952
2953 unlock:
2954 if (sk_fullsock(sk))
2955 unlock_sock_fast(sk, slow);
2956 return ret;
2957
2958 }
2959
2960 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2961 {
2962 struct bpf_tcp_iter_state *iter = seq->private;
2963 struct bpf_iter_meta meta;
2964 struct bpf_prog *prog;
2965
2966 if (!v) {
2967 meta.seq = seq;
2968 prog = bpf_iter_get_info(&meta, true);
2969 if (prog)
2970 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2971 }
2972
2973 if (iter->cur_sk < iter->end_sk) {
2974 bpf_iter_tcp_put_batch(iter);
2975 iter->st_bucket_done = false;
2976 }
2977 }
2978
2979 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2980 .show = bpf_iter_tcp_seq_show,
2981 .start = bpf_iter_tcp_seq_start,
2982 .next = bpf_iter_tcp_seq_next,
2983 .stop = bpf_iter_tcp_seq_stop,
2984 };
2985 #endif
2986 static unsigned short seq_file_family(const struct seq_file *seq)
2987 {
2988 const struct tcp_seq_afinfo *afinfo;
2989
2990 #ifdef CONFIG_BPF_SYSCALL
2991 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2992 if (seq->op == &bpf_iter_tcp_seq_ops)
2993 return AF_UNSPEC;
2994 #endif
2995
2996 /* Iterated from proc fs */
2997 afinfo = PDE_DATA(file_inode(seq->file));
2998 return afinfo->family;
2999 }
3000
3001 static const struct seq_operations tcp4_seq_ops = {
3002 .show = tcp4_seq_show,
3003 .start = tcp_seq_start,
3004 .next = tcp_seq_next,
3005 .stop = tcp_seq_stop,
3006 };
3007
3008 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3009 .family = AF_INET,
3010 };
3011
3012 static int __net_init tcp4_proc_init_net(struct net *net)
3013 {
3014 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3015 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3016 return -ENOMEM;
3017 return 0;
3018 }
3019
3020 static void __net_exit tcp4_proc_exit_net(struct net *net)
3021 {
3022 remove_proc_entry("tcp", net->proc_net);
3023 }
3024
3025 static struct pernet_operations tcp4_net_ops = {
3026 .init = tcp4_proc_init_net,
3027 .exit = tcp4_proc_exit_net,
3028 };
3029
3030 int __init tcp4_proc_init(void)
3031 {
3032 return register_pernet_subsys(&tcp4_net_ops);
3033 }
3034
3035 void tcp4_proc_exit(void)
3036 {
3037 unregister_pernet_subsys(&tcp4_net_ops);
3038 }
3039 #endif /* CONFIG_PROC_FS */
3040
3041 /* @wake is one when sk_stream_write_space() calls us.
3042 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3043 * This mimics the strategy used in sock_def_write_space().
3044 */
3045 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3046 {
3047 const struct tcp_sock *tp = tcp_sk(sk);
3048 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3049 READ_ONCE(tp->snd_nxt);
3050
3051 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3052 }
3053 EXPORT_SYMBOL(tcp_stream_memory_free);
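
/* Worked example (illustrative): with tcp_notsent_lowat(tp) == 131072 and
 * wake == 1, the test above becomes (notsent_bytes << 1) < 131072, i.e.
 * EPOLLOUT is signalled only once fewer than 64 KB remain unsent, consistent
 * with the strategy described in the comment above.
 */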
3054
3055 struct proto tcp_prot = {
3056 .name = "TCP",
3057 .owner = THIS_MODULE,
3058 .close = tcp_close,
3059 .pre_connect = tcp_v4_pre_connect,
3060 .connect = tcp_v4_connect,
3061 .disconnect = tcp_disconnect,
3062 .accept = inet_csk_accept,
3063 .ioctl = tcp_ioctl,
3064 .init = tcp_v4_init_sock,
3065 .destroy = tcp_v4_destroy_sock,
3066 .shutdown = tcp_shutdown,
3067 .setsockopt = tcp_setsockopt,
3068 .getsockopt = tcp_getsockopt,
3069 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3070 .keepalive = tcp_set_keepalive,
3071 .recvmsg = tcp_recvmsg,
3072 .sendmsg = tcp_sendmsg,
3073 .sendpage = tcp_sendpage,
3074 .backlog_rcv = tcp_v4_do_rcv,
3075 .release_cb = tcp_release_cb,
3076 .hash = inet_hash,
3077 .unhash = inet_unhash,
3078 .get_port = inet_csk_get_port,
3079 #ifdef CONFIG_BPF_SYSCALL
3080 .psock_update_sk_prot = tcp_bpf_update_proto,
3081 #endif
3082 .enter_memory_pressure = tcp_enter_memory_pressure,
3083 .leave_memory_pressure = tcp_leave_memory_pressure,
3084 .stream_memory_free = tcp_stream_memory_free,
3085 .sockets_allocated = &tcp_sockets_allocated,
3086 .orphan_count = &tcp_orphan_count,
3087 .memory_allocated = &tcp_memory_allocated,
3088 .memory_pressure = &tcp_memory_pressure,
3089 .sysctl_mem = sysctl_tcp_mem,
3090 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3091 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3092 .max_header = MAX_TCP_HEADER,
3093 .obj_size = sizeof(struct tcp_sock),
3094 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3095 .twsk_prot = &tcp_timewait_sock_ops,
3096 .rsk_prot = &tcp_request_sock_ops,
3097 .h.hashinfo = &tcp_hashinfo,
3098 .no_autobind = true,
3099 .diag_destroy = tcp_abort,
3100 };
3101 EXPORT_SYMBOL(tcp_prot);
3102
3103 static void __net_exit tcp_sk_exit(struct net *net)
3104 {
3105 int cpu;
3106
3107 if (net->ipv4.tcp_congestion_control)
3108 bpf_module_put(net->ipv4.tcp_congestion_control,
3109 net->ipv4.tcp_congestion_control->owner);
3110
3111 for_each_possible_cpu(cpu)
3112 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3113 free_percpu(net->ipv4.tcp_sk);
3114 }
3115
3116 static int __net_init tcp_sk_init(struct net *net)
3117 {
3118 int res, cpu, cnt;
3119
3120 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3121 if (!net->ipv4.tcp_sk)
3122 return -ENOMEM;
3123
3124 for_each_possible_cpu(cpu) {
3125 struct sock *sk;
3126
3127 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3128 IPPROTO_TCP, net);
3129 if (res)
3130 goto fail;
3131 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3132
3133 /* Please enforce IP_DF and IPID==0 for RST and
3134 * ACK sent in SYN-RECV and TIME-WAIT state.
3135 */
3136 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3137
3138 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3139 }
3140
3141 net->ipv4.sysctl_tcp_ecn = 2;
3142 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3143
3144 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3145 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3146 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3147 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3148 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3149
3150 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3151 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3152 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3153
3154 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3155 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3156 net->ipv4.sysctl_tcp_syncookies = 1;
3157 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3158 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3159 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3160 net->ipv4.sysctl_tcp_orphan_retries = 0;
3161 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3162 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3163 net->ipv4.sysctl_tcp_tw_reuse = 2;
3164 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3165
3166 cnt = tcp_hashinfo.ehash_mask + 1;
3167 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3168 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3169
3170 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3171 net->ipv4.sysctl_tcp_sack = 1;
3172 net->ipv4.sysctl_tcp_window_scaling = 1;
3173 net->ipv4.sysctl_tcp_timestamps = 1;
3174 net->ipv4.sysctl_tcp_early_retrans = 3;
3175 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3176 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3177 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3178 net->ipv4.sysctl_tcp_max_reordering = 300;
3179 net->ipv4.sysctl_tcp_dsack = 1;
3180 net->ipv4.sysctl_tcp_app_win = 31;
3181 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3182 net->ipv4.sysctl_tcp_frto = 2;
3183 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3184 /* This limits the percentage of the congestion window which we
3185 * will allow a single TSO frame to consume. Building TSO frames
3186 * which are too large can cause TCP streams to be bursty.
3187 */
3188 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3189 /* Default TSQ limit of 16 TSO segments */
3190 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3191 /* rfc5961 challenge ack rate limiting */
3192 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3193 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3194 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3195 net->ipv4.sysctl_tcp_autocorking = 1;
3196 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3197 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3198 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3199 if (net != &init_net) {
3200 memcpy(net->ipv4.sysctl_tcp_rmem,
3201 init_net.ipv4.sysctl_tcp_rmem,
3202 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3203 memcpy(net->ipv4.sysctl_tcp_wmem,
3204 init_net.ipv4.sysctl_tcp_wmem,
3205 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3206 }
3207 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3208 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3209 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3210 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3211 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3212 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3213
3214 /* Reno is always built in */
3215 if (!net_eq(net, &init_net) &&
3216 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3217 init_net.ipv4.tcp_congestion_control->owner))
3218 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3219 else
3220 net->ipv4.tcp_congestion_control = &tcp_reno;
3221
3222 return 0;
3223 fail:
3224 tcp_sk_exit(net);
3225
3226 return res;
3227 }
3228
3229 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3230 {
3231 struct net *net;
3232
3233 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3234
3235 list_for_each_entry(net, net_exit_list, exit_list)
3236 tcp_fastopen_ctx_destroy(net);
3237 }
3238
3239 static struct pernet_operations __net_initdata tcp_sk_ops = {
3240 .init = tcp_sk_init,
3241 .exit = tcp_sk_exit,
3242 .exit_batch = tcp_sk_exit_batch,
3243 };
3244
3245 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3246 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3247 struct sock_common *sk_common, uid_t uid)
3248
3249 #define INIT_BATCH_SZ 16
3250
3251 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3252 {
3253 struct bpf_tcp_iter_state *iter = priv_data;
3254 int err;
3255
3256 err = bpf_iter_init_seq_net(priv_data, aux);
3257 if (err)
3258 return err;
3259
3260 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3261 if (err) {
3262 bpf_iter_fini_seq_net(priv_data);
3263 return err;
3264 }
3265
3266 return 0;
3267 }
3268
3269 static void bpf_iter_fini_tcp(void *priv_data)
3270 {
3271 struct bpf_tcp_iter_state *iter = priv_data;
3272
3273 bpf_iter_fini_seq_net(priv_data);
3274 kvfree(iter->batch);
3275 }
3276
3277 static const struct bpf_iter_seq_info tcp_seq_info = {
3278 .seq_ops = &bpf_iter_tcp_seq_ops,
3279 .init_seq_private = bpf_iter_init_tcp,
3280 .fini_seq_private = bpf_iter_fini_tcp,
3281 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3282 };
3283
3284 static const struct bpf_func_proto *
3285 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3286 const struct bpf_prog *prog)
3287 {
3288 switch (func_id) {
3289 case BPF_FUNC_setsockopt:
3290 return &bpf_sk_setsockopt_proto;
3291 case BPF_FUNC_getsockopt:
3292 return &bpf_sk_getsockopt_proto;
3293 default:
3294 return NULL;
3295 }
3296 }
3297
3298 static struct bpf_iter_reg tcp_reg_info = {
3299 .target = "tcp",
3300 .ctx_arg_info_size = 1,
3301 .ctx_arg_info = {
3302 { offsetof(struct bpf_iter__tcp, sk_common),
3303 PTR_TO_BTF_ID_OR_NULL },
3304 },
3305 .get_func_proto = bpf_iter_tcp_get_func_proto,
3306 .seq_info = &tcp_seq_info,
3307 };
3308
3309 static void __init bpf_iter_register(void)
3310 {
3311 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3312 if (bpf_iter_reg_target(&tcp_reg_info))
3313 pr_warn("Warning: could not register bpf iterator tcp\n");
3314 }
3315
3316 #endif
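
/* Usage sketch (assumption: the standard bpftool "iter" workflow; the object
 * and pin paths below are hypothetical, not part of this file): a program
 * with SEC("iter/tcp") attached to the "tcp" target registered above can be
 * pinned and read like a seq_file, e.g.
 *
 *	bpftool iter pin ./tcp_iter.bpf.o /sys/fs/bpf/tcp_iter
 *	cat /sys/fs/bpf/tcp_iter
 */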
3317
3318 void __init tcp_v4_init(void)
3319 {
3320 if (register_pernet_subsys(&tcp_sk_ops))
3321 panic("Failed to create the TCP control socket.\n");
3322
3323 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3324 bpf_iter_register();
3325 #endif
3326 }
3327