| 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | /* |
| 3 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
| 4 | * operating system. INET is implemented using the BSD Socket |
| 5 | * interface as the means of communication with the user level. |
| 6 | * |
| 7 | * Implementation of the Transmission Control Protocol(TCP). |
| 8 | * |
| 9 | * IPv4 specific functions |
| 10 | * |
| 11 | * code split from: |
| 12 | * linux/ipv4/tcp.c |
| 13 | * linux/ipv4/tcp_input.c |
| 14 | * linux/ipv4/tcp_output.c |
| 15 | * |
| 16 | * See tcp.c for author information |
| 17 | */ |
| 18 | |
| 19 | /* |
| 20 | * Changes: |
| 21 | * David S. Miller : New socket lookup architecture. |
| 22 | * This code is dedicated to John Dyson. |
| 23 | * David S. Miller : Change semantics of established hash, |
| 24 | * half is devoted to TIME_WAIT sockets |
| 25 | * and the rest go in the other half. |
| 26 | * Andi Kleen : Add support for syncookies and fixed |
| 27 | * some bugs: ip options weren't passed to |
| 28 | * the TCP layer, missed a check for an |
| 29 | * ACK bit. |
| 30 | * Andi Kleen : Implemented fast path mtu discovery. |
| 31 | * Fixed many serious bugs in the |
| 32 | * request_sock handling and moved |
| 33 | * most of it into the af independent code. |
| 34 | * Added tail drop and some other bugfixes. |
| 35 | * Added new listen semantics. |
| 36 | * Mike McLagan : Routing by source |
| 37 | * Juan Jose Ciarlante: ip_dynaddr bits |
| 38 | * Andi Kleen: various fixes. |
| 39 | * Vitaly E. Lavrov : Transparent proxy revived after year |
| 40 | * coma. |
| 41 | * Andi Kleen : Fix new listen. |
| 42 | * Andi Kleen : Fix accept error reporting. |
| 43 | * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which |
| 44 | * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind |
| 45 | * a single port at the same time. |
| 46 | */ |
| 47 | |
| 48 | #define pr_fmt(fmt) "TCP: " fmt |
| 49 | |
| 50 | #include <linux/bottom_half.h> |
| 51 | #include <linux/types.h> |
| 52 | #include <linux/fcntl.h> |
| 53 | #include <linux/module.h> |
| 54 | #include <linux/random.h> |
| 55 | #include <linux/cache.h> |
| 56 | #include <linux/fips.h> |
| 57 | #include <linux/jhash.h> |
| 58 | #include <linux/init.h> |
| 59 | #include <linux/times.h> |
| 60 | #include <linux/slab.h> |
| 61 | #include <linux/sched.h> |
| 62 | #include <linux/sock_diag.h> |
| 63 | |
| 64 | #include <net/aligned_data.h> |
| 65 | #include <net/net_namespace.h> |
| 66 | #include <net/icmp.h> |
| 67 | #include <net/inet_hashtables.h> |
| 68 | #include <net/tcp.h> |
| 69 | #include <net/tcp_ecn.h> |
| 70 | #include <net/transp_v6.h> |
| 71 | #include <net/ipv6.h> |
| 72 | #include <net/inet_common.h> |
| 73 | #include <net/inet_ecn.h> |
| 74 | #include <net/timewait_sock.h> |
| 75 | #include <net/xfrm.h> |
| 76 | #include <net/secure_seq.h> |
| 77 | #include <net/busy_poll.h> |
| 78 | #include <net/rstreason.h> |
| 79 | #include <net/psp.h> |
| 80 | |
| 81 | #include <linux/inet.h> |
| 82 | #include <linux/ipv6.h> |
| 83 | #include <linux/stddef.h> |
| 84 | #include <linux/proc_fs.h> |
| 85 | #include <linux/seq_file.h> |
| 86 | #include <linux/inetdevice.h> |
| 87 | #include <linux/btf_ids.h> |
| 88 | #include <linux/skbuff_ref.h> |
| 89 | |
| 90 | #include <crypto/md5.h> |
| 91 | |
| 92 | #include <trace/events/tcp.h> |
| 93 | |
| 94 | #ifdef CONFIG_TCP_MD5SIG |
| 95 | static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, |
| 96 | __be32 daddr, __be32 saddr, const struct tcphdr *th); |
| 97 | #endif |
| 98 | |
| 99 | struct inet_hashinfo tcp_hashinfo; |
| 100 | |
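/* Per-CPU kernel control sockets, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to emit replies without a full socket context.
 * The bh_lock serializes nested BH users of the per-CPU slot
 * (see the local_lock_nested_bh() callers below).
 */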
| 101 | static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { |
| 102 | .bh_lock = INIT_LOCAL_LOCK(bh_lock), |
| 103 | }; |
| 104 | |
| 105 | static DEFINE_MUTEX(tcp_exit_batch_mutex); |
| 106 | |
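/* tcp_v4_init_seq() derives an RFC 6528-style initial sequence number
 * from the incoming segment's reversed 4-tuple; tcp_v4_init_ts_off()
 * derives the per-connection timestamp offset from the reversed address
 * pair. Both are keyed with a kernel secret.
 */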
| 107 | static u32 tcp_v4_init_seq(const struct sk_buff *skb) |
| 108 | { |
| 109 | return secure_tcp_seq(saddr: ip_hdr(skb)->daddr, |
| 110 | daddr: ip_hdr(skb)->saddr, |
| 111 | sport: tcp_hdr(skb)->dest, |
| 112 | dport: tcp_hdr(skb)->source); |
| 113 | } |
| 114 | |
| 115 | static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) |
| 116 | { |
| 117 | return secure_tcp_ts_off(net, saddr: ip_hdr(skb)->daddr, daddr: ip_hdr(skb)->saddr); |
| 118 | } |
| 119 | |
| 120 | int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) |
| 121 | { |
| 122 | int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); |
| 123 | const struct inet_timewait_sock *tw = inet_twsk(sk: sktw); |
| 124 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk: sktw); |
| 125 | struct tcp_sock *tp = tcp_sk(sk); |
| 126 | int ts_recent_stamp; |
| 127 | u32 reuse_thresh; |
| 128 | |
| 129 | if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) |
| 130 | reuse = 0; |
| 131 | |
| 132 | if (reuse == 2) { |
| 133 | /* Still does not detect *everything* that goes through |
| 134 | * lo, since we require a loopback src or dst address |
| 135 | * or direct binding to 'lo' interface. |
| 136 | */ |
| 137 | bool loopback = false; |
| 138 | if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) |
| 139 | loopback = true; |
| 140 | #if IS_ENABLED(CONFIG_IPV6) |
| 141 | if (tw->tw_family == AF_INET6) { |
| 142 | if (ipv6_addr_loopback(a: &tw->tw_v6_daddr) || |
| 143 | ipv6_addr_v4mapped_loopback(a: &tw->tw_v6_daddr) || |
| 144 | ipv6_addr_loopback(a: &tw->tw_v6_rcv_saddr) || |
| 145 | ipv6_addr_v4mapped_loopback(a: &tw->tw_v6_rcv_saddr)) |
| 146 | loopback = true; |
| 147 | } else |
| 148 | #endif |
| 149 | { |
| 150 | if (ipv4_is_loopback(addr: tw->tw_daddr) || |
| 151 | ipv4_is_loopback(addr: tw->tw_rcv_saddr)) |
| 152 | loopback = true; |
| 153 | } |
| 154 | if (!loopback) |
| 155 | reuse = 0; |
| 156 | } |
| 157 | |
| 158 | /* With PAWS, it is safe from the viewpoint |
| 159 | of data integrity. Even without PAWS it is safe provided sequence |
| 160 | spaces do not overlap i.e. at data rates <= 80Mbit/sec. |
| 161 | |
| 162 | Actually, the idea is close to VJ's one, only timestamp cache is |
| 163 | held not per host, but per port pair and TW bucket is used as state |
| 164 | holder. |
| 165 | |
| 166 | If TW bucket has been already destroyed we fall back to VJ's scheme |
| 167 | and use initial timestamp retrieved from peer table. |
| 168 | */ |
| 169 | ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); |
| 170 | reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + |
| 171 | READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); |
| 172 | if (ts_recent_stamp && |
| 173 | (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { |
| 174 | /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk |
| 175 | * and releasing the bucket lock. |
| 176 | */ |
| 177 | if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) |
| 178 | return 0; |
| 179 | |
| 180 | /* In case of repair and re-using TIME-WAIT sockets we still |
| 181 | * want to be sure that it is safe as above but honor the |
| 182 | * sequence numbers and time stamps set as part of the repair |
| 183 | * process. |
| 184 | * |
| 185 | * Without this check re-using a TIME-WAIT socket with TCP |
| 186 | * repair would accumulate a -1 on the repair assigned |
| 187 | * sequence number. The first time it is reused the sequence |
| 188 | * is -1, the second time -2, etc. This fixes that issue |
| 189 | * without appearing to create any others. |
| 190 | */ |
| 191 | if (likely(!tp->repair)) { |
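/* Start the new connection beyond the old snd_nxt plus the
 * largest unscaled window (65535), so segments from the
 * previous incarnation cannot fall inside the new sequence space.
 */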
| 192 | u32 seq = tcptw->tw_snd_nxt + 65535 + 2; |
| 193 | |
| 194 | if (!seq) |
| 195 | seq = 1; |
| 196 | WRITE_ONCE(tp->write_seq, seq); |
| 197 | tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); |
| 198 | tp->rx_opt.ts_recent_stamp = ts_recent_stamp; |
| 199 | } |
| 200 | |
| 201 | return 1; |
| 202 | } |
| 203 | |
| 204 | return 0; |
| 205 | } |
| 206 | EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); |
| 207 | |
| 208 | static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, |
| 209 | int addr_len) |
| 210 | { |
| 211 | /* This check is replicated from tcp_v4_connect() and intended to |
| 212 | * prevent the BPF program called below from accessing bytes that are
| 213 | * out of the bounds specified by the user in addr_len.
| 214 | */ |
| 215 | if (addr_len < sizeof(struct sockaddr_in)) |
| 216 | return -EINVAL; |
| 217 | |
| 218 | sock_owned_by_me(sk); |
| 219 | |
| 220 | return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); |
| 221 | } |
| 222 | |
| 223 | /* This will initiate an outgoing connection. */ |
| 224 | int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) |
| 225 | { |
| 226 | struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; |
| 227 | struct inet_timewait_death_row *tcp_death_row; |
| 228 | struct inet_sock *inet = inet_sk(sk); |
| 229 | struct tcp_sock *tp = tcp_sk(sk); |
| 230 | struct ip_options_rcu *inet_opt; |
| 231 | struct net *net = sock_net(sk); |
| 232 | __be16 orig_sport, orig_dport; |
| 233 | __be32 daddr, nexthop; |
| 234 | struct flowi4 *fl4; |
| 235 | struct rtable *rt; |
| 236 | int err; |
| 237 | |
| 238 | if (addr_len < sizeof(struct sockaddr_in)) |
| 239 | return -EINVAL; |
| 240 | |
| 241 | if (usin->sin_family != AF_INET) |
| 242 | return -EAFNOSUPPORT; |
| 243 | |
| 244 | nexthop = daddr = usin->sin_addr.s_addr; |
| 245 | inet_opt = rcu_dereference_protected(inet->inet_opt, |
| 246 | lockdep_sock_is_held(sk)); |
| 247 | if (inet_opt && inet_opt->opt.srr) { |
| 248 | if (!daddr) |
| 249 | return -EINVAL; |
| 250 | nexthop = inet_opt->opt.faddr; |
| 251 | } |
| 252 | |
| 253 | orig_sport = inet->inet_sport; |
| 254 | orig_dport = usin->sin_port; |
| 255 | fl4 = &inet->cork.fl.u.ip4; |
| 256 | rt = ip_route_connect(fl4, dst: nexthop, src: inet->inet_saddr, |
| 257 | oif: sk->sk_bound_dev_if, IPPROTO_TCP, sport: orig_sport, |
| 258 | dport: orig_dport, sk); |
| 259 | if (IS_ERR(ptr: rt)) { |
| 260 | err = PTR_ERR(ptr: rt); |
| 261 | if (err == -ENETUNREACH) |
| 262 | IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); |
| 263 | return err; |
| 264 | } |
| 265 | |
| 266 | if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { |
| 267 | ip_rt_put(rt); |
| 268 | return -ENETUNREACH; |
| 269 | } |
| 270 | |
| 271 | if (!inet_opt || !inet_opt->opt.srr) |
| 272 | daddr = fl4->daddr; |
| 273 | |
| 274 | tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; |
| 275 | |
| 276 | if (!inet->inet_saddr) { |
| 277 | err = inet_bhash2_update_saddr(sk, saddr: &fl4->saddr, AF_INET); |
| 278 | if (err) { |
| 279 | ip_rt_put(rt); |
| 280 | return err; |
| 281 | } |
| 282 | } else { |
| 283 | sk_rcv_saddr_set(sk, addr: inet->inet_saddr); |
| 284 | } |
| 285 | |
| 286 | if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { |
| 287 | /* Reset inherited state */ |
| 288 | tp->rx_opt.ts_recent = 0; |
| 289 | tp->rx_opt.ts_recent_stamp = 0; |
| 290 | if (likely(!tp->repair)) |
| 291 | WRITE_ONCE(tp->write_seq, 0); |
| 292 | } |
| 293 | |
| 294 | inet->inet_dport = usin->sin_port; |
| 295 | sk_daddr_set(sk, addr: daddr); |
| 296 | |
| 297 | inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); |
| 298 | if (inet_opt) |
| 299 | inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; |
| 300 | |
| 301 | tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; |
| 302 | |
| 303 | /* Socket identity is still unknown (sport may be zero). |
| 304 | * However we set state to SYN-SENT and, without releasing the socket
| 305 | * lock, select a source port, enter ourselves into the hash tables and
| 306 | * complete initialization after this. |
| 307 | */ |
| 308 | tcp_set_state(sk, state: TCP_SYN_SENT); |
| 309 | err = inet_hash_connect(death_row: tcp_death_row, sk); |
| 310 | if (err) |
| 311 | goto failure; |
| 312 | |
| 313 | sk_set_txhash(sk); |
| 314 | |
| 315 | rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, |
| 316 | sport: inet->inet_sport, dport: inet->inet_dport, sk); |
| 317 | if (IS_ERR(ptr: rt)) { |
| 318 | err = PTR_ERR(ptr: rt); |
| 319 | rt = NULL; |
| 320 | goto failure; |
| 321 | } |
| 322 | tp->tcp_usec_ts = dst_tcp_usec_ts(dst: &rt->dst); |
| 323 | /* OK, now commit destination to socket. */ |
| 324 | sk->sk_gso_type = SKB_GSO_TCPV4; |
| 325 | sk_setup_caps(sk, dst: &rt->dst); |
| 326 | rt = NULL; |
| 327 | |
| 328 | if (likely(!tp->repair)) { |
| 329 | if (!tp->write_seq) |
| 330 | WRITE_ONCE(tp->write_seq, |
| 331 | secure_tcp_seq(inet->inet_saddr, |
| 332 | inet->inet_daddr, |
| 333 | inet->inet_sport, |
| 334 | usin->sin_port)); |
| 335 | WRITE_ONCE(tp->tsoffset, |
| 336 | secure_tcp_ts_off(net, inet->inet_saddr, |
| 337 | inet->inet_daddr)); |
| 338 | } |
| 339 | |
| 340 | atomic_set(v: &inet->inet_id, i: get_random_u16()); |
| 341 | |
| 342 | if (tcp_fastopen_defer_connect(sk, err: &err)) |
| 343 | return err; |
| 344 | if (err) |
| 345 | goto failure; |
| 346 | |
| 347 | err = tcp_connect(sk); |
| 348 | |
| 349 | if (err) |
| 350 | goto failure; |
| 351 | |
| 352 | return 0; |
| 353 | |
| 354 | failure: |
| 355 | /* |
| 356 | * This unhashes the socket and releases the local port, |
| 357 | * if necessary. |
| 358 | */ |
| 359 | tcp_set_state(sk, state: TCP_CLOSE); |
| 360 | inet_bhash2_reset_saddr(sk); |
| 361 | ip_rt_put(rt); |
| 362 | sk->sk_route_caps = 0; |
| 363 | inet->inet_dport = 0; |
| 364 | return err; |
| 365 | } |
| 366 | EXPORT_IPV6_MOD(tcp_v4_connect); |
| 367 | |
| 368 | /* |
| 369 | * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. |
| 370 | * It can be called through tcp_release_cb() if socket was owned by user |
| 371 | * at the time tcp_v4_err() was called to handle ICMP message. |
| 372 | */ |
| 373 | void tcp_v4_mtu_reduced(struct sock *sk) |
| 374 | { |
| 375 | struct inet_sock *inet = inet_sk(sk); |
| 376 | struct dst_entry *dst; |
| 377 | u32 mtu; |
| 378 | |
| 379 | if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) |
| 380 | return; |
| 381 | mtu = READ_ONCE(tcp_sk(sk)->mtu_info); |
| 382 | dst = inet_csk_update_pmtu(sk, mtu); |
| 383 | if (!dst) |
| 384 | return; |
| 385 | |
| 386 | /* Something is about to go wrong... Remember the soft error
| 387 | * in case this connection is not able to recover.
| 388 | */ |
| 389 | if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) |
| 390 | WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); |
| 391 | |
| 392 | mtu = dst_mtu(dst); |
| 393 | |
| 394 | if (inet->pmtudisc != IP_PMTUDISC_DONT && |
| 395 | ip_sk_accept_pmtu(sk) && |
| 396 | inet_csk(sk)->icsk_pmtu_cookie > mtu) { |
| 397 | tcp_sync_mss(sk, pmtu: mtu); |
| 398 | |
| 399 | /* Resend the TCP packet because it's |
| 400 | * clear that the old packet has been |
| 401 | * dropped. This is the new "fast" path mtu |
| 402 | * discovery. |
| 403 | */ |
| 404 | tcp_simple_retransmit(sk); |
| 405 | } /* else let the usual retransmit timer handle it */ |
| 406 | } |
| 407 | EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); |
| 408 | |
| 409 | static void do_redirect(struct sk_buff *skb, struct sock *sk) |
| 410 | { |
| 411 | struct dst_entry *dst = __sk_dst_check(sk, cookie: 0); |
| 412 | |
| 413 | if (dst) |
| 414 | dst->ops->redirect(dst, sk, skb); |
| 415 | } |
| 416 | |
| 417 | |
| 418 | /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ |
| 419 | void tcp_req_err(struct sock *sk, u32 seq, bool abort) |
| 420 | { |
| 421 | struct request_sock *req = inet_reqsk(sk); |
| 422 | struct net *net = sock_net(sk); |
| 423 | |
| 424 | /* ICMPs are not backlogged, hence we cannot get |
| 425 | * an established socket here. |
| 426 | */ |
| 427 | if (seq != tcp_rsk(req)->snt_isn) { |
| 428 | __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); |
| 429 | } else if (abort) { |
| 430 | /* |
| 431 | * Still in SYN_RECV, just remove it silently. |
| 432 | * There is no good way to pass the error to the newly |
| 433 | * created socket, and POSIX does not want network |
| 434 | * errors returned from accept(). |
| 435 | */ |
| 436 | inet_csk_reqsk_queue_drop(sk: req->rsk_listener, req); |
| 437 | tcp_listendrop(sk: req->rsk_listener); |
| 438 | } |
| 439 | reqsk_put(req); |
| 440 | } |
| 441 | EXPORT_IPV6_MOD(tcp_req_err); |
| 442 | |
| 443 | /* TCP-LD (RFC 6069) logic */ |
| 444 | void tcp_ld_RTO_revert(struct sock *sk, u32 seq) |
| 445 | { |
| 446 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 447 | struct tcp_sock *tp = tcp_sk(sk); |
| 448 | struct sk_buff *skb; |
| 449 | s32 remaining; |
| 450 | u32 delta_us; |
| 451 | |
| 452 | if (sock_owned_by_user(sk)) |
| 453 | return; |
| 454 | |
| 455 | if (seq != tp->snd_una || !icsk->icsk_retransmits || |
| 456 | !icsk->icsk_backoff) |
| 457 | return; |
| 458 | |
| 459 | skb = tcp_rtx_queue_head(sk); |
| 460 | if (WARN_ON_ONCE(!skb)) |
| 461 | return; |
| 462 | |
| 463 | icsk->icsk_backoff--; |
| 464 | icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; |
| 465 | icsk->icsk_rto = inet_csk_rto_backoff(icsk, max_when: tcp_rto_max(sk)); |
| 466 | |
| 467 | tcp_mstamp_refresh(tp); |
| 468 | delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); |
| 469 | remaining = icsk->icsk_rto - usecs_to_jiffies(u: delta_us); |
| 470 | |
| 471 | if (remaining > 0) { |
| 472 | tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, when: remaining, pace_delay: false); |
| 473 | } else { |
| 474 | /* RTO revert clocked out retransmission. |
| 475 | * Will retransmit now. |
| 476 | */ |
| 477 | tcp_retransmit_timer(sk); |
| 478 | } |
| 479 | } |
| 480 | EXPORT_IPV6_MOD(tcp_ld_RTO_revert); |
| 481 | |
| 482 | /* |
| 483 | * This routine is called by the ICMP module when it gets some |
| 484 | * sort of error condition. If err < 0 then the socket should |
| 485 | * be closed and the error returned to the user. If err > 0 |
| 486 | * it's just the icmp type << 8 | icmp code. After adjustment |
| 487 | * header points to the first 8 bytes of the tcp header. We need |
| 488 | * to find the appropriate port. |
| 489 | * |
| 490 | * The locking strategy used here is very "optimistic". When |
| 491 | * someone else accesses the socket the ICMP is just dropped |
| 492 | * and for some paths there is no check at all. |
| 493 | * A more general error queue to queue errors for later handling |
| 494 | * is probably better. |
| 495 | * |
| 496 | */ |
| 497 | |
| 498 | int tcp_v4_err(struct sk_buff *skb, u32 info) |
| 499 | { |
| 500 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
| 501 | struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); |
| 502 | struct net *net = dev_net_rcu(dev: skb->dev); |
| 503 | const int type = icmp_hdr(skb)->type; |
| 504 | const int code = icmp_hdr(skb)->code; |
| 505 | struct request_sock *fastopen; |
| 506 | struct tcp_sock *tp; |
| 507 | u32 seq, snd_una; |
| 508 | struct sock *sk; |
| 509 | int err; |
| 510 | |
| 511 | sk = __inet_lookup_established(net, saddr: iph->daddr, sport: th->dest, daddr: iph->saddr, |
| 512 | ntohs(th->source), dif: inet_iif(skb), sdif: 0); |
| 513 | if (!sk) { |
| 514 | __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); |
| 515 | return -ENOENT; |
| 516 | } |
| 517 | if (sk->sk_state == TCP_TIME_WAIT) { |
| 518 | /* To increase the counter of ignored icmps for TCP-AO */ |
| 519 | tcp_ao_ignore_icmp(sk, AF_INET, type, code); |
| 520 | inet_twsk_put(tw: inet_twsk(sk)); |
| 521 | return 0; |
| 522 | } |
| 523 | seq = ntohl(th->seq); |
| 524 | if (sk->sk_state == TCP_NEW_SYN_RECV) { |
| 525 | tcp_req_err(sk, seq, abort: type == ICMP_PARAMETERPROB || |
| 526 | type == ICMP_TIME_EXCEEDED || |
| 527 | (type == ICMP_DEST_UNREACH && |
| 528 | (code == ICMP_NET_UNREACH || |
| 529 | code == ICMP_HOST_UNREACH))); |
| 530 | return 0; |
| 531 | } |
| 532 | |
| 533 | if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { |
| 534 | sock_put(sk); |
| 535 | return 0; |
| 536 | } |
| 537 | |
| 538 | bh_lock_sock(sk); |
| 539 | /* If too many ICMPs get dropped on busy |
| 540 | * servers this needs to be solved differently. |
| 541 | * We do take care of the PMTU discovery (RFC1191) special case:
| 542 | * we can receive locally generated ICMP messages while socket is held. |
| 543 | */ |
| 544 | if (sock_owned_by_user(sk)) { |
| 545 | if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) |
| 546 | __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); |
| 547 | } |
| 548 | if (sk->sk_state == TCP_CLOSE) |
| 549 | goto out; |
| 550 | |
| 551 | if (static_branch_unlikely(&ip4_min_ttl)) { |
| 552 | /* min_ttl can be changed concurrently from do_ip_setsockopt() */ |
| 553 | if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { |
| 554 | __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); |
| 555 | goto out; |
| 556 | } |
| 557 | } |
| 558 | |
| 559 | tp = tcp_sk(sk); |
| 560 | /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ |
| 561 | fastopen = rcu_dereference(tp->fastopen_rsk); |
| 562 | snd_una = fastopen ? tcp_rsk(req: fastopen)->snt_isn : tp->snd_una; |
| 563 | if (sk->sk_state != TCP_LISTEN && |
| 564 | !between(seq1: seq, seq2: snd_una, seq3: tp->snd_nxt)) { |
| 565 | __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); |
| 566 | goto out; |
| 567 | } |
| 568 | |
| 569 | switch (type) { |
| 570 | case ICMP_REDIRECT: |
| 571 | if (!sock_owned_by_user(sk)) |
| 572 | do_redirect(skb, sk); |
| 573 | goto out; |
| 574 | case ICMP_SOURCE_QUENCH: |
| 575 | /* Just silently ignore these. */ |
| 576 | goto out; |
| 577 | case ICMP_PARAMETERPROB: |
| 578 | err = EPROTO; |
| 579 | break; |
| 580 | case ICMP_DEST_UNREACH: |
| 581 | if (code > NR_ICMP_UNREACH) |
| 582 | goto out; |
| 583 | |
| 584 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ |
| 585 | /* We are not interested in TCP_LISTEN and open_requests |
| 586 | * (SYN-ACKs sent out by Linux are always < 576 bytes, so
| 587 | * they should go through unfragmented). |
| 588 | */ |
| 589 | if (sk->sk_state == TCP_LISTEN) |
| 590 | goto out; |
| 591 | |
| 592 | WRITE_ONCE(tp->mtu_info, info); |
| 593 | if (!sock_owned_by_user(sk)) { |
| 594 | tcp_v4_mtu_reduced(sk); |
| 595 | } else { |
| 596 | if (!test_and_set_bit(nr: TCP_MTU_REDUCED_DEFERRED, addr: &sk->sk_tsq_flags)) |
| 597 | sock_hold(sk); |
| 598 | } |
| 599 | goto out; |
| 600 | } |
| 601 | |
| 602 | err = icmp_err_convert[code].errno; |
| 603 | /* check if this ICMP message allows revert of backoff. |
| 604 | * (see RFC 6069) |
| 605 | */ |
| 606 | if (!fastopen && |
| 607 | (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) |
| 608 | tcp_ld_RTO_revert(sk, seq); |
| 609 | break; |
| 610 | case ICMP_TIME_EXCEEDED: |
| 611 | err = EHOSTUNREACH; |
| 612 | break; |
| 613 | default: |
| 614 | goto out; |
| 615 | } |
| 616 | |
| 617 | switch (sk->sk_state) { |
| 618 | case TCP_SYN_SENT: |
| 619 | case TCP_SYN_RECV: |
| 620 | /* Only in fast or simultaneous open. If a fast open socket is |
| 621 | * already accepted it is treated as a connected one below. |
| 622 | */ |
| 623 | if (fastopen && !fastopen->sk) |
| 624 | break; |
| 625 | |
| 626 | ip_icmp_error(sk, skb, err, port: th->dest, info, payload: (u8 *)th); |
| 627 | |
| 628 | if (!sock_owned_by_user(sk)) |
| 629 | tcp_done_with_error(sk, err); |
| 630 | else |
| 631 | WRITE_ONCE(sk->sk_err_soft, err); |
| 632 | goto out; |
| 633 | } |
| 634 | |
| 635 | /* If we've already connected we will keep trying |
| 636 | * until we time out, or the user gives up. |
| 637 | * |
| 638 | * rfc1122 4.2.3.9 allows us to consider as hard errors
| 639 | * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, |
| 640 | * but it is obsoleted by pmtu discovery). |
| 641 | * |
| 642 | * Note that in the modern internet, where routing is unreliable
| 643 | * and broken firewalls sit in every dark corner, sending random
| 644 | * errors ordered by their masters, even these two messages finally
| 645 | * lose their original sense (even Linux sends invalid PORT_UNREACHs).
| 646 | * |
| 647 | * Now we are in compliance with RFCs. |
| 648 | * --ANK (980905) |
| 649 | */ |
| 650 | |
| 651 | if (!sock_owned_by_user(sk) && |
| 652 | inet_test_bit(RECVERR, sk)) { |
| 653 | WRITE_ONCE(sk->sk_err, err); |
| 654 | sk_error_report(sk); |
| 655 | } else { /* Only an error on timeout */ |
| 656 | WRITE_ONCE(sk->sk_err_soft, err); |
| 657 | } |
| 658 | |
| 659 | out: |
| 660 | bh_unlock_sock(sk); |
| 661 | sock_put(sk); |
| 662 | return 0; |
| 663 | } |
| 664 | |
| 665 | void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) |
| 666 | { |
| 667 | struct tcphdr *th = tcp_hdr(skb); |
| 668 | |
| 669 | th->check = ~tcp_v4_check(len: skb->len, saddr, daddr, base: 0); |
| 670 | skb->csum_start = skb_transport_header(skb) - skb->head; |
| 671 | skb->csum_offset = offsetof(struct tcphdr, check); |
| 672 | } |
| 673 | |
| 674 | /* This routine computes an IPv4 TCP checksum. */ |
| 675 | void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) |
| 676 | { |
| 677 | const struct inet_sock *inet = inet_sk(sk); |
| 678 | |
| 679 | __tcp_v4_send_check(skb, saddr: inet->inet_saddr, daddr: inet->inet_daddr); |
| 680 | } |
| 681 | EXPORT_IPV6_MOD(tcp_v4_send_check); |
| 682 | |
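/* Size of the option buffer appended to a bare RST/ACK reply, in 32-bit
 * words: enough for the full 40 bytes of TCP option space (e.g. an MD5
 * or TCP-AO signature option).
 */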
| 683 | #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) |
| 684 | |
| 685 | static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, |
| 686 | const struct tcp_ao_hdr *aoh, |
| 687 | struct ip_reply_arg *arg, struct tcphdr *reply, |
| 688 | __be32 reply_options[REPLY_OPTIONS_LEN]) |
| 689 | { |
| 690 | #ifdef CONFIG_TCP_AO |
| 691 | int sdif = tcp_v4_sdif(skb); |
| 692 | int dif = inet_iif(skb); |
| 693 | int l3index = sdif ? dif : 0; |
| 694 | bool allocated_traffic_key; |
| 695 | struct tcp_ao_key *key; |
| 696 | char *traffic_key; |
| 697 | bool drop = true; |
| 698 | u32 ao_sne = 0; |
| 699 | u8 keyid; |
| 700 | |
| 701 | rcu_read_lock(); |
| 702 | if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), |
| 703 | key: &key, traffic_key: &traffic_key, allocated_traffic_key: &allocated_traffic_key, |
| 704 | keyid: &keyid, sne: &ao_sne)) |
| 705 | goto out; |
| 706 | |
| 707 | reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | |
| 708 | (aoh->rnext_keyid << 8) | keyid); |
| 709 | arg->iov[0].iov_len += tcp_ao_len_aligned(key); |
| 710 | reply->doff = arg->iov[0].iov_len / 4; |
| 711 | |
| 712 | if (tcp_ao_hash_hdr(AF_INET, ao_hash: (char *)&reply_options[1], |
| 713 | key, tkey: traffic_key, |
| 714 | daddr: (union tcp_ao_addr *)&ip_hdr(skb)->saddr, |
| 715 | saddr: (union tcp_ao_addr *)&ip_hdr(skb)->daddr, |
| 716 | th: reply, sne: ao_sne)) |
| 717 | goto out; |
| 718 | drop = false; |
| 719 | out: |
| 720 | rcu_read_unlock(); |
| 721 | if (allocated_traffic_key) |
| 722 | kfree(objp: traffic_key); |
| 723 | return drop; |
| 724 | #else |
| 725 | return true; |
| 726 | #endif |
| 727 | } |
| 728 | |
| 729 | /* |
| 730 | * This routine will send an RST to the other tcp. |
| 731 | * |
| 732 | * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) |
| 733 | * for reset. |
| 734 | * Answer: if a packet caused an RST, it is not for a socket
| 735 | * existing in our system; if it is matched to a socket,
| 736 | * it is just a duplicate segment or a bug in the other side's TCP.
| 737 | * So we build the reply based only on the parameters that
| 738 | * arrived with the segment.
| 739 | * Exception: precedence violation. We do not implement it in any case. |
| 740 | */ |
| 741 | |
| 742 | static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, |
| 743 | enum sk_rst_reason reason) |
| 744 | { |
| 745 | const struct tcphdr *th = tcp_hdr(skb); |
| 746 | struct { |
| 747 | struct tcphdr th; |
| 748 | __be32 opt[REPLY_OPTIONS_LEN]; |
| 749 | } rep; |
| 750 | const __u8 *md5_hash_location = NULL; |
| 751 | const struct tcp_ao_hdr *aoh; |
| 752 | struct ip_reply_arg arg; |
| 753 | #ifdef CONFIG_TCP_MD5SIG |
| 754 | struct tcp_md5sig_key *key = NULL; |
| 755 | unsigned char newhash[16]; |
| 756 | struct sock *sk1 = NULL; |
| 757 | #endif |
| 758 | u64 transmit_time = 0; |
| 759 | struct sock *ctl_sk; |
| 760 | struct net *net; |
| 761 | u32 txhash = 0; |
| 762 | |
| 763 | /* Never send a reset in response to a reset. */ |
| 764 | if (th->rst) |
| 765 | return; |
| 766 | |
| 767 | /* If sk not NULL, it means we did a successful lookup and incoming |
| 768 | * route had to be correct. prequeue might have dropped our dst. |
| 769 | */ |
| 770 | if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) |
| 771 | return; |
| 772 | |
| 773 | /* Swap the send and the receive. */ |
| 774 | memset(&rep, 0, sizeof(rep)); |
| 775 | rep.th.dest = th->source; |
| 776 | rep.th.source = th->dest; |
| 777 | rep.th.doff = sizeof(struct tcphdr) / 4; |
| 778 | rep.th.rst = 1; |
| 779 | |
| 780 | if (th->ack) { |
| 781 | rep.th.seq = th->ack_seq; |
| 782 | } else { |
| 783 | rep.th.ack = 1; |
| 784 | rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + |
| 785 | skb->len - (th->doff << 2)); |
| 786 | } |
| 787 | |
| 788 | memset(&arg, 0, sizeof(arg)); |
| 789 | arg.iov[0].iov_base = (unsigned char *)&rep; |
| 790 | arg.iov[0].iov_len = sizeof(rep.th); |
| 791 | |
| 792 | net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); |
| 793 | |
| 794 | /* Invalid TCP option size or twice included auth */ |
| 795 | if (tcp_parse_auth_options(th: tcp_hdr(skb), md5_hash: &md5_hash_location, aoh: &aoh)) |
| 796 | return; |
| 797 | |
| 798 | if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, arg: &arg, reply: &rep.th, reply_options: rep.opt)) |
| 799 | return; |
| 800 | |
| 801 | #ifdef CONFIG_TCP_MD5SIG |
| 802 | rcu_read_lock(); |
| 803 | if (sk && sk_fullsock(sk)) { |
| 804 | const union tcp_md5_addr *addr; |
| 805 | int l3index; |
| 806 | |
| 807 | /* sdif set, means packet ingressed via a device |
| 808 | * in an L3 domain and inet_iif is set to it. |
| 809 | */ |
| 810 | l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; |
| 811 | addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; |
| 812 | key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); |
| 813 | } else if (md5_hash_location) { |
| 814 | const union tcp_md5_addr *addr; |
| 815 | int sdif = tcp_v4_sdif(skb); |
| 816 | int dif = inet_iif(skb); |
| 817 | int l3index; |
| 818 | |
| 819 | /* |
| 820 | * The active side is lost. Try to find the listening socket through
| 821 | * the source port, and then find the md5 key through the listening socket.
| 822 | * We do not lose security here:
| 823 | * the incoming packet is checked with the md5 hash of the found key;
| 824 | * no RST is generated if the md5 hash doesn't match.
| 825 | */ |
| 826 | sk1 = __inet_lookup_listener(net, NULL, doff: 0, saddr: ip_hdr(skb)->saddr, |
| 827 | sport: th->source, daddr: ip_hdr(skb)->daddr, |
| 828 | ntohs(th->source), dif, sdif); |
| 829 | /* don't send rst if it can't find key */ |
| 830 | if (!sk1) |
| 831 | goto out; |
| 832 | |
| 833 | /* sdif set, means packet ingressed via a device |
| 834 | * in an L3 domain and dif is set to it. |
| 835 | */ |
| 836 | l3index = sdif ? dif : 0; |
| 837 | addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; |
| 838 | key = tcp_md5_do_lookup(sk: sk1, l3index, addr, AF_INET); |
| 839 | if (!key) |
| 840 | goto out; |
| 841 | |
| 842 | tcp_v4_md5_hash_skb(md5_hash: newhash, key, NULL, skb); |
| 843 | if (memcmp(p: md5_hash_location, q: newhash, size: 16) != 0) |
| 844 | goto out; |
| 845 | } |
| 846 | |
| 847 | if (key) { |
| 848 | rep.opt[0] = htonl((TCPOPT_NOP << 24) | |
| 849 | (TCPOPT_NOP << 16) | |
| 850 | (TCPOPT_MD5SIG << 8) | |
| 851 | TCPOLEN_MD5SIG); |
| 852 | /* Update length and the length the header thinks exists */ |
| 853 | arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; |
| 854 | rep.th.doff = arg.iov[0].iov_len / 4; |
| 855 | |
| 856 | tcp_v4_md5_hash_hdr(md5_hash: (__u8 *) &rep.opt[1], |
| 857 | key, daddr: ip_hdr(skb)->saddr, |
| 858 | saddr: ip_hdr(skb)->daddr, th: &rep.th); |
| 859 | } |
| 860 | #endif |
| 861 | /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ |
| 862 | if (rep.opt[0] == 0) { |
| 863 | __be32 mrst = mptcp_reset_option(skb); |
| 864 | |
| 865 | if (mrst) { |
| 866 | rep.opt[0] = mrst; |
| 867 | arg.iov[0].iov_len += sizeof(mrst); |
| 868 | rep.th.doff = arg.iov[0].iov_len / 4; |
| 869 | } |
| 870 | } |
| 871 | |
| 872 | arg.csum = csum_tcpudp_nofold(saddr: ip_hdr(skb)->daddr, |
| 873 | daddr: ip_hdr(skb)->saddr, /* XXX */ |
| 874 | len: arg.iov[0].iov_len, IPPROTO_TCP, sum: 0); |
| 875 | arg.csumoffset = offsetof(struct tcphdr, check) / 2; |
| 876 | arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; |
| 877 | |
| 878 | /* When socket is gone, all binding information is lost. |
| 879 | * Routing might fail in this case. No choice here: if we choose to force
| 880 | * the input interface, we will misroute in case of an asymmetric route.
| 881 | */ |
| 882 | if (sk) |
| 883 | arg.bound_dev_if = sk->sk_bound_dev_if; |
| 884 | |
| 885 | trace_tcp_send_reset(sk, skb__nullable: skb, reason); |
| 886 | |
| 887 | BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != |
| 888 | offsetof(struct inet_timewait_sock, tw_bound_dev_if)); |
| 889 | |
| 890 | /* ECN bits of TW reset are cleared */ |
| 891 | arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; |
| 892 | arg.uid = sock_net_uid(net, sk: sk && sk_fullsock(sk) ? sk : NULL); |
| 893 | local_bh_disable(); |
| 894 | local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); |
| 895 | ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); |
| 896 | |
| 897 | sock_net_set(sk: ctl_sk, net); |
| 898 | if (sk) { |
| 899 | ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? |
| 900 | inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); |
| 901 | ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? |
| 902 | inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); |
| 903 | transmit_time = tcp_transmit_time(sk); |
| 904 | xfrm_sk_clone_policy(sk: ctl_sk, osk: sk); |
| 905 | txhash = (sk->sk_state == TCP_TIME_WAIT) ? |
| 906 | inet_twsk(sk)->tw_txhash : sk->sk_txhash; |
| 907 | } else { |
| 908 | ctl_sk->sk_mark = 0; |
| 909 | ctl_sk->sk_priority = 0; |
| 910 | } |
| 911 | ip_send_unicast_reply(sk: ctl_sk, orig_sk: sk, |
| 912 | skb, sopt: &TCP_SKB_CB(skb)->header.h4.opt, |
| 913 | daddr: ip_hdr(skb)->saddr, saddr: ip_hdr(skb)->daddr, |
| 914 | arg: &arg, len: arg.iov[0].iov_len, |
| 915 | transmit_time, txhash); |
| 916 | |
| 917 | xfrm_sk_free_policy(sk: ctl_sk); |
| 918 | sock_net_set(sk: ctl_sk, net: &init_net); |
| 919 | __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); |
| 920 | __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); |
| 921 | local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); |
| 922 | local_bh_enable(); |
| 923 | |
| 924 | #ifdef CONFIG_TCP_MD5SIG |
| 925 | out: |
| 926 | rcu_read_unlock(); |
| 927 | #endif |
| 928 | } |
| 929 | |
| 930 | /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
| 931 | outside socket context, is certainly ugly. What can I do?
| 932 | */ |
| 933 | |
| 934 | static void tcp_v4_send_ack(const struct sock *sk, |
| 935 | struct sk_buff *skb, u32 seq, u32 ack, |
| 936 | u32 win, u32 tsval, u32 tsecr, int oif, |
| 937 | struct tcp_key *key, |
| 938 | int reply_flags, u8 tos, u32 txhash) |
| 939 | { |
| 940 | const struct tcphdr *th = tcp_hdr(skb); |
| 941 | struct { |
| 942 | struct tcphdr th; |
| 943 | __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; |
| 944 | } rep; |
| 945 | struct net *net = sock_net(sk); |
| 946 | struct ip_reply_arg arg; |
| 947 | struct sock *ctl_sk; |
| 948 | u64 transmit_time; |
| 949 | |
| 950 | memset(&rep.th, 0, sizeof(struct tcphdr)); |
| 951 | memset(&arg, 0, sizeof(arg)); |
| 952 | |
| 953 | arg.iov[0].iov_base = (unsigned char *)&rep; |
| 954 | arg.iov[0].iov_len = sizeof(rep.th); |
| 955 | if (tsecr) { |
| 956 | rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | |
| 957 | (TCPOPT_TIMESTAMP << 8) | |
| 958 | TCPOLEN_TIMESTAMP); |
| 959 | rep.opt[1] = htonl(tsval); |
| 960 | rep.opt[2] = htonl(tsecr); |
| 961 | arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; |
| 962 | } |
| 963 | |
| 964 | /* Swap the send and the receive. */ |
| 965 | rep.th.dest = th->source; |
| 966 | rep.th.source = th->dest; |
| 967 | rep.th.doff = arg.iov[0].iov_len / 4; |
| 968 | rep.th.seq = htonl(seq); |
| 969 | rep.th.ack_seq = htonl(ack); |
| 970 | rep.th.ack = 1; |
| 971 | rep.th.window = htons(win); |
| 972 | |
| 973 | #ifdef CONFIG_TCP_MD5SIG |
| 974 | if (tcp_key_is_md5(key)) { |
| 975 | int offset = (tsecr) ? 3 : 0; |
| 976 | |
| 977 | rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | |
| 978 | (TCPOPT_NOP << 16) | |
| 979 | (TCPOPT_MD5SIG << 8) | |
| 980 | TCPOLEN_MD5SIG); |
| 981 | arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; |
| 982 | rep.th.doff = arg.iov[0].iov_len/4; |
| 983 | |
| 984 | tcp_v4_md5_hash_hdr(md5_hash: (__u8 *) &rep.opt[offset], |
| 985 | key: key->md5_key, daddr: ip_hdr(skb)->saddr, |
| 986 | saddr: ip_hdr(skb)->daddr, th: &rep.th); |
| 987 | } |
| 988 | #endif |
| 989 | #ifdef CONFIG_TCP_AO |
| 990 | if (tcp_key_is_ao(key)) { |
| 991 | int offset = (tsecr) ? 3 : 0; |
| 992 | |
| 993 | rep.opt[offset++] = htonl((TCPOPT_AO << 24) | |
| 994 | (tcp_ao_len(key->ao_key) << 16) | |
| 995 | (key->ao_key->sndid << 8) | |
| 996 | key->rcv_next); |
| 997 | arg.iov[0].iov_len += tcp_ao_len_aligned(key: key->ao_key); |
| 998 | rep.th.doff = arg.iov[0].iov_len / 4; |
| 999 | |
| 1000 | tcp_ao_hash_hdr(AF_INET, ao_hash: (char *)&rep.opt[offset], |
| 1001 | key: key->ao_key, tkey: key->traffic_key, |
| 1002 | daddr: (union tcp_ao_addr *)&ip_hdr(skb)->saddr, |
| 1003 | saddr: (union tcp_ao_addr *)&ip_hdr(skb)->daddr, |
| 1004 | th: &rep.th, sne: key->sne); |
| 1005 | } |
| 1006 | #endif |
| 1007 | arg.flags = reply_flags; |
| 1008 | arg.csum = csum_tcpudp_nofold(saddr: ip_hdr(skb)->daddr, |
| 1009 | daddr: ip_hdr(skb)->saddr, /* XXX */ |
| 1010 | len: arg.iov[0].iov_len, IPPROTO_TCP, sum: 0); |
| 1011 | arg.csumoffset = offsetof(struct tcphdr, check) / 2; |
| 1012 | if (oif) |
| 1013 | arg.bound_dev_if = oif; |
| 1014 | arg.tos = tos; |
| 1015 | arg.uid = sock_net_uid(net, sk: sk_fullsock(sk) ? sk : NULL); |
| 1016 | local_bh_disable(); |
| 1017 | local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); |
| 1018 | ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); |
| 1019 | sock_net_set(sk: ctl_sk, net); |
| 1020 | ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? |
| 1021 | inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); |
| 1022 | ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? |
| 1023 | inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); |
| 1024 | transmit_time = tcp_transmit_time(sk); |
| 1025 | ip_send_unicast_reply(sk: ctl_sk, orig_sk: sk, |
| 1026 | skb, sopt: &TCP_SKB_CB(skb)->header.h4.opt, |
| 1027 | daddr: ip_hdr(skb)->saddr, saddr: ip_hdr(skb)->daddr, |
| 1028 | arg: &arg, len: arg.iov[0].iov_len, |
| 1029 | transmit_time, txhash); |
| 1030 | |
| 1031 | sock_net_set(sk: ctl_sk, net: &init_net); |
| 1032 | __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); |
| 1033 | local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); |
| 1034 | local_bh_enable(); |
| 1035 | } |
| 1036 | |
| 1037 | static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, |
| 1038 | enum tcp_tw_status tw_status) |
| 1039 | { |
| 1040 | struct inet_timewait_sock *tw = inet_twsk(sk); |
| 1041 | struct tcp_timewait_sock *tcptw = tcp_twsk(sk); |
| 1042 | struct tcp_key key = {}; |
| 1043 | u8 tos = tw->tw_tos; |
| 1044 | |
| 1045 | /* Clear only the ECN bits of TW ACKs for out-of-window data or paws_reject,
| 1046 | * while leaving the ECN bits of other TW ACKs intact, to avoid those ACKs
| 1047 | * being placed in a different service queue (Classic rather than L4S).
| 1048 | */ |
| 1049 | if (tw_status == TCP_TW_ACK_OOW) |
| 1050 | tos &= ~INET_ECN_MASK; |
| 1051 | |
| 1052 | #ifdef CONFIG_TCP_AO |
| 1053 | struct tcp_ao_info *ao_info; |
| 1054 | |
| 1055 | if (static_branch_unlikely(&tcp_ao_needed.key)) { |
| 1056 | /* FIXME: the segment to-be-acked is not verified yet */ |
| 1057 | ao_info = rcu_dereference(tcptw->ao_info); |
| 1058 | if (ao_info) { |
| 1059 | const struct tcp_ao_hdr *aoh; |
| 1060 | |
| 1061 | if (tcp_parse_auth_options(th: tcp_hdr(skb), NULL, aoh: &aoh)) { |
| 1062 | inet_twsk_put(tw); |
| 1063 | return; |
| 1064 | } |
| 1065 | |
| 1066 | if (aoh) |
| 1067 | key.ao_key = tcp_ao_established_key(sk, ao: ao_info, |
| 1068 | sndid: aoh->rnext_keyid, rcvid: -1); |
| 1069 | } |
| 1070 | } |
| 1071 | if (key.ao_key) { |
| 1072 | struct tcp_ao_key *rnext_key; |
| 1073 | |
| 1074 | key.traffic_key = snd_other_key(key: key.ao_key); |
| 1075 | key.sne = READ_ONCE(ao_info->snd_sne); |
| 1076 | rnext_key = READ_ONCE(ao_info->rnext_key); |
| 1077 | key.rcv_next = rnext_key->rcvid; |
| 1078 | key.type = TCP_KEY_AO; |
| 1079 | #else |
| 1080 | if (0) { |
| 1081 | #endif |
| 1082 | } else if (static_branch_tcp_md5()) { |
| 1083 | key.md5_key = tcp_twsk_md5_key(tcptw); |
| 1084 | if (key.md5_key) |
| 1085 | key.type = TCP_KEY_MD5; |
| 1086 | } |
| 1087 | |
| 1088 | tcp_v4_send_ack(sk, skb, |
| 1089 | seq: tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), |
| 1090 | win: tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, |
| 1091 | tsval: tcp_tw_tsval(tcptw), |
| 1092 | READ_ONCE(tcptw->tw_ts_recent), |
| 1093 | oif: tw->tw_bound_dev_if, key: &key, |
| 1094 | reply_flags: tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, |
| 1095 | tos, |
| 1096 | txhash: tw->tw_txhash); |
| 1097 | |
| 1098 | inet_twsk_put(tw); |
| 1099 | } |
| 1100 | |
| 1101 | static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, |
| 1102 | struct request_sock *req) |
| 1103 | { |
| 1104 | struct tcp_key key = {}; |
| 1105 | |
| 1106 | /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV |
| 1107 | * sk->sk_state == TCP_SYN_RECV -> for Fast Open. |
| 1108 | */ |
| 1109 | u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : |
| 1110 | tcp_sk(sk)->snd_nxt; |
| 1111 | |
| 1112 | #ifdef CONFIG_TCP_AO |
| 1113 | if (static_branch_unlikely(&tcp_ao_needed.key) && |
| 1114 | tcp_rsk_used_ao(req)) { |
| 1115 | const union tcp_md5_addr *addr; |
| 1116 | const struct tcp_ao_hdr *aoh; |
| 1117 | int l3index; |
| 1118 | |
| 1119 | /* Invalid TCP option size or twice included auth */ |
| 1120 | if (tcp_parse_auth_options(th: tcp_hdr(skb), NULL, aoh: &aoh)) |
| 1121 | return; |
| 1122 | if (!aoh) |
| 1123 | return; |
| 1124 | |
| 1125 | addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; |
| 1126 | l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; |
| 1127 | key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, |
| 1128 | sndid: aoh->rnext_keyid, rcvid: -1); |
| 1129 | if (unlikely(!key.ao_key)) { |
| 1130 | /* Send ACK with any matching MKT for the peer */ |
| 1131 | key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, sndid: -1, rcvid: -1); |
| 1132 | /* Matching key disappeared (user removed the key?) |
| 1133 | * let the handshake time out.
| 1134 | */ |
| 1135 | if (!key.ao_key) { |
| 1136 | net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
| 1137 | addr, |
| 1138 | ntohs(tcp_hdr(skb)->source), |
| 1139 | &ip_hdr(skb)->daddr, |
| 1140 | ntohs(tcp_hdr(skb)->dest)); |
| 1141 | return; |
| 1142 | } |
| 1143 | } |
| 1144 | key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); |
| 1145 | if (!key.traffic_key) |
| 1146 | return; |
| 1147 | |
| 1148 | key.type = TCP_KEY_AO; |
| 1149 | key.rcv_next = aoh->keyid; |
| 1150 | tcp_v4_ao_calc_key_rsk(mkt: key.ao_key, key: key.traffic_key, req); |
| 1151 | #else |
| 1152 | if (0) { |
| 1153 | #endif |
| 1154 | } else if (static_branch_tcp_md5()) { |
| 1155 | const union tcp_md5_addr *addr; |
| 1156 | int l3index; |
| 1157 | |
| 1158 | addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; |
| 1159 | l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; |
| 1160 | key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); |
| 1161 | if (key.md5_key) |
| 1162 | key.type = TCP_KEY_MD5; |
| 1163 | } |
| 1164 | |
| 1165 | /* Clear ECN bits of ACKs for out-of-window data or paws_reject */
| 1166 | tcp_v4_send_ack(sk, skb, seq, |
| 1167 | ack: tcp_rsk(req)->rcv_nxt, |
| 1168 | win: tcp_synack_window(req) >> inet_rsk(sk: req)->rcv_wscale, |
| 1169 | tsval: tcp_rsk_tsval(treq: tcp_rsk(req)), |
| 1170 | tsecr: req->ts_recent, |
| 1171 | oif: 0, key: &key, |
| 1172 | reply_flags: inet_rsk(sk: req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, |
| 1173 | tos: ip_hdr(skb)->tos & ~INET_ECN_MASK, |
| 1174 | READ_ONCE(tcp_rsk(req)->txhash)); |
| 1175 | if (tcp_key_is_ao(key: &key)) |
| 1176 | kfree(objp: key.traffic_key); |
| 1177 | } |
| 1178 | |
| 1179 | /* |
| 1180 | * Send a SYN-ACK after having received a SYN. |
| 1181 | * This still operates on a request_sock only, not on a big |
| 1182 | * socket. |
| 1183 | */ |
| 1184 | static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, |
| 1185 | struct flowi *fl, |
| 1186 | struct request_sock *req, |
| 1187 | struct tcp_fastopen_cookie *foc, |
| 1188 | enum tcp_synack_type synack_type, |
| 1189 | struct sk_buff *syn_skb) |
| 1190 | { |
| 1191 | struct inet_request_sock *ireq = inet_rsk(sk: req); |
| 1192 | struct flowi4 fl4; |
| 1193 | int err = -1; |
| 1194 | struct sk_buff *skb; |
| 1195 | u8 tos; |
| 1196 | |
| 1197 | /* First, grab a route. */ |
| 1198 | if (!dst && (dst = inet_csk_route_req(sk, fl4: &fl4, req)) == NULL) |
| 1199 | return -1; |
| 1200 | |
| 1201 | skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); |
| 1202 | |
| 1203 | if (skb) { |
| 1204 | tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; |
| 1205 | __tcp_v4_send_check(skb, saddr: ireq->ir_loc_addr, daddr: ireq->ir_rmt_addr); |
| 1206 | |
| 1207 | tos = READ_ONCE(inet_sk(sk)->tos); |
| 1208 | |
| 1209 | if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) |
| 1210 | tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | |
| 1211 | (tos & INET_ECN_MASK); |
| 1212 | |
| 1213 | if (!INET_ECN_is_capable(dsfield: tos) && |
| 1214 | tcp_bpf_ca_needs_ecn(sk: (struct sock *)req)) |
| 1215 | tos |= INET_ECN_ECT_0; |
| 1216 | |
| 1217 | rcu_read_lock(); |
| 1218 | err = ip_build_and_send_pkt(skb, sk, saddr: ireq->ir_loc_addr, |
| 1219 | daddr: ireq->ir_rmt_addr, |
| 1220 | rcu_dereference(ireq->ireq_opt), |
| 1221 | tos); |
| 1222 | rcu_read_unlock(); |
| 1223 | err = net_xmit_eval(err); |
| 1224 | } |
| 1225 | |
| 1226 | return err; |
| 1227 | } |
| 1228 | |
| 1229 | /* |
| 1230 | * IPv4 request_sock destructor. |
| 1231 | */ |
| 1232 | static void tcp_v4_reqsk_destructor(struct request_sock *req) |
| 1233 | { |
| 1234 | kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); |
| 1235 | } |
| 1236 | |
| 1237 | #ifdef CONFIG_TCP_MD5SIG |
| 1238 | /* |
| 1239 | * RFC2385 MD5 checksumming requires a mapping of |
| 1240 | * IP address->MD5 Key. |
| 1241 | * We need to maintain these in the sk structure. |
| 1242 | */ |
| 1243 | |
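/* Deferred static key: MD5 processing stays patched out until the first
 * key is installed, and disabling is deferred by HZ jiffies once the
 * last key goes away.
 */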
| 1244 | DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); |
| 1245 | EXPORT_IPV6_MOD(tcp_md5_needed); |
| 1246 | |
| 1247 | static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) |
| 1248 | { |
| 1249 | if (!old) |
| 1250 | return true; |
| 1251 | |
| 1252 | /* l3index always overrides non-l3index */ |
| 1253 | if (old->l3index && new->l3index == 0) |
| 1254 | return false; |
| 1255 | if (old->l3index == 0 && new->l3index) |
| 1256 | return true; |
| 1257 | |
| 1258 | return old->prefixlen < new->prefixlen; |
| 1259 | } |
| 1260 | |
| 1261 | /* Find the Key structure for an address. */ |
| 1262 | struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, |
| 1263 | const union tcp_md5_addr *addr, |
| 1264 | int family, bool any_l3index) |
| 1265 | { |
| 1266 | const struct tcp_sock *tp = tcp_sk(sk); |
| 1267 | struct tcp_md5sig_key *key; |
| 1268 | const struct tcp_md5sig_info *md5sig; |
| 1269 | __be32 mask; |
| 1270 | struct tcp_md5sig_key *best_match = NULL; |
| 1271 | bool match; |
| 1272 | |
| 1273 | /* caller either holds rcu_read_lock() or socket lock */ |
| 1274 | md5sig = rcu_dereference_check(tp->md5sig_info, |
| 1275 | lockdep_sock_is_held(sk)); |
| 1276 | if (!md5sig) |
| 1277 | return NULL; |
| 1278 | |
| 1279 | hlist_for_each_entry_rcu(key, &md5sig->head, node, |
| 1280 | lockdep_sock_is_held(sk)) { |
| 1281 | if (key->family != family) |
| 1282 | continue; |
| 1283 | if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && |
| 1284 | key->l3index != l3index) |
| 1285 | continue; |
| 1286 | if (family == AF_INET) { |
| 1287 | mask = inet_make_mask(logmask: key->prefixlen); |
| 1288 | match = (key->addr.a4.s_addr & mask) == |
| 1289 | (addr->a4.s_addr & mask); |
| 1290 | #if IS_ENABLED(CONFIG_IPV6) |
| 1291 | } else if (family == AF_INET6) { |
| 1292 | match = ipv6_prefix_equal(addr1: &key->addr.a6, addr2: &addr->a6, |
| 1293 | prefixlen: key->prefixlen); |
| 1294 | #endif |
| 1295 | } else { |
| 1296 | match = false; |
| 1297 | } |
| 1298 | |
| 1299 | if (match && better_md5_match(old: best_match, new: key)) |
| 1300 | best_match = key; |
| 1301 | } |
| 1302 | return best_match; |
| 1303 | } |
| 1304 | EXPORT_IPV6_MOD(__tcp_md5_do_lookup); |
| 1305 | |
| 1306 | static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, |
| 1307 | const union tcp_md5_addr *addr, |
| 1308 | int family, u8 prefixlen, |
| 1309 | int l3index, u8 flags) |
| 1310 | { |
| 1311 | const struct tcp_sock *tp = tcp_sk(sk); |
| 1312 | struct tcp_md5sig_key *key; |
| 1313 | unsigned int size = sizeof(struct in_addr); |
| 1314 | const struct tcp_md5sig_info *md5sig; |
| 1315 | |
| 1316 | /* caller either holds rcu_read_lock() or socket lock */ |
| 1317 | md5sig = rcu_dereference_check(tp->md5sig_info, |
| 1318 | lockdep_sock_is_held(sk)); |
| 1319 | if (!md5sig) |
| 1320 | return NULL; |
| 1321 | #if IS_ENABLED(CONFIG_IPV6) |
| 1322 | if (family == AF_INET6) |
| 1323 | size = sizeof(struct in6_addr); |
| 1324 | #endif |
| 1325 | hlist_for_each_entry_rcu(key, &md5sig->head, node, |
| 1326 | lockdep_sock_is_held(sk)) { |
| 1327 | if (key->family != family) |
| 1328 | continue; |
| 1329 | if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) |
| 1330 | continue; |
| 1331 | if (key->l3index != l3index) |
| 1332 | continue; |
| 1333 | if (!memcmp(p: &key->addr, q: addr, size) && |
| 1334 | key->prefixlen == prefixlen) |
| 1335 | return key; |
| 1336 | } |
| 1337 | return NULL; |
| 1338 | } |
| 1339 | |
| 1340 | struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, |
| 1341 | const struct sock *addr_sk) |
| 1342 | { |
| 1343 | const union tcp_md5_addr *addr; |
| 1344 | int l3index; |
| 1345 | |
| 1346 | l3index = l3mdev_master_ifindex_by_index(net: sock_net(sk), |
| 1347 | ifindex: addr_sk->sk_bound_dev_if); |
| 1348 | addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; |
| 1349 | return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); |
| 1350 | } |
| 1351 | EXPORT_IPV6_MOD(tcp_v4_md5_lookup); |
| 1352 | |
| 1353 | static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) |
| 1354 | { |
| 1355 | struct tcp_sock *tp = tcp_sk(sk); |
| 1356 | struct tcp_md5sig_info *md5sig; |
| 1357 | |
| 1358 | md5sig = kmalloc(sizeof(*md5sig), gfp); |
| 1359 | if (!md5sig) |
| 1360 | return -ENOMEM; |
| 1361 | |
| 1362 | sk_gso_disable(sk); |
| 1363 | INIT_HLIST_HEAD(&md5sig->head); |
| 1364 | rcu_assign_pointer(tp->md5sig_info, md5sig); |
| 1365 | return 0; |
| 1366 | } |
| 1367 | |
| 1368 | /* This can be called on a newly created socket, from other files */ |
| 1369 | static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, |
| 1370 | int family, u8 prefixlen, int l3index, u8 flags, |
| 1371 | const u8 *newkey, u8 newkeylen, gfp_t gfp) |
| 1372 | { |
| 1373 | /* Add Key to the list */ |
| 1374 | struct tcp_md5sig_key *key; |
| 1375 | struct tcp_sock *tp = tcp_sk(sk); |
| 1376 | struct tcp_md5sig_info *md5sig; |
| 1377 | |
| 1378 | key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); |
| 1379 | if (key) { |
| 1380 | /* Pre-existing entry - just update that one. |
| 1381 | * Note that the key might be used concurrently. |
| 1382 | * data_race() is telling kcsan that we do not care of |
| 1383 | * key mismatches, since changing MD5 key on live flows |
| 1384 | * can lead to packet drops. |
| 1385 | */ |
| 1386 | data_race(memcpy(key->key, newkey, newkeylen)); |
| 1387 | |
| 1388 | /* Pairs with READ_ONCE() in tcp_md5_hash_key(). |
| 1389 | * Also note that a reader could catch new key->keylen value |
| 1390 | * but old key->key[], this is the reason we use __GFP_ZERO |
| 1391 | * at sock_kmalloc() time below these lines. |
| 1392 | */ |
| 1393 | WRITE_ONCE(key->keylen, newkeylen); |
| 1394 | |
| 1395 | return 0; |
| 1396 | } |
| 1397 | |
| 1398 | md5sig = rcu_dereference_protected(tp->md5sig_info, |
| 1399 | lockdep_sock_is_held(sk)); |
| 1400 | |
| 1401 | key = sock_kmalloc(sk, size: sizeof(*key), priority: gfp | __GFP_ZERO); |
| 1402 | if (!key) |
| 1403 | return -ENOMEM; |
| 1404 | |
| 1405 | memcpy(key->key, newkey, newkeylen); |
| 1406 | key->keylen = newkeylen; |
| 1407 | key->family = family; |
| 1408 | key->prefixlen = prefixlen; |
| 1409 | key->l3index = l3index; |
| 1410 | key->flags = flags; |
| 1411 | memcpy(&key->addr, addr, |
| 1412 | (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) : |
| 1413 | sizeof(struct in_addr)); |
| 1414 | hlist_add_head_rcu(n: &key->node, h: &md5sig->head); |
| 1415 | return 0; |
| 1416 | } |
| 1417 | |
| 1418 | int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, |
| 1419 | int family, u8 prefixlen, int l3index, u8 flags, |
| 1420 | const u8 *newkey, u8 newkeylen) |
| 1421 | { |
| 1422 | struct tcp_sock *tp = tcp_sk(sk); |
| 1423 | |
| 1424 | if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { |
| 1425 | if (fips_enabled) { |
| 1426 | pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
| 1427 | return -EOPNOTSUPP; |
| 1428 | } |
| 1429 | |
| 1430 | if (tcp_md5sig_info_add(sk, GFP_KERNEL)) |
| 1431 | return -ENOMEM; |
| 1432 | |
| 1433 | if (!static_branch_inc(&tcp_md5_needed.key)) { |
| 1434 | struct tcp_md5sig_info *md5sig; |
| 1435 | |
| 1436 | md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); |
| 1437 | rcu_assign_pointer(tp->md5sig_info, NULL); |
| 1438 | kfree_rcu(md5sig, rcu); |
| 1439 | return -EUSERS; |
| 1440 | } |
| 1441 | } |
| 1442 | |
| 1443 | return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, |
| 1444 | newkey, newkeylen, GFP_KERNEL); |
| 1445 | } |
| 1446 | EXPORT_IPV6_MOD(tcp_md5_do_add); |
| 1447 | |
| 1448 | int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, |
| 1449 | int family, u8 prefixlen, int l3index, |
| 1450 | struct tcp_md5sig_key *key) |
| 1451 | { |
| 1452 | struct tcp_sock *tp = tcp_sk(sk); |
| 1453 | |
| 1454 | if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { |
| 1455 | |
| 1456 | if (tcp_md5sig_info_add(sk, gfp: sk_gfp_mask(sk, GFP_ATOMIC))) |
| 1457 | return -ENOMEM; |
| 1458 | |
| 1459 | if (!static_key_fast_inc_not_disabled(key: &tcp_md5_needed.key.key)) { |
| 1460 | struct tcp_md5sig_info *md5sig; |
| 1461 | |
| 1462 | md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); |
| 1463 | net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
| 1464 | rcu_assign_pointer(tp->md5sig_info, NULL); |
| 1465 | kfree_rcu(md5sig, rcu); |
| 1466 | return -EUSERS; |
| 1467 | } |
| 1468 | } |
| 1469 | |
| 1470 | return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, |
| 1471 | flags: key->flags, newkey: key->key, newkeylen: key->keylen, |
| 1472 | gfp: sk_gfp_mask(sk, GFP_ATOMIC)); |
| 1473 | } |
| 1474 | EXPORT_IPV6_MOD(tcp_md5_key_copy); |
| 1475 | |
| 1476 | int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, |
| 1477 | u8 prefixlen, int l3index, u8 flags) |
| 1478 | { |
| 1479 | struct tcp_md5sig_key *key; |
| 1480 | |
| 1481 | key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); |
| 1482 | if (!key) |
| 1483 | return -ENOENT; |
| 1484 | hlist_del_rcu(n: &key->node); |
| 1485 | atomic_sub(i: sizeof(*key), v: &sk->sk_omem_alloc); |
| 1486 | kfree_rcu(key, rcu); |
| 1487 | return 0; |
| 1488 | } |
| 1489 | EXPORT_IPV6_MOD(tcp_md5_do_del); |
| 1490 | |
| 1491 | void tcp_clear_md5_list(struct sock *sk) |
| 1492 | { |
| 1493 | struct tcp_sock *tp = tcp_sk(sk); |
| 1494 | struct tcp_md5sig_key *key; |
| 1495 | struct hlist_node *n; |
| 1496 | struct tcp_md5sig_info *md5sig; |
| 1497 | |
| 1498 | md5sig = rcu_dereference_protected(tp->md5sig_info, 1); |
| 1499 | |
| 1500 | hlist_for_each_entry_safe(key, n, &md5sig->head, node) { |
| 1501 | hlist_del(n: &key->node); |
| 1502 | atomic_sub(i: sizeof(*key), v: &sk->sk_omem_alloc); |
| 1503 | kfree(objp: key); |
| 1504 | } |
| 1505 | } |
| 1506 | |
| 1507 | static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, |
| 1508 | sockptr_t optval, int optlen) |
| 1509 | { |
| 1510 | struct tcp_md5sig cmd; |
| 1511 | struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; |
| 1512 | const union tcp_md5_addr *addr; |
| 1513 | u8 prefixlen = 32; |
| 1514 | int l3index = 0; |
| 1515 | bool l3flag; |
| 1516 | u8 flags; |
| 1517 | |
| 1518 | if (optlen < sizeof(cmd)) |
| 1519 | return -EINVAL; |
| 1520 | |
| 1521 | if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
| 1522 | return -EFAULT; |
| 1523 | |
| 1524 | if (sin->sin_family != AF_INET) |
| 1525 | return -EINVAL; |
| 1526 | |
| 1527 | flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; |
| 1528 | l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; |
| 1529 | |
| 1530 | if (optname == TCP_MD5SIG_EXT && |
| 1531 | cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { |
| 1532 | prefixlen = cmd.tcpm_prefixlen; |
| 1533 | if (prefixlen > 32) |
| 1534 | return -EINVAL; |
| 1535 | } |
| 1536 | |
| 1537 | if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && |
| 1538 | cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { |
| 1539 | struct net_device *dev; |
| 1540 | |
| 1541 | rcu_read_lock(); |
| 1542 | dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
| 1543 | if (dev && netif_is_l3_master(dev)) |
| 1544 | l3index = dev->ifindex; |
| 1545 | |
| 1546 | rcu_read_unlock(); |
| 1547 | |
| 1548 | /* ok to reference set/not set outside of rcu; |
| 1549 | * right now device MUST be an L3 master |
| 1550 | */ |
| 1551 | if (!dev || !l3index) |
| 1552 | return -EINVAL; |
| 1553 | } |
| 1554 | |
| 1555 | addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; |
| 1556 | |
| 1557 | if (!cmd.tcpm_keylen) |
| 1558 | return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); |
| 1559 | |
| 1560 | if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) |
| 1561 | return -EINVAL; |
| 1562 | |
| 1563 | /* Don't allow keys for peers that have a matching TCP-AO key. |
| 1564 | * See the comment in tcp_ao_add_cmd() |
| 1565 | */ |
| 1566 | if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
| 1567 | return -EKEYREJECTED; |
| 1568 | |
| 1569 | return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, |
| 1570 | cmd.tcpm_key, cmd.tcpm_keylen);
| 1571 | } |
| 1572 | |
| 1573 | static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
| 1574 | __be32 daddr, __be32 saddr, |
| 1575 | const struct tcphdr *th, int nbytes) |
| 1576 | { |
| 1577 | struct { |
| 1578 | struct tcp4_pseudohdr ip; |
| 1579 | struct tcphdr tcp; |
| 1580 | } h; |
| 1581 | |
| 1582 | h.ip.saddr = saddr; |
| 1583 | h.ip.daddr = daddr; |
| 1584 | h.ip.pad = 0; |
| 1585 | h.ip.protocol = IPPROTO_TCP; |
| 1586 | h.ip.len = cpu_to_be16(nbytes); |
| 1587 | h.tcp = *th; |
| 1588 | h.tcp.check = 0; |
| 1589 | md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
| 1590 | } |
| 1591 | |
| 1592 | static noinline_for_stack void |
| 1593 | tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, |
| 1594 | __be32 daddr, __be32 saddr, const struct tcphdr *th) |
| 1595 | { |
| 1596 | struct md5_ctx ctx; |
| 1597 | |
| 1598 | md5_init(&ctx);
| 1599 | tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
| 1600 | tcp_md5_hash_key(&ctx, key);
| 1601 | md5_final(&ctx, md5_hash);
| 1602 | } |
| 1603 | |
| 1604 | noinline_for_stack void |
| 1605 | tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, |
| 1606 | const struct sock *sk, const struct sk_buff *skb) |
| 1607 | { |
| 1608 | const struct tcphdr *th = tcp_hdr(skb); |
| 1609 | __be32 saddr, daddr; |
| 1610 | struct md5_ctx ctx; |
| 1611 | |
| 1612 | if (sk) { /* valid for establish/request sockets */ |
| 1613 | saddr = sk->sk_rcv_saddr; |
| 1614 | daddr = sk->sk_daddr; |
| 1615 | } else { |
| 1616 | const struct iphdr *iph = ip_hdr(skb); |
| 1617 | saddr = iph->saddr; |
| 1618 | daddr = iph->daddr; |
| 1619 | } |
| 1620 | |
| 1621 | md5_init(&ctx);
| 1622 | tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
| 1623 | tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
| 1624 | tcp_md5_hash_key(&ctx, key);
| 1625 | md5_final(&ctx, md5_hash);
| 1626 | } |
| 1627 | EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); |
| 1628 | |
| 1629 | #endif |
| 1630 | |
| 1631 | static void tcp_v4_init_req(struct request_sock *req, |
| 1632 | const struct sock *sk_listener, |
| 1633 | struct sk_buff *skb) |
| 1634 | { |
| 1635 | struct inet_request_sock *ireq = inet_rsk(req);
| 1636 | struct net *net = sock_net(sk_listener);
| 1637 | |
| 1638 | sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
| 1639 | sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
| 1640 | RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); |
| 1641 | } |
| 1642 | |
| 1643 | static struct dst_entry *tcp_v4_route_req(const struct sock *sk, |
| 1644 | struct sk_buff *skb, |
| 1645 | struct flowi *fl, |
| 1646 | struct request_sock *req, |
| 1647 | u32 tw_isn) |
| 1648 | { |
| 1649 | tcp_v4_init_req(req, sk, skb);
| 1650 | |
| 1651 | if (security_inet_conn_request(sk, skb, req)) |
| 1652 | return NULL; |
| 1653 | |
| 1654 | return inet_csk_route_req(sk, &fl->u.ip4, req);
| 1655 | } |
| 1656 | |
| 1657 | struct request_sock_ops tcp_request_sock_ops __read_mostly = { |
| 1658 | .family = PF_INET, |
| 1659 | .obj_size = sizeof(struct tcp_request_sock), |
| 1660 | .send_ack = tcp_v4_reqsk_send_ack, |
| 1661 | .destructor = tcp_v4_reqsk_destructor, |
| 1662 | .send_reset = tcp_v4_send_reset, |
| 1663 | }; |
| 1664 | |
| 1665 | const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { |
| 1666 | .mss_clamp = TCP_MSS_DEFAULT, |
| 1667 | #ifdef CONFIG_TCP_MD5SIG |
| 1668 | .req_md5_lookup = tcp_v4_md5_lookup, |
| 1669 | .calc_md5_hash = tcp_v4_md5_hash_skb, |
| 1670 | #endif |
| 1671 | #ifdef CONFIG_TCP_AO |
| 1672 | .ao_lookup = tcp_v4_ao_lookup_rsk, |
| 1673 | .ao_calc_key = tcp_v4_ao_calc_key_rsk, |
| 1674 | .ao_synack_hash = tcp_v4_ao_synack_hash, |
| 1675 | #endif |
| 1676 | #ifdef CONFIG_SYN_COOKIES |
| 1677 | .cookie_init_seq = cookie_v4_init_sequence, |
| 1678 | #endif |
| 1679 | .route_req = tcp_v4_route_req, |
| 1680 | .init_seq = tcp_v4_init_seq, |
| 1681 | .init_ts_off = tcp_v4_init_ts_off, |
| 1682 | .send_synack = tcp_v4_send_synack, |
| 1683 | }; |
| 1684 | |
| 1685 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) |
| 1686 | { |
| 1687 | /* Never answer SYNs sent to broadcast or multicast addresses */
| 1688 | if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) |
| 1689 | goto drop; |
| 1690 | |
| 1691 | return tcp_conn_request(&tcp_request_sock_ops,
| 1692 | &tcp_request_sock_ipv4_ops, sk, skb);
| 1693 | |
| 1694 | drop: |
| 1695 | tcp_listendrop(sk); |
| 1696 | return 0; |
| 1697 | } |
| 1698 | EXPORT_IPV6_MOD(tcp_v4_conn_request); |
| 1699 | |
| 1700 | |
| 1701 | /* |
| 1702 | * The three way handshake has completed - we got a valid synack - |
| 1703 | * now create the new socket. |
| 1704 | */ |
| 1705 | struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, |
| 1706 | struct request_sock *req, |
| 1707 | struct dst_entry *dst, |
| 1708 | struct request_sock *req_unhash, |
| 1709 | bool *own_req) |
| 1710 | { |
| 1711 | struct inet_request_sock *ireq; |
| 1712 | bool found_dup_sk = false; |
| 1713 | struct inet_sock *newinet; |
| 1714 | struct tcp_sock *newtp; |
| 1715 | struct sock *newsk; |
| 1716 | #ifdef CONFIG_TCP_MD5SIG |
| 1717 | const union tcp_md5_addr *addr; |
| 1718 | struct tcp_md5sig_key *key; |
| 1719 | int l3index; |
| 1720 | #endif |
| 1721 | struct ip_options_rcu *inet_opt; |
| 1722 | |
| 1723 | if (sk_acceptq_is_full(sk)) |
| 1724 | goto exit_overflow; |
| 1725 | |
| 1726 | newsk = tcp_create_openreq_child(sk, req, skb); |
| 1727 | if (!newsk) |
| 1728 | goto exit_nonewsk; |
| 1729 | |
| 1730 | newsk->sk_gso_type = SKB_GSO_TCPV4; |
| 1731 | inet_sk_rx_dst_set(newsk, skb);
| 1732 | |
| 1733 | newtp = tcp_sk(newsk); |
| 1734 | newinet = inet_sk(newsk); |
| 1735 | ireq = inet_rsk(req);
| 1736 | inet_opt = rcu_dereference(ireq->ireq_opt); |
| 1737 | RCU_INIT_POINTER(newinet->inet_opt, inet_opt); |
| 1738 | newinet->mc_index = inet_iif(skb); |
| 1739 | newinet->mc_ttl = ip_hdr(skb)->ttl; |
| 1740 | newinet->rcv_tos = ip_hdr(skb)->tos; |
| 1741 | inet_csk(newsk)->icsk_ext_hdr_len = 0; |
| 1742 | if (inet_opt) |
| 1743 | inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; |
| 1744 | atomic_set(&newinet->inet_id, get_random_u16());
| 1745 | |
| 1746 | /* Set ToS of the new socket based upon the value of incoming SYN. |
| 1747 | * ECT bits are set later in tcp_init_transfer(). |
| 1748 | */ |
| 1749 | if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) |
| 1750 | newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; |
| 1751 | |
| 1752 | if (!dst) { |
| 1753 | dst = inet_csk_route_child_sock(sk, newsk, req); |
| 1754 | if (!dst) |
| 1755 | goto put_and_exit; |
| 1756 | } else { |
| 1757 | /* syncookie case : see end of cookie_v4_check() */ |
| 1758 | } |
| 1759 | sk_setup_caps(newsk, dst);
| 1760 |
| 1761 | tcp_ca_openreq_child(newsk, dst);
| 1762 |
| 1763 | tcp_sync_mss(newsk, dst_mtu(dst));
| 1764 | newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
| 1765 |
| 1766 | tcp_initialize_rcv_mss(newsk);
| 1767 |
| 1768 | #ifdef CONFIG_TCP_MD5SIG
| 1769 | l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
| 1770 | /* Copy over the MD5 key from the original socket */ |
| 1771 | addr = (union tcp_md5_addr *)&newinet->inet_daddr; |
| 1772 | key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); |
| 1773 | if (key && !tcp_rsk_used_ao(req)) { |
| 1774 | if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
| 1775 | goto put_and_exit;
| 1776 | sk_gso_disable(newsk);
| 1777 | } |
| 1778 | #endif |
| 1779 | #ifdef CONFIG_TCP_AO |
| 1780 | if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) |
| 1781 | goto put_and_exit; /* OOM, release back memory */ |
| 1782 | #endif |
| 1783 | |
| 1784 | if (__inet_inherit_port(sk, newsk) < 0)
| 1785 | goto put_and_exit; |
| 1786 | *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
| 1787 | &found_dup_sk);
| 1788 | if (likely(*own_req)) {
| 1789 | tcp_move_syn(newtp, req);
| 1790 | ireq->ireq_opt = NULL; |
| 1791 | } else { |
| 1792 | newinet->inet_opt = NULL; |
| 1793 | |
| 1794 | if (!req_unhash && found_dup_sk) { |
| 1795 | /* This code path should only be executed in the
| 1796 | * syncookie case
| 1797 | */
| 1798 | bh_unlock_sock(newsk);
| 1799 | sock_put(newsk);
| 1800 | newsk = NULL; |
| 1801 | } |
| 1802 | } |
| 1803 | return newsk; |
| 1804 | |
| 1805 | exit_overflow: |
| 1806 | NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); |
| 1807 | exit_nonewsk: |
| 1808 | dst_release(dst); |
| 1809 | exit: |
| 1810 | tcp_listendrop(sk); |
| 1811 | return NULL; |
| 1812 | put_and_exit: |
| 1813 | newinet->inet_opt = NULL; |
| 1814 | inet_csk_prepare_forced_close(newsk);
| 1815 | tcp_done(newsk);
| 1816 | goto exit; |
| 1817 | } |
| 1818 | EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); |
| 1819 | |
| 1820 | static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) |
| 1821 | { |
| 1822 | #ifdef CONFIG_SYN_COOKIES |
| 1823 | const struct tcphdr *th = tcp_hdr(skb); |
| 1824 | |
| 1825 | if (!th->syn) |
| 1826 | sk = cookie_v4_check(sk, skb); |
| 1827 | #endif |
| 1828 | return sk; |
| 1829 | } |
| 1830 | |
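|  | /* Compute a SYN cookie (and the MSS value it encodes) for a listener without
|  |  * creating a request sock; returns 0 when syncookies are not available.
|  |  */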
| 1831 | u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, |
| 1832 | struct tcphdr *th, u32 *cookie) |
| 1833 | { |
| 1834 | u16 mss = 0; |
| 1835 | #ifdef CONFIG_SYN_COOKIES |
| 1836 | mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
| 1837 | &tcp_request_sock_ipv4_ops, sk, th);
| 1838 | if (mss) {
| 1839 | *cookie = __cookie_v4_init_sequence(iph, th, &mss);
| 1840 | tcp_synq_overflow(sk); |
| 1841 | } |
| 1842 | #endif |
| 1843 | return mss; |
| 1844 | } |
| 1845 | |
| 1846 | INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, |
| 1847 | u32)); |
| 1848 | /* The socket must have its spinlock held when we get
| 1849 | * here, unless it is a TCP_LISTEN socket. |
| 1850 | * |
| 1851 | * We have a potential double-lock case here, so even when |
| 1852 | * doing backlog processing we use the BH locking scheme. |
| 1853 | * This is because we cannot sleep with the original spinlock |
| 1854 | * held. |
| 1855 | */ |
| 1856 | int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) |
| 1857 | { |
| 1858 | enum skb_drop_reason reason; |
| 1859 | struct sock *rsk; |
| 1860 | |
| 1861 | reason = psp_sk_rx_policy_check(sk, skb); |
| 1862 | if (reason) |
| 1863 | goto err_discard; |
| 1864 | |
| 1865 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ |
| 1866 | struct dst_entry *dst; |
| 1867 | |
| 1868 | dst = rcu_dereference_protected(sk->sk_rx_dst, |
| 1869 | lockdep_sock_is_held(sk)); |
| 1870 | |
| 1871 | sock_rps_save_rxhash(sk, skb); |
| 1872 | sk_mark_napi_id(sk, skb); |
| 1873 | if (dst) { |
| 1874 | if (sk->sk_rx_dst_ifindex != skb->skb_iif || |
| 1875 | !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, |
| 1876 | dst, 0)) { |
| 1877 | RCU_INIT_POINTER(sk->sk_rx_dst, NULL); |
| 1878 | dst_release(dst); |
| 1879 | } |
| 1880 | } |
| 1881 | tcp_rcv_established(sk, skb); |
| 1882 | return 0; |
| 1883 | } |
| 1884 | |
| 1885 | if (tcp_checksum_complete(skb)) |
| 1886 | goto csum_err; |
| 1887 | |
| 1888 | if (sk->sk_state == TCP_LISTEN) { |
| 1889 | struct sock *nsk = tcp_v4_cookie_check(sk, skb); |
| 1890 | |
| 1891 | if (!nsk) |
| 1892 | return 0; |
| 1893 | if (nsk != sk) { |
| 1894 | reason = tcp_child_process(sk, nsk, skb);
| 1895 | if (reason) { |
| 1896 | rsk = nsk; |
| 1897 | goto reset; |
| 1898 | } |
| 1899 | return 0; |
| 1900 | } |
| 1901 | } else |
| 1902 | sock_rps_save_rxhash(sk, skb); |
| 1903 | |
| 1904 | reason = tcp_rcv_state_process(sk, skb); |
| 1905 | if (reason) { |
| 1906 | rsk = sk; |
| 1907 | goto reset; |
| 1908 | } |
| 1909 | return 0; |
| 1910 | |
| 1911 | reset: |
| 1912 | tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
| 1913 | discard: |
| 1914 | sk_skb_reason_drop(sk, skb, reason); |
| 1915 | /* Be careful here. If this function gets more complicated and |
| 1916 | * gcc suffers from register pressure on the x86, sk (in %ebx) |
| 1917 | * might be destroyed here. This current version compiles correctly, |
| 1918 | * but you have been warned. |
| 1919 | */ |
| 1920 | return 0; |
| 1921 | |
| 1922 | csum_err: |
| 1923 | reason = SKB_DROP_REASON_TCP_CSUM; |
| 1924 | trace_tcp_bad_csum(skb); |
| 1925 | TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); |
| 1926 | err_discard: |
| 1927 | TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); |
| 1928 | goto discard; |
| 1929 | } |
| 1930 | EXPORT_SYMBOL(tcp_v4_do_rcv); |
| 1931 | |
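|  | /* Early demux from the IP receive path: look up an established socket for
|  |  * this packet so its cached rx dst can be reused and a route lookup avoided.
|  |  */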
| 1932 | int tcp_v4_early_demux(struct sk_buff *skb) |
| 1933 | { |
| 1934 | struct net *net = dev_net_rcu(skb->dev);
| 1935 | const struct iphdr *iph; |
| 1936 | const struct tcphdr *th; |
| 1937 | struct sock *sk; |
| 1938 | |
| 1939 | if (skb->pkt_type != PACKET_HOST) |
| 1940 | return 0; |
| 1941 | |
| 1942 | if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
| 1943 | return 0; |
| 1944 | |
| 1945 | iph = ip_hdr(skb); |
| 1946 | th = tcp_hdr(skb); |
| 1947 | |
| 1948 | if (th->doff < sizeof(struct tcphdr) / 4) |
| 1949 | return 0; |
| 1950 | |
| 1951 | sk = __inet_lookup_established(net, iph->saddr, th->source,
| 1952 | iph->daddr, ntohs(th->dest),
| 1953 | skb->skb_iif, inet_sdif(skb));
| 1954 | if (sk) { |
| 1955 | skb->sk = sk; |
| 1956 | skb->destructor = sock_edemux; |
| 1957 | if (sk_fullsock(sk)) { |
| 1958 | struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); |
| 1959 | |
| 1960 | if (dst) |
| 1961 | dst = dst_check(dst, 0);
| 1962 | if (dst && |
| 1963 | sk->sk_rx_dst_ifindex == skb->skb_iif) |
| 1964 | skb_dst_set_noref(skb, dst); |
| 1965 | } |
| 1966 | } |
| 1967 | return 0; |
| 1968 | } |
| 1969 | |
| 1970 | bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, |
| 1971 | enum skb_drop_reason *reason) |
| 1972 | { |
| 1973 | u32 tail_gso_size, tail_gso_segs; |
| 1974 | struct skb_shared_info *shinfo; |
| 1975 | const struct tcphdr *th; |
| 1976 | struct tcphdr *thtail; |
| 1977 | struct sk_buff *tail; |
| 1978 | unsigned int hdrlen; |
| 1979 | bool fragstolen; |
| 1980 | u32 gso_segs; |
| 1981 | u32 gso_size; |
| 1982 | u64 limit; |
| 1983 | int delta; |
| 1984 | int err; |
| 1985 | |
| 1986 | /* In case all data was pulled from skb frags (in __pskb_pull_tail()), |
| 1987 | * we can fix skb->truesize to its real value to avoid future drops. |
| 1988 | * This is valid because skb is not yet charged to the socket. |
| 1989 | * It has been noticed pure SACK packets were sometimes dropped |
| 1990 | * (if cooked by drivers without copybreak feature). |
| 1991 | */ |
| 1992 | skb_condense(skb); |
| 1993 | |
| 1994 | tcp_cleanup_skb(skb); |
| 1995 | |
| 1996 | if (unlikely(tcp_checksum_complete(skb))) { |
| 1997 | bh_unlock_sock(sk); |
| 1998 | trace_tcp_bad_csum(skb); |
| 1999 | *reason = SKB_DROP_REASON_TCP_CSUM; |
| 2000 | __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); |
| 2001 | __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); |
| 2002 | return true; |
| 2003 | } |
| 2004 | |
| 2005 | /* Attempt coalescing to last skb in backlog, even if we are |
| 2006 | * above the limits. |
| 2007 | * This is okay because skb capacity is limited to MAX_SKB_FRAGS. |
| 2008 | */ |
| 2009 | th = (const struct tcphdr *)skb->data; |
| 2010 | hdrlen = th->doff * 4; |
| 2011 | |
| 2012 | tail = sk->sk_backlog.tail; |
| 2013 | if (!tail) |
| 2014 | goto no_coalesce; |
| 2015 | thtail = (struct tcphdr *)tail->data; |
| 2016 | |
| 2017 | if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || |
| 2018 | TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || |
| 2019 | ((TCP_SKB_CB(tail)->tcp_flags | |
| 2020 | TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || |
| 2021 | !((TCP_SKB_CB(tail)->tcp_flags & |
| 2022 | TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || |
| 2023 | ((TCP_SKB_CB(tail)->tcp_flags ^ |
| 2024 | TCP_SKB_CB(skb)->tcp_flags) & |
| 2025 | (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || |
| 2026 | !tcp_skb_can_collapse_rx(tail, skb) ||
| 2027 | thtail->doff != th->doff ||
| 2028 | memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
| 2029 | /* prior to PSP Rx policy check, retain exact PSP metadata */
| 2030 | psp_skb_coalesce_diff(tail, skb))
| 2031 | goto no_coalesce; |
| 2032 | |
| 2033 | __skb_pull(skb, hdrlen);
| 2034 | |
| 2035 | shinfo = skb_shinfo(skb); |
| 2036 | gso_size = shinfo->gso_size ?: skb->len; |
| 2037 | gso_segs = shinfo->gso_segs ?: 1; |
| 2038 | |
| 2039 | shinfo = skb_shinfo(tail); |
| 2040 | tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); |
| 2041 | tail_gso_segs = shinfo->gso_segs ?: 1; |
| 2042 | |
| 2043 | if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
| 2044 | TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; |
| 2045 | |
| 2046 | if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { |
| 2047 | TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; |
| 2048 | thtail->window = th->window; |
| 2049 | } |
| 2050 | |
| 2051 | /* We have to update both TCP_SKB_CB(tail)->tcp_flags and |
| 2052 | * thtail->fin, so that the fast path in tcp_rcv_established() |
| 2053 | * is not entered if we append a packet with a FIN. |
| 2054 | * SYN, RST, URG are not present. |
| 2055 | * ACK is set on both packets. |
| 2056 | * PSH : we do not really care in TCP stack, |
| 2057 | * at least for 'GRO' packets. |
| 2058 | */ |
| 2059 | thtail->fin |= th->fin; |
| 2060 | TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; |
| 2061 | |
| 2062 | if (TCP_SKB_CB(skb)->has_rxtstamp) { |
| 2063 | TCP_SKB_CB(tail)->has_rxtstamp = true; |
| 2064 | tail->tstamp = skb->tstamp; |
| 2065 | skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
| 2066 | } |
| 2067 | |
| 2068 | /* Not as strict as GRO. We only need to carry mss max value */ |
| 2069 | shinfo->gso_size = max(gso_size, tail_gso_size); |
| 2070 | shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); |
| 2071 | |
| 2072 | sk->sk_backlog.len += delta; |
| 2073 | __NET_INC_STATS(sock_net(sk), |
| 2074 | LINUX_MIB_TCPBACKLOGCOALESCE); |
| 2075 | kfree_skb_partial(skb, fragstolen);
| 2076 | return false; |
| 2077 | } |
| 2078 | __skb_push(skb, hdrlen);
| 2079 | |
| 2080 | no_coalesce: |
| 2081 | /* sk->sk_backlog.len is reset only at the end of __release_sock(). |
| 2082 | * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach |
| 2083 | * sk_rcvbuf in normal conditions. |
| 2084 | */ |
| 2085 | limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; |
| 2086 | |
| 2087 | limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; |
| 2088 | |
| 2089 | /* Only socket owner can try to collapse/prune rx queues |
| 2090 | * to reduce memory overhead, so add a little headroom here. |
| 2091 | * Only a few socket backlogs are likely to be non-empty concurrently.
| 2092 | */ |
| 2093 | limit += 64 * 1024; |
| 2094 | |
| 2095 | limit = min_t(u64, limit, UINT_MAX); |
| 2096 | |
| 2097 | err = sk_add_backlog(sk, skb, limit); |
| 2098 | if (unlikely(err)) { |
| 2099 | bh_unlock_sock(sk); |
| 2100 | if (err == -ENOMEM) { |
| 2101 | *reason = SKB_DROP_REASON_PFMEMALLOC; |
| 2102 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); |
| 2103 | } else { |
| 2104 | *reason = SKB_DROP_REASON_SOCKET_BACKLOG; |
| 2105 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); |
| 2106 | } |
| 2107 | return true; |
| 2108 | } |
| 2109 | return false; |
| 2110 | } |
| 2111 | EXPORT_IPV6_MOD(tcp_add_backlog); |
| 2112 | |
| 2113 | int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) |
| 2114 | { |
| 2115 | struct tcphdr *th = (struct tcphdr *)skb->data; |
| 2116 | |
| 2117 | return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
| 2118 | } |
| 2119 | EXPORT_IPV6_MOD(tcp_filter); |
| 2120 | |
| 2121 | static void tcp_v4_restore_cb(struct sk_buff *skb) |
| 2122 | { |
| 2123 | memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, |
| 2124 | sizeof(struct inet_skb_parm)); |
| 2125 | } |
| 2126 | |
| 2127 | static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, |
| 2128 | const struct tcphdr *th) |
| 2129 | { |
| 2130 | /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() |
| 2131 | * barrier() makes sure the compiler won't play fool^Waliasing games.
| 2132 | */ |
| 2133 | memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), |
| 2134 | sizeof(struct inet_skb_parm)); |
| 2135 | barrier(); |
| 2136 | |
| 2137 | TCP_SKB_CB(skb)->seq = ntohl(th->seq); |
| 2138 | TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + |
| 2139 | skb->len - th->doff * 4); |
| 2140 | TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); |
| 2141 | TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); |
| 2142 | TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); |
| 2143 | TCP_SKB_CB(skb)->sacked = 0; |
| 2144 | TCP_SKB_CB(skb)->has_rxtstamp = |
| 2145 | skb->tstamp || skb_hwtstamps(skb)->hwtstamp; |
| 2146 | } |
| 2147 | |
| 2148 | /* |
| 2149 | * From tcp_input.c |
| 2150 | */ |
| 2151 | |
| 2152 | int tcp_v4_rcv(struct sk_buff *skb) |
| 2153 | { |
| 2154 | struct net *net = dev_net_rcu(skb->dev);
| 2155 | enum skb_drop_reason drop_reason; |
| 2156 | enum tcp_tw_status tw_status; |
| 2157 | int sdif = inet_sdif(skb); |
| 2158 | int dif = inet_iif(skb); |
| 2159 | const struct iphdr *iph; |
| 2160 | const struct tcphdr *th; |
| 2161 | struct sock *sk = NULL; |
| 2162 | bool refcounted; |
| 2163 | int ret; |
| 2164 | u32 isn; |
| 2165 | |
| 2166 | drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; |
| 2167 | if (skb->pkt_type != PACKET_HOST) |
| 2168 | goto discard_it; |
| 2169 | |
| 2170 | /* Count it even if it's bad */ |
| 2171 | __TCP_INC_STATS(net, TCP_MIB_INSEGS); |
| 2172 | |
| 2173 | if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
| 2174 | goto discard_it; |
| 2175 | |
| 2176 | th = (const struct tcphdr *)skb->data; |
| 2177 | |
| 2178 | if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { |
| 2179 | drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; |
| 2180 | goto bad_packet; |
| 2181 | } |
| 2182 | if (!pskb_may_pull(skb, th->doff * 4))
| 2183 | goto discard_it; |
| 2184 | |
| 2185 | /* An explanation is required here, I think. |
| 2186 | * Packet length and doff are validated by header prediction, |
| 2187 | * provided case of th->doff==0 is eliminated. |
| 2188 | * So, we defer the checks. */ |
| 2189 | |
| 2190 | if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) |
| 2191 | goto csum_error; |
| 2192 | |
| 2193 | th = (const struct tcphdr *)skb->data; |
| 2194 | iph = ip_hdr(skb); |
| 2195 | lookup: |
| 2196 | sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
| 2197 | th->dest, sdif, &refcounted);
| 2198 | if (!sk) |
| 2199 | goto no_tcp_socket; |
| 2200 | |
| 2201 | if (sk->sk_state == TCP_TIME_WAIT) |
| 2202 | goto do_time_wait; |
| 2203 | |
| 2204 | if (sk->sk_state == TCP_NEW_SYN_RECV) { |
| 2205 | struct request_sock *req = inet_reqsk(sk); |
| 2206 | bool req_stolen = false; |
| 2207 | struct sock *nsk; |
| 2208 | |
| 2209 | sk = req->rsk_listener; |
| 2210 | if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
| 2211 | drop_reason = SKB_DROP_REASON_XFRM_POLICY;
| 2212 | else
| 2213 | drop_reason = tcp_inbound_hash(sk, req, skb,
| 2214 | &iph->saddr, &iph->daddr,
| 2215 | AF_INET, dif, sdif);
| 2216 | if (unlikely(drop_reason)) { |
| 2217 | sk_drops_skbadd(sk, skb); |
| 2218 | reqsk_put(req); |
| 2219 | goto discard_it; |
| 2220 | } |
| 2221 | if (tcp_checksum_complete(skb)) { |
| 2222 | reqsk_put(req); |
| 2223 | goto csum_error; |
| 2224 | } |
| 2225 | if (unlikely(sk->sk_state != TCP_LISTEN)) { |
| 2226 | nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
| 2227 | if (!nsk) { |
| 2228 | inet_csk_reqsk_queue_drop_and_put(sk, req); |
| 2229 | goto lookup; |
| 2230 | } |
| 2231 | sk = nsk; |
| 2232 | /* reuseport_migrate_sock() has already held one sk_refcnt |
| 2233 | * before returning. |
| 2234 | */ |
| 2235 | } else { |
| 2236 | /* We own a reference on the listener, increase it again |
| 2237 | * as we might lose it too soon. |
| 2238 | */ |
| 2239 | sock_hold(sk); |
| 2240 | } |
| 2241 | refcounted = true; |
| 2242 | nsk = NULL; |
| 2243 | if (!tcp_filter(sk, skb, &drop_reason)) {
| 2244 | th = (const struct tcphdr *)skb->data;
| 2245 | iph = ip_hdr(skb);
| 2246 | tcp_v4_fill_cb(skb, iph, th);
| 2247 | nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
| 2248 | &drop_reason);
| 2249 | } |
| 2250 | if (!nsk) { |
| 2251 | reqsk_put(req); |
| 2252 | if (req_stolen) { |
| 2253 | /* Another cpu got exclusive access to req |
| 2254 | * and created a full blown socket. |
| 2255 | * Try to feed this packet to this socket |
| 2256 | * instead of discarding it. |
| 2257 | */ |
| 2258 | tcp_v4_restore_cb(skb); |
| 2259 | sock_put(sk); |
| 2260 | goto lookup; |
| 2261 | } |
| 2262 | goto discard_and_relse; |
| 2263 | } |
| 2264 | nf_reset_ct(skb); |
| 2265 | if (nsk == sk) { |
| 2266 | reqsk_put(req); |
| 2267 | tcp_v4_restore_cb(skb); |
| 2268 | } else { |
| 2269 | drop_reason = tcp_child_process(sk, nsk, skb);
| 2270 | if (drop_reason) {
| 2271 | enum sk_rst_reason rst_reason;
| 2272 |
| 2273 | rst_reason = sk_rst_convert_drop_reason(drop_reason);
| 2274 | tcp_v4_send_reset(nsk, skb, rst_reason);
| 2275 | goto discard_and_relse; |
| 2276 | } |
| 2277 | sock_put(sk); |
| 2278 | return 0; |
| 2279 | } |
| 2280 | } |
| 2281 | |
| 2282 | process: |
| 2283 | if (static_branch_unlikely(&ip4_min_ttl)) { |
| 2284 | /* min_ttl can be changed concurrently from do_ip_setsockopt() */ |
| 2285 | if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { |
| 2286 | __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); |
| 2287 | drop_reason = SKB_DROP_REASON_TCP_MINTTL; |
| 2288 | goto discard_and_relse; |
| 2289 | } |
| 2290 | } |
| 2291 | |
| 2292 | if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
| 2293 | drop_reason = SKB_DROP_REASON_XFRM_POLICY; |
| 2294 | goto discard_and_relse; |
| 2295 | } |
| 2296 | |
| 2297 | drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
| 2298 | AF_INET, dif, sdif); |
| 2299 | if (drop_reason) |
| 2300 | goto discard_and_relse; |
| 2301 | |
| 2302 | nf_reset_ct(skb); |
| 2303 | |
| 2304 | if (tcp_filter(sk, skb, &drop_reason))
| 2305 | goto discard_and_relse; |
| 2306 | |
| 2307 | th = (const struct tcphdr *)skb->data; |
| 2308 | iph = ip_hdr(skb); |
| 2309 | tcp_v4_fill_cb(skb, iph, th); |
| 2310 | |
| 2311 | skb->dev = NULL; |
| 2312 | |
| 2313 | if (sk->sk_state == TCP_LISTEN) { |
| 2314 | ret = tcp_v4_do_rcv(sk, skb); |
| 2315 | goto put_and_return; |
| 2316 | } |
| 2317 | |
| 2318 | sk_incoming_cpu_update(sk); |
| 2319 | |
| 2320 | bh_lock_sock_nested(sk); |
| 2321 | tcp_segs_in(tcp_sk(sk), skb); |
| 2322 | ret = 0; |
| 2323 | if (!sock_owned_by_user(sk)) { |
| 2324 | ret = tcp_v4_do_rcv(sk, skb); |
| 2325 | } else { |
| 2326 | if (tcp_add_backlog(sk, skb, &drop_reason))
| 2327 | goto discard_and_relse; |
| 2328 | } |
| 2329 | bh_unlock_sock(sk); |
| 2330 | |
| 2331 | put_and_return: |
| 2332 | if (refcounted) |
| 2333 | sock_put(sk); |
| 2334 | |
| 2335 | return ret; |
| 2336 | |
| 2337 | no_tcp_socket: |
| 2338 | drop_reason = SKB_DROP_REASON_NO_SOCKET; |
| 2339 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
| 2340 | goto discard_it; |
| 2341 | |
| 2342 | tcp_v4_fill_cb(skb, iph, th); |
| 2343 | |
| 2344 | if (tcp_checksum_complete(skb)) { |
| 2345 | csum_error: |
| 2346 | drop_reason = SKB_DROP_REASON_TCP_CSUM; |
| 2347 | trace_tcp_bad_csum(skb); |
| 2348 | __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); |
| 2349 | bad_packet: |
| 2350 | __TCP_INC_STATS(net, TCP_MIB_INERRS); |
| 2351 | } else { |
| 2352 | tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
| 2353 | } |
| 2354 | |
| 2355 | discard_it: |
| 2356 | SKB_DR_OR(drop_reason, NOT_SPECIFIED); |
| 2357 | /* Discard frame. */ |
| 2358 | sk_skb_reason_drop(sk, skb, drop_reason);
| 2359 | return 0; |
| 2360 | |
| 2361 | discard_and_relse: |
| 2362 | sk_drops_skbadd(sk, skb); |
| 2363 | if (refcounted) |
| 2364 | sock_put(sk); |
| 2365 | goto discard_it; |
| 2366 | |
| 2367 | do_time_wait: |
| 2368 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
| 2369 | drop_reason = SKB_DROP_REASON_XFRM_POLICY;
| 2370 | inet_twsk_put(inet_twsk(sk));
| 2371 | goto discard_it; |
| 2372 | } |
| 2373 | |
| 2374 | tcp_v4_fill_cb(skb, iph, th); |
| 2375 | |
| 2376 | if (tcp_checksum_complete(skb)) { |
| 2377 | inet_twsk_put(inet_twsk(sk));
| 2378 | goto csum_error;
| 2379 | }
| 2380 |
| 2381 | tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
| 2382 | &drop_reason);
| 2383 | switch (tw_status) { |
| 2384 | case TCP_TW_SYN: { |
| 2385 | struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
| 2386 | iph->saddr, th->source,
| 2387 | iph->daddr, th->dest,
| 2388 | inet_iif(skb),
| 2389 | sdif);
| 2390 | if (sk2) {
| 2391 | inet_twsk_deschedule_put(inet_twsk(sk));
| 2392 | sk = sk2; |
| 2393 | tcp_v4_restore_cb(skb); |
| 2394 | refcounted = false; |
| 2395 | __this_cpu_write(tcp_tw_isn, isn); |
| 2396 | goto process; |
| 2397 | } |
| 2398 | |
| 2399 | drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
| 2400 | if (drop_reason) |
| 2401 | break; |
| 2402 | } |
| 2403 | /* to ACK */ |
| 2404 | fallthrough; |
| 2405 | case TCP_TW_ACK: |
| 2406 | case TCP_TW_ACK_OOW: |
| 2407 | tcp_v4_timewait_ack(sk, skb, tw_status); |
| 2408 | break; |
| 2409 | case TCP_TW_RST: |
| 2410 | tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
| 2411 | inet_twsk_deschedule_put(inet_twsk(sk));
| 2412 | goto discard_it; |
| 2413 | case TCP_TW_SUCCESS:; |
| 2414 | } |
| 2415 | goto discard_it; |
| 2416 | } |
| 2417 | |
| 2418 | static struct timewait_sock_ops tcp_timewait_sock_ops = { |
| 2419 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), |
| 2420 | }; |
| 2421 | |
| 2422 | void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) |
| 2423 | { |
| 2424 | struct dst_entry *dst = skb_dst(skb); |
| 2425 | |
| 2426 | if (dst && dst_hold_safe(dst)) { |
| 2427 | rcu_assign_pointer(sk->sk_rx_dst, dst); |
| 2428 | sk->sk_rx_dst_ifindex = skb->skb_iif; |
| 2429 | } |
| 2430 | } |
| 2431 | EXPORT_IPV6_MOD(inet_sk_rx_dst_set); |
| 2432 | |
| 2433 | const struct inet_connection_sock_af_ops ipv4_specific = { |
| 2434 | .queue_xmit = ip_queue_xmit, |
| 2435 | .send_check = tcp_v4_send_check, |
| 2436 | .rebuild_header = inet_sk_rebuild_header, |
| 2437 | .sk_rx_dst_set = inet_sk_rx_dst_set, |
| 2438 | .conn_request = tcp_v4_conn_request, |
| 2439 | .syn_recv_sock = tcp_v4_syn_recv_sock, |
| 2440 | .net_header_len = sizeof(struct iphdr), |
| 2441 | .setsockopt = ip_setsockopt, |
| 2442 | .getsockopt = ip_getsockopt, |
| 2443 | .mtu_reduced = tcp_v4_mtu_reduced, |
| 2444 | }; |
| 2445 | EXPORT_IPV6_MOD(ipv4_specific); |
| 2446 | |
| 2447 | #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) |
| 2448 | static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { |
| 2449 | #ifdef CONFIG_TCP_MD5SIG |
| 2450 | .md5_lookup = tcp_v4_md5_lookup, |
| 2451 | .calc_md5_hash = tcp_v4_md5_hash_skb, |
| 2452 | .md5_parse = tcp_v4_parse_md5_keys, |
| 2453 | #endif |
| 2454 | #ifdef CONFIG_TCP_AO |
| 2455 | .ao_lookup = tcp_v4_ao_lookup, |
| 2456 | .calc_ao_hash = tcp_v4_ao_hash_skb, |
| 2457 | .ao_parse = tcp_v4_parse_ao, |
| 2458 | .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, |
| 2459 | #endif |
| 2460 | }; |
| 2461 | |
| 2462 | static void tcp4_destruct_sock(struct sock *sk) |
| 2463 | { |
| 2464 | tcp_md5_destruct_sock(sk); |
| 2465 | tcp_ao_destroy_sock(sk, false);
| 2466 | inet_sock_destruct(sk); |
| 2467 | } |
| 2468 | #endif |
| 2469 | |
| 2470 | /* NOTE: A lot of things set to zero explicitly by call to |
| 2471 | * sk_alloc() so need not be done here. |
| 2472 | */ |
| 2473 | static int tcp_v4_init_sock(struct sock *sk) |
| 2474 | { |
| 2475 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 2476 | |
| 2477 | tcp_init_sock(sk); |
| 2478 | |
| 2479 | icsk->icsk_af_ops = &ipv4_specific; |
| 2480 | |
| 2481 | #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) |
| 2482 | tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; |
| 2483 | sk->sk_destruct = tcp4_destruct_sock; |
| 2484 | #endif |
| 2485 | |
| 2486 | return 0; |
| 2487 | } |
| 2488 | |
| 2489 | static void tcp_release_user_frags(struct sock *sk) |
| 2490 | { |
| 2491 | #ifdef CONFIG_PAGE_POOL |
| 2492 | unsigned long index; |
| 2493 | void *netmem; |
| 2494 | |
| 2495 | xa_for_each(&sk->sk_user_frags, index, netmem) |
| 2496 | WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); |
| 2497 | #endif |
| 2498 | } |
| 2499 | |
| 2500 | void tcp_v4_destroy_sock(struct sock *sk) |
| 2501 | { |
| 2502 | struct tcp_sock *tp = tcp_sk(sk); |
| 2503 | |
| 2504 | tcp_release_user_frags(sk); |
| 2505 | |
| 2506 | xa_destroy(&sk->sk_user_frags); |
| 2507 | |
| 2508 | trace_tcp_destroy_sock(sk); |
| 2509 | |
| 2510 | tcp_clear_xmit_timers(sk); |
| 2511 | |
| 2512 | tcp_cleanup_congestion_control(sk); |
| 2513 | |
| 2514 | tcp_cleanup_ulp(sk); |
| 2515 | |
| 2516 | /* Clean up the write buffer. */
| 2517 | tcp_write_queue_purge(sk); |
| 2518 | |
| 2519 | /* Check if we want to disable active TFO */ |
| 2520 | tcp_fastopen_active_disable_ofo_check(sk); |
| 2521 | |
| 2522 | /* Cleans up our, hopefully empty, out_of_order_queue. */ |
| 2523 | skb_rbtree_purge(&tp->out_of_order_queue);
| 2524 | |
| 2525 | /* Clean up a referenced TCP bind bucket. */ |
| 2526 | if (inet_csk(sk)->icsk_bind_hash) |
| 2527 | inet_put_port(sk); |
| 2528 | |
| 2529 | BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); |
| 2530 | |
| 2531 | /* If socket is aborted during connect operation */ |
| 2532 | tcp_free_fastopen_req(tp); |
| 2533 | tcp_fastopen_destroy_cipher(sk); |
| 2534 | tcp_saved_syn_free(tp); |
| 2535 | |
| 2536 | sk_sockets_allocated_dec(sk); |
| 2537 | } |
| 2538 | EXPORT_IPV6_MOD(tcp_v4_destroy_sock); |
| 2539 | |
| 2540 | #ifdef CONFIG_PROC_FS |
| 2541 | /* Proc filesystem TCP sock list dumping. */ |
| 2542 | |
| 2543 | static unsigned short seq_file_family(const struct seq_file *seq); |
| 2544 | |
| 2545 | static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) |
| 2546 | { |
| 2547 | unsigned short family = seq_file_family(seq); |
| 2548 | |
| 2549 | /* AF_UNSPEC is used as a match all */ |
| 2550 | return ((family == AF_UNSPEC || family == sk->sk_family) && |
| 2551 | net_eq(sock_net(sk), seq_file_net(seq)));
| 2552 | } |
| 2553 | |
| 2554 | /* Find a non empty bucket (starting from st->bucket) |
| 2555 | * and return the first sk from it. |
| 2556 | */ |
| 2557 | static void *listening_get_first(struct seq_file *seq) |
| 2558 | { |
| 2559 | struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 2560 | struct tcp_iter_state *st = seq->private; |
| 2561 | |
| 2562 | st->offset = 0; |
| 2563 | for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { |
| 2564 | struct inet_listen_hashbucket *ilb2; |
| 2565 | struct hlist_nulls_node *node; |
| 2566 | struct sock *sk; |
| 2567 | |
| 2568 | ilb2 = &hinfo->lhash2[st->bucket]; |
| 2569 | if (hlist_nulls_empty(&ilb2->nulls_head))
| 2570 | continue; |
| 2571 | |
| 2572 | spin_lock(&ilb2->lock);
| 2573 | sk_nulls_for_each(sk, node, &ilb2->nulls_head) { |
| 2574 | if (seq_sk_match(seq, sk)) |
| 2575 | return sk; |
| 2576 | } |
| 2577 | spin_unlock(&ilb2->lock);
| 2578 | } |
| 2579 | |
| 2580 | return NULL; |
| 2581 | } |
| 2582 | |
| 2583 | /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). |
| 2584 | * If "cur" is the last one in the st->bucket, |
| 2585 | * call listening_get_first() to return the first sk of the next |
| 2586 | * non empty bucket. |
| 2587 | */ |
| 2588 | static void *listening_get_next(struct seq_file *seq, void *cur) |
| 2589 | { |
| 2590 | struct tcp_iter_state *st = seq->private; |
| 2591 | struct inet_listen_hashbucket *ilb2; |
| 2592 | struct hlist_nulls_node *node; |
| 2593 | struct inet_hashinfo *hinfo; |
| 2594 | struct sock *sk = cur; |
| 2595 | |
| 2596 | ++st->num; |
| 2597 | ++st->offset; |
| 2598 | |
| 2599 | sk = sk_nulls_next(sk); |
| 2600 | sk_nulls_for_each_from(sk, node) { |
| 2601 | if (seq_sk_match(seq, sk)) |
| 2602 | return sk; |
| 2603 | } |
| 2604 | |
| 2605 | hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 2606 | ilb2 = &hinfo->lhash2[st->bucket]; |
| 2607 | spin_unlock(&ilb2->lock);
| 2608 | ++st->bucket; |
| 2609 | return listening_get_first(seq); |
| 2610 | } |
| 2611 | |
| 2612 | static void *listening_get_idx(struct seq_file *seq, loff_t *pos) |
| 2613 | { |
| 2614 | struct tcp_iter_state *st = seq->private; |
| 2615 | void *rc; |
| 2616 | |
| 2617 | st->bucket = 0; |
| 2618 | st->offset = 0; |
| 2619 | rc = listening_get_first(seq); |
| 2620 | |
| 2621 | while (rc && *pos) { |
| 2622 | rc = listening_get_next(seq, rc);
| 2623 | --*pos; |
| 2624 | } |
| 2625 | return rc; |
| 2626 | } |
| 2627 | |
| 2628 | static inline bool empty_bucket(struct inet_hashinfo *hinfo, |
| 2629 | const struct tcp_iter_state *st) |
| 2630 | { |
| 2631 | return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
| 2632 | } |
| 2633 | |
| 2634 | /* |
| 2635 | * Get first established socket starting from bucket given in st->bucket. |
| 2636 | * If st->bucket is zero, the very first socket in the hash is returned. |
| 2637 | */ |
| 2638 | static void *established_get_first(struct seq_file *seq) |
| 2639 | { |
| 2640 | struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 2641 | struct tcp_iter_state *st = seq->private; |
| 2642 | |
| 2643 | st->offset = 0; |
| 2644 | for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { |
| 2645 | struct sock *sk; |
| 2646 | struct hlist_nulls_node *node; |
| 2647 | spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
| 2648 | |
| 2649 | cond_resched(); |
| 2650 | |
| 2651 | /* Lockless fast path for the common case of empty buckets */ |
| 2652 | if (empty_bucket(hinfo, st)) |
| 2653 | continue; |
| 2654 | |
| 2655 | spin_lock_bh(lock); |
| 2656 | sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { |
| 2657 | if (seq_sk_match(seq, sk)) |
| 2658 | return sk; |
| 2659 | } |
| 2660 | spin_unlock_bh(lock); |
| 2661 | } |
| 2662 | |
| 2663 | return NULL; |
| 2664 | } |
| 2665 | |
| 2666 | static void *established_get_next(struct seq_file *seq, void *cur) |
| 2667 | { |
| 2668 | struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 2669 | struct tcp_iter_state *st = seq->private; |
| 2670 | struct hlist_nulls_node *node; |
| 2671 | struct sock *sk = cur; |
| 2672 | |
| 2673 | ++st->num; |
| 2674 | ++st->offset; |
| 2675 | |
| 2676 | sk = sk_nulls_next(sk); |
| 2677 | |
| 2678 | sk_nulls_for_each_from(sk, node) { |
| 2679 | if (seq_sk_match(seq, sk)) |
| 2680 | return sk; |
| 2681 | } |
| 2682 | |
| 2683 | spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
| 2684 | ++st->bucket; |
| 2685 | return established_get_first(seq); |
| 2686 | } |
| 2687 | |
| 2688 | static void *established_get_idx(struct seq_file *seq, loff_t pos) |
| 2689 | { |
| 2690 | struct tcp_iter_state *st = seq->private; |
| 2691 | void *rc; |
| 2692 | |
| 2693 | st->bucket = 0; |
| 2694 | rc = established_get_first(seq); |
| 2695 | |
| 2696 | while (rc && pos) { |
| 2697 | rc = established_get_next(seq, rc);
| 2698 | --pos; |
| 2699 | } |
| 2700 | return rc; |
| 2701 | } |
| 2702 | |
| 2703 | static void *tcp_get_idx(struct seq_file *seq, loff_t pos) |
| 2704 | { |
| 2705 | void *rc; |
| 2706 | struct tcp_iter_state *st = seq->private; |
| 2707 | |
| 2708 | st->state = TCP_SEQ_STATE_LISTENING; |
| 2709 | rc = listening_get_idx(seq, &pos);
| 2710 | |
| 2711 | if (!rc) { |
| 2712 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
| 2713 | rc = established_get_idx(seq, pos); |
| 2714 | } |
| 2715 | |
| 2716 | return rc; |
| 2717 | } |
| 2718 | |
| 2719 | static void *tcp_seek_last_pos(struct seq_file *seq) |
| 2720 | { |
| 2721 | struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 2722 | struct tcp_iter_state *st = seq->private; |
| 2723 | int bucket = st->bucket; |
| 2724 | int offset = st->offset; |
| 2725 | int orig_num = st->num; |
| 2726 | void *rc = NULL; |
| 2727 | |
| 2728 | switch (st->state) { |
| 2729 | case TCP_SEQ_STATE_LISTENING: |
| 2730 | if (st->bucket > hinfo->lhash2_mask) |
| 2731 | break; |
| 2732 | rc = listening_get_first(seq); |
| 2733 | while (offset-- && rc && bucket == st->bucket) |
| 2734 | rc = listening_get_next(seq, rc);
| 2735 | if (rc) |
| 2736 | break; |
| 2737 | st->bucket = 0; |
| 2738 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
| 2739 | fallthrough; |
| 2740 | case TCP_SEQ_STATE_ESTABLISHED: |
| 2741 | if (st->bucket > hinfo->ehash_mask) |
| 2742 | break; |
| 2743 | rc = established_get_first(seq); |
| 2744 | while (offset-- && rc && bucket == st->bucket) |
| 2745 | rc = established_get_next(seq, rc);
| 2746 | } |
| 2747 | |
| 2748 | st->num = orig_num; |
| 2749 | |
| 2750 | return rc; |
| 2751 | } |
| 2752 | |
| 2753 | void *tcp_seq_start(struct seq_file *seq, loff_t *pos) |
| 2754 | { |
| 2755 | struct tcp_iter_state *st = seq->private; |
| 2756 | void *rc; |
| 2757 | |
| 2758 | if (*pos && *pos == st->last_pos) { |
| 2759 | rc = tcp_seek_last_pos(seq); |
| 2760 | if (rc) |
| 2761 | goto out; |
| 2762 | } |
| 2763 | |
| 2764 | st->state = TCP_SEQ_STATE_LISTENING; |
| 2765 | st->num = 0; |
| 2766 | st->bucket = 0; |
| 2767 | st->offset = 0; |
| 2768 | rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
| 2769 | |
| 2770 | out: |
| 2771 | st->last_pos = *pos; |
| 2772 | return rc; |
| 2773 | } |
| 2774 | EXPORT_IPV6_MOD(tcp_seq_start); |
| 2775 | |
| 2776 | void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
| 2777 | { |
| 2778 | struct tcp_iter_state *st = seq->private; |
| 2779 | void *rc = NULL; |
| 2780 | |
| 2781 | if (v == SEQ_START_TOKEN) { |
| 2782 | rc = tcp_get_idx(seq, 0);
| 2783 | goto out; |
| 2784 | } |
| 2785 | |
| 2786 | switch (st->state) { |
| 2787 | case TCP_SEQ_STATE_LISTENING: |
| 2788 | rc = listening_get_next(seq, v);
| 2789 | if (!rc) { |
| 2790 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
| 2791 | st->bucket = 0; |
| 2792 | st->offset = 0; |
| 2793 | rc = established_get_first(seq); |
| 2794 | } |
| 2795 | break; |
| 2796 | case TCP_SEQ_STATE_ESTABLISHED: |
| 2797 | rc = established_get_next(seq, v);
| 2798 | break; |
| 2799 | } |
| 2800 | out: |
| 2801 | ++*pos; |
| 2802 | st->last_pos = *pos; |
| 2803 | return rc; |
| 2804 | } |
| 2805 | EXPORT_IPV6_MOD(tcp_seq_next); |
| 2806 | |
| 2807 | void tcp_seq_stop(struct seq_file *seq, void *v) |
| 2808 | { |
| 2809 | struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 2810 | struct tcp_iter_state *st = seq->private; |
| 2811 | |
| 2812 | switch (st->state) { |
| 2813 | case TCP_SEQ_STATE_LISTENING: |
| 2814 | if (v != SEQ_START_TOKEN) |
| 2815 | spin_unlock(&hinfo->lhash2[st->bucket].lock);
| 2816 | break; |
| 2817 | case TCP_SEQ_STATE_ESTABLISHED: |
| 2818 | if (v) |
| 2819 | spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
| 2820 | break; |
| 2821 | } |
| 2822 | } |
| 2823 | EXPORT_IPV6_MOD(tcp_seq_stop); |
| 2824 | |
| 2825 | static void get_openreq4(const struct request_sock *req, |
| 2826 | struct seq_file *f, int i) |
| 2827 | { |
| 2828 | const struct inet_request_sock *ireq = inet_rsk(req);
| 2829 | long delta = req->rsk_timer.expires - jiffies; |
| 2830 | |
| 2831 | seq_printf(f, "%4d: %08X:%04X %08X:%04X"
| 2832 | " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
| 2833 | i, |
| 2834 | ireq->ir_loc_addr, |
| 2835 | ireq->ir_num, |
| 2836 | ireq->ir_rmt_addr, |
| 2837 | ntohs(ireq->ir_rmt_port), |
| 2838 | TCP_SYN_RECV, |
| 2839 | 0, 0, /* could print option size, but that is af dependent. */ |
| 2840 | 1, /* timers active (only the expire timer) */ |
| 2841 | jiffies_delta_to_clock_t(delta), |
| 2842 | req->num_timeout, |
| 2843 | from_kuid_munged(seq_user_ns(f),
| 2844 | sk_uid(req->rsk_listener)),
| 2845 | 0, /* non standard timer */ |
| 2846 | 0, /* open_requests have no inode */ |
| 2847 | 0, |
| 2848 | req); |
| 2849 | } |
| 2850 | |
| 2851 | static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) |
| 2852 | { |
| 2853 | int timer_active; |
| 2854 | unsigned long timer_expires; |
| 2855 | const struct tcp_sock *tp = tcp_sk(sk); |
| 2856 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 2857 | const struct inet_sock *inet = inet_sk(sk); |
| 2858 | const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; |
| 2859 | __be32 dest = inet->inet_daddr; |
| 2860 | __be32 src = inet->inet_rcv_saddr; |
| 2861 | __u16 destp = ntohs(inet->inet_dport); |
| 2862 | __u16 srcp = ntohs(inet->inet_sport); |
| 2863 | u8 icsk_pending; |
| 2864 | int rx_queue; |
| 2865 | int state; |
| 2866 | |
| 2867 | icsk_pending = smp_load_acquire(&icsk->icsk_pending); |
| 2868 | if (icsk_pending == ICSK_TIME_RETRANS || |
| 2869 | icsk_pending == ICSK_TIME_REO_TIMEOUT || |
| 2870 | icsk_pending == ICSK_TIME_LOSS_PROBE) { |
| 2871 | timer_active = 1; |
| 2872 | timer_expires = tcp_timeout_expires(sk); |
| 2873 | } else if (icsk_pending == ICSK_TIME_PROBE0) { |
| 2874 | timer_active = 4; |
| 2875 | timer_expires = tcp_timeout_expires(sk); |
| 2876 | } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
| 2877 | timer_active = 2; |
| 2878 | timer_expires = icsk->icsk_keepalive_timer.expires; |
| 2879 | } else { |
| 2880 | timer_active = 0; |
| 2881 | timer_expires = jiffies; |
| 2882 | } |
| 2883 | |
| 2884 | state = inet_sk_state_load(sk); |
| 2885 | if (state == TCP_LISTEN) |
| 2886 | rx_queue = READ_ONCE(sk->sk_ack_backlog); |
| 2887 | else |
| 2888 | /* Because we don't lock the socket, |
| 2889 | * we might find a transient negative value. |
| 2890 | */ |
| 2891 | rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - |
| 2892 | READ_ONCE(tp->copied_seq), 0); |
| 2893 | |
| 2894 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
| 2895 | "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
| 2896 | i, src, srcp, dest, destp, state, |
| 2897 | READ_ONCE(tp->write_seq) - tp->snd_una, |
| 2898 | rx_queue, |
| 2899 | timer_active, |
| 2900 | jiffies_delta_to_clock_t(timer_expires - jiffies),
| 2901 | READ_ONCE(icsk->icsk_retransmits),
| 2902 | from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
| 2903 | READ_ONCE(icsk->icsk_probes_out),
| 2904 | sock_i_ino(sk),
| 2905 | refcount_read(&sk->sk_refcnt), sk,
| 2906 | jiffies_to_clock_t(icsk->icsk_rto),
| 2907 | jiffies_to_clock_t(icsk->icsk_ack.ato),
| 2908 | (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), |
| 2909 | tcp_snd_cwnd(tp), |
| 2910 | state == TCP_LISTEN ? |
| 2911 | fastopenq->max_qlen : |
| 2912 | (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); |
| 2913 | } |
| 2914 | |
| 2915 | static void get_timewait4_sock(const struct inet_timewait_sock *tw, |
| 2916 | struct seq_file *f, int i) |
| 2917 | { |
| 2918 | long delta = tw->tw_timer.expires - jiffies; |
| 2919 | __be32 dest, src; |
| 2920 | __u16 destp, srcp; |
| 2921 | |
| 2922 | dest = tw->tw_daddr; |
| 2923 | src = tw->tw_rcv_saddr; |
| 2924 | destp = ntohs(tw->tw_dport); |
| 2925 | srcp = ntohs(tw->tw_sport); |
| 2926 | |
| 2927 | seq_printf(f, "%4d: %08X:%04X %08X:%04X"
| 2928 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
| 2929 | i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
| 2930 | 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
| 2931 | refcount_read(&tw->tw_refcnt), tw);
| 2932 | } |
| 2933 | |
| 2934 | #define TMPSZ 150 |
| 2935 | |
| 2936 | static int tcp4_seq_show(struct seq_file *seq, void *v) |
| 2937 | { |
| 2938 | struct tcp_iter_state *st; |
| 2939 | struct sock *sk = v; |
| 2940 | |
| 2941 | seq_setwidth(seq, TMPSZ - 1);
| 2942 | if (v == SEQ_START_TOKEN) {
| 2943 | seq_puts(seq, " sl local_address rem_address st tx_queue "
| 2944 | "rx_queue tr tm->when retrnsmt uid timeout "
| 2945 | "inode");
| 2946 | goto out; |
| 2947 | } |
| 2948 | st = seq->private; |
| 2949 | |
| 2950 | if (sk->sk_state == TCP_TIME_WAIT) |
| 2951 | get_timewait4_sock(v, seq, st->num);
| 2952 | else if (sk->sk_state == TCP_NEW_SYN_RECV)
| 2953 | get_openreq4(v, seq, st->num);
| 2954 | else
| 2955 | get_tcp4_sock(v, seq, st->num);
| 2956 | out:
| 2957 | seq_pad(seq, '\n');
| 2958 | return 0; |
| 2959 | } |
| 2960 | |
| 2961 | #ifdef CONFIG_BPF_SYSCALL |
| 2962 | union bpf_tcp_iter_batch_item { |
| 2963 | struct sock *sk; |
| 2964 | __u64 cookie; |
| 2965 | }; |
| 2966 | |
| 2967 | struct bpf_tcp_iter_state { |
| 2968 | struct tcp_iter_state state; |
| 2969 | unsigned int cur_sk; |
| 2970 | unsigned int end_sk; |
| 2971 | unsigned int max_sk; |
| 2972 | union bpf_tcp_iter_batch_item *batch; |
| 2973 | }; |
| 2974 | |
| 2975 | struct bpf_iter__tcp { |
| 2976 | __bpf_md_ptr(struct bpf_iter_meta *, meta); |
| 2977 | __bpf_md_ptr(struct sock_common *, sk_common); |
| 2978 | uid_t uid __aligned(8); |
| 2979 | }; |
| 2980 | |
| 2981 | static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, |
| 2982 | struct sock_common *sk_common, uid_t uid) |
| 2983 | { |
| 2984 | struct bpf_iter__tcp ctx; |
| 2985 | |
| 2986 | meta->seq_num--; /* skip SEQ_START_TOKEN */ |
| 2987 | ctx.meta = meta; |
| 2988 | ctx.sk_common = sk_common; |
| 2989 | ctx.uid = uid; |
| 2990 | return bpf_iter_run_prog(prog, &ctx);
| 2991 | } |
| 2992 | |
| 2993 | static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) |
| 2994 | { |
| 2995 | union bpf_tcp_iter_batch_item *item; |
| 2996 | unsigned int cur_sk = iter->cur_sk; |
| 2997 | __u64 cookie; |
| 2998 | |
| 2999 | /* Remember the cookies of the sockets we haven't seen yet, so we can |
| 3000 | * pick up where we left off next time around. |
| 3001 | */ |
| 3002 | while (cur_sk < iter->end_sk) { |
| 3003 | item = &iter->batch[cur_sk++]; |
| 3004 | cookie = sock_gen_cookie(item->sk);
| 3005 | sock_gen_put(item->sk);
| 3006 | item->cookie = cookie; |
| 3007 | } |
| 3008 | } |
| 3009 | |
| 3010 | static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, |
| 3011 | unsigned int new_batch_sz, gfp_t flags) |
| 3012 | { |
| 3013 | union bpf_tcp_iter_batch_item *new_batch; |
| 3014 | |
| 3015 | new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, |
| 3016 | flags | __GFP_NOWARN); |
| 3017 | if (!new_batch) |
| 3018 | return -ENOMEM; |
| 3019 | |
| 3020 | memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); |
| 3021 | kvfree(iter->batch);
| 3022 | iter->batch = new_batch; |
| 3023 | iter->max_sk = new_batch_sz; |
| 3024 | |
| 3025 | return 0; |
| 3026 | } |
| 3027 | |
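|  | /* Resume iteration within a bucket: starting from first_sk, return the first
|  |  * socket still present whose cookie matches one of the saved cookies.
|  |  */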
| 3028 | static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, |
| 3029 | union bpf_tcp_iter_batch_item *cookies, |
| 3030 | int n_cookies) |
| 3031 | { |
| 3032 | struct hlist_nulls_node *node; |
| 3033 | struct sock *sk; |
| 3034 | int i; |
| 3035 | |
| 3036 | for (i = 0; i < n_cookies; i++) { |
| 3037 | sk = first_sk; |
| 3038 | sk_nulls_for_each_from(sk, node) |
| 3039 | if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
| 3040 | return sk; |
| 3041 | } |
| 3042 | |
| 3043 | return NULL; |
| 3044 | } |
| 3045 | |
| 3046 | static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) |
| 3047 | { |
| 3048 | struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 3049 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3050 | struct tcp_iter_state *st = &iter->state; |
| 3051 | unsigned int find_cookie = iter->cur_sk; |
| 3052 | unsigned int end_cookie = iter->end_sk; |
| 3053 | int resume_bucket = st->bucket; |
| 3054 | struct sock *sk; |
| 3055 | |
| 3056 | if (end_cookie && find_cookie == end_cookie) |
| 3057 | ++st->bucket; |
| 3058 | |
| 3059 | sk = listening_get_first(seq); |
| 3060 | iter->cur_sk = 0; |
| 3061 | iter->end_sk = 0; |
| 3062 | |
| 3063 | if (sk && st->bucket == resume_bucket && end_cookie) { |
| 3064 | sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
| 3065 | end_cookie - find_cookie);
| 3066 | if (!sk) {
| 3067 | spin_unlock(&hinfo->lhash2[st->bucket].lock);
| 3068 | ++st->bucket; |
| 3069 | sk = listening_get_first(seq); |
| 3070 | } |
| 3071 | } |
| 3072 | |
| 3073 | return sk; |
| 3074 | } |
| 3075 | |
| 3076 | static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) |
| 3077 | { |
| 3078 | struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 3079 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3080 | struct tcp_iter_state *st = &iter->state; |
| 3081 | unsigned int find_cookie = iter->cur_sk; |
| 3082 | unsigned int end_cookie = iter->end_sk; |
| 3083 | int resume_bucket = st->bucket; |
| 3084 | struct sock *sk; |
| 3085 | |
| 3086 | if (end_cookie && find_cookie == end_cookie) |
| 3087 | ++st->bucket; |
| 3088 | |
| 3089 | sk = established_get_first(seq); |
| 3090 | iter->cur_sk = 0; |
| 3091 | iter->end_sk = 0; |
| 3092 | |
| 3093 | if (sk && st->bucket == resume_bucket && end_cookie) { |
| 3094 | sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
| 3095 | end_cookie - find_cookie);
| 3096 | if (!sk) {
| 3097 | spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
| 3098 | ++st->bucket; |
| 3099 | sk = established_get_first(seq); |
| 3100 | } |
| 3101 | } |
| 3102 | |
| 3103 | return sk; |
| 3104 | } |
| 3105 | |
| 3106 | static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) |
| 3107 | { |
| 3108 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3109 | struct tcp_iter_state *st = &iter->state; |
| 3110 | struct sock *sk = NULL; |
| 3111 | |
| 3112 | switch (st->state) { |
| 3113 | case TCP_SEQ_STATE_LISTENING: |
| 3114 | sk = bpf_iter_tcp_resume_listening(seq); |
| 3115 | if (sk) |
| 3116 | break; |
| 3117 | st->bucket = 0; |
| 3118 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
| 3119 | fallthrough; |
| 3120 | case TCP_SEQ_STATE_ESTABLISHED: |
| 3121 | sk = bpf_iter_tcp_resume_established(seq); |
| 3122 | break; |
| 3123 | } |
| 3124 | |
| 3125 | return sk; |
| 3126 | } |
| 3127 | |
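/* Batch the listening sockets of the current bucket, starting at *start_sk,
 * taking a reference on each batched socket.  The return value is the number
 * of matching sockets seen from *start_sk onwards; if not all of them fit in
 * iter->batch, *start_sk is updated to the first socket that was not batched.
 */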
| 3128 | static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, |
| 3129 | struct sock **start_sk) |
| 3130 | { |
| 3131 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3132 | struct hlist_nulls_node *node; |
| 3133 | unsigned int expected = 1; |
| 3134 | struct sock *sk; |
| 3135 | |
	sock_hold(*start_sk);
| 3137 | iter->batch[iter->end_sk++].sk = *start_sk; |
| 3138 | |
	sk = sk_nulls_next(*start_sk);
| 3140 | *start_sk = NULL; |
| 3141 | sk_nulls_for_each_from(sk, node) { |
| 3142 | if (seq_sk_match(seq, sk)) { |
| 3143 | if (iter->end_sk < iter->max_sk) { |
| 3144 | sock_hold(sk); |
| 3145 | iter->batch[iter->end_sk++].sk = sk; |
| 3146 | } else if (!*start_sk) { |
| 3147 | /* Remember where we left off. */ |
| 3148 | *start_sk = sk; |
| 3149 | } |
| 3150 | expected++; |
| 3151 | } |
| 3152 | } |
| 3153 | |
| 3154 | return expected; |
| 3155 | } |
| 3156 | |
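/* Same as bpf_iter_tcp_listening_batch(), but walks a bucket of the
 * established hash.
 */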
| 3157 | static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, |
| 3158 | struct sock **start_sk) |
| 3159 | { |
| 3160 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3161 | struct hlist_nulls_node *node; |
| 3162 | unsigned int expected = 1; |
| 3163 | struct sock *sk; |
| 3164 | |
	sock_hold(*start_sk);
| 3166 | iter->batch[iter->end_sk++].sk = *start_sk; |
| 3167 | |
	sk = sk_nulls_next(*start_sk);
| 3169 | *start_sk = NULL; |
| 3170 | sk_nulls_for_each_from(sk, node) { |
| 3171 | if (seq_sk_match(seq, sk)) { |
| 3172 | if (iter->end_sk < iter->max_sk) { |
| 3173 | sock_hold(sk); |
| 3174 | iter->batch[iter->end_sk++].sk = sk; |
| 3175 | } else if (!*start_sk) { |
| 3176 | /* Remember where we left off. */ |
| 3177 | *start_sk = sk; |
| 3178 | } |
| 3179 | expected++; |
| 3180 | } |
| 3181 | } |
| 3182 | |
| 3183 | return expected; |
| 3184 | } |
| 3185 | |
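/* Batch the current bucket, from the listening or established hash depending
 * on the iterator state.
 */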
| 3186 | static unsigned int bpf_iter_fill_batch(struct seq_file *seq, |
| 3187 | struct sock **start_sk) |
| 3188 | { |
| 3189 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3190 | struct tcp_iter_state *st = &iter->state; |
| 3191 | |
| 3192 | if (st->state == TCP_SEQ_STATE_LISTENING) |
| 3193 | return bpf_iter_tcp_listening_batch(seq, start_sk); |
| 3194 | else |
| 3195 | return bpf_iter_tcp_established_batch(seq, start_sk); |
| 3196 | } |
| 3197 | |
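/* Release the lock of the bucket currently being batched. */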
| 3198 | static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) |
| 3199 | { |
| 3200 | struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; |
| 3201 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3202 | struct tcp_iter_state *st = &iter->state; |
| 3203 | |
| 3204 | if (st->state == TCP_SEQ_STATE_LISTENING) |
		spin_unlock(&hinfo->lhash2[st->bucket].lock);
| 3206 | else |
		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
| 3208 | } |
| 3209 | |
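/* Grab the next bucket's worth of sockets.  The first attempt reuses the
 * current batch array; if the bucket does not fit, the array is grown (1.5x)
 * outside the bucket lock and the bucket is re-batched; if it still does not
 * fit, a final GFP_NOWAIT resize is done while holding the bucket lock so the
 * bucket cannot change size underneath us.  Returns the first socket of the
 * batch, NULL when iteration is done, or an ERR_PTR() on allocation failure.
 */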
| 3210 | static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) |
| 3211 | { |
| 3212 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3213 | unsigned int expected; |
| 3214 | struct sock *sk; |
| 3215 | int err; |
| 3216 | |
| 3217 | sk = bpf_iter_tcp_resume(seq); |
| 3218 | if (!sk) |
| 3219 | return NULL; /* Done */ |
| 3220 | |
	expected = bpf_iter_fill_batch(seq, &sk);
| 3222 | if (likely(iter->end_sk == expected)) |
| 3223 | goto done; |
| 3224 | |
| 3225 | /* Batch size was too small. */ |
| 3226 | bpf_iter_tcp_unlock_bucket(seq); |
| 3227 | bpf_iter_tcp_put_batch(iter); |
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
| 3230 | if (err) |
		return ERR_PTR(err);
| 3232 | |
| 3233 | sk = bpf_iter_tcp_resume(seq); |
| 3234 | if (!sk) |
| 3235 | return NULL; /* Done */ |
| 3236 | |
	expected = bpf_iter_fill_batch(seq, &sk);
| 3238 | if (likely(iter->end_sk == expected)) |
| 3239 | goto done; |
| 3240 | |
| 3241 | /* Batch size was still too small. Hold onto the lock while we try |
| 3242 | * again with a larger batch to make sure the current bucket's size |
| 3243 | * does not change in the meantime. |
| 3244 | */ |
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
| 3246 | if (err) { |
| 3247 | bpf_iter_tcp_unlock_bucket(seq); |
		return ERR_PTR(err);
| 3249 | } |
| 3250 | |
	expected = bpf_iter_fill_batch(seq, &sk);
| 3252 | WARN_ON_ONCE(iter->end_sk != expected); |
| 3253 | done: |
| 3254 | bpf_iter_tcp_unlock_bucket(seq); |
| 3255 | return iter->batch[0].sk; |
| 3256 | } |
| 3257 | |
| 3258 | static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) |
| 3259 | { |
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
| 3263 | if (*pos) |
| 3264 | return bpf_iter_tcp_batch(seq); |
| 3265 | |
| 3266 | return SEQ_START_TOKEN; |
| 3267 | } |
| 3268 | |
| 3269 | static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
| 3270 | { |
| 3271 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3272 | struct tcp_iter_state *st = &iter->state; |
| 3273 | struct sock *sk; |
| 3274 | |
| 3275 | /* Whenever seq_next() is called, the iter->cur_sk is |
| 3276 | * done with seq_show(), so advance to the next sk in |
| 3277 | * the batch. |
| 3278 | */ |
| 3279 | if (iter->cur_sk < iter->end_sk) { |
| 3280 | /* Keeping st->num consistent in tcp_iter_state. |
| 3281 | * bpf_iter_tcp does not use st->num. |
| 3282 | * meta.seq_num is used instead. |
| 3283 | */ |
| 3284 | st->num++; |
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
| 3286 | } |
| 3287 | |
| 3288 | if (iter->cur_sk < iter->end_sk) |
| 3289 | sk = iter->batch[iter->cur_sk].sk; |
| 3290 | else |
| 3291 | sk = bpf_iter_tcp_batch(seq); |
| 3292 | |
| 3293 | ++*pos; |
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
	 */
| 3297 | st->last_pos = *pos; |
| 3298 | return sk; |
| 3299 | } |
| 3300 | |
| 3301 | static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) |
| 3302 | { |
| 3303 | struct bpf_iter_meta meta; |
| 3304 | struct bpf_prog *prog; |
| 3305 | struct sock *sk = v; |
| 3306 | uid_t uid; |
| 3307 | int ret; |
| 3308 | |
| 3309 | if (v == SEQ_START_TOKEN) |
| 3310 | return 0; |
| 3311 | |
| 3312 | if (sk_fullsock(sk)) |
| 3313 | lock_sock(sk); |
| 3314 | |
| 3315 | if (unlikely(sk_unhashed(sk))) { |
| 3316 | ret = SEQ_SKIP; |
| 3317 | goto unlock; |
| 3318 | } |
| 3319 | |
| 3320 | if (sk->sk_state == TCP_TIME_WAIT) { |
| 3321 | uid = 0; |
| 3322 | } else if (sk->sk_state == TCP_NEW_SYN_RECV) { |
| 3323 | const struct request_sock *req = v; |
| 3324 | |
		uid = from_kuid_munged(seq_user_ns(seq),
				       sk_uid(req->rsk_listener));
| 3327 | } else { |
		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
| 3329 | } |
| 3330 | |
| 3331 | meta.seq = seq; |
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);
| 3334 | |
| 3335 | unlock: |
| 3336 | if (sk_fullsock(sk)) |
| 3337 | release_sock(sk); |
	return ret;
}
| 3341 | |
| 3342 | static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) |
| 3343 | { |
| 3344 | struct bpf_tcp_iter_state *iter = seq->private; |
| 3345 | struct bpf_iter_meta meta; |
| 3346 | struct bpf_prog *prog; |
| 3347 | |
| 3348 | if (!v) { |
| 3349 | meta.seq = seq; |
		prog = bpf_iter_get_info(&meta, true);
| 3351 | if (prog) |
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
| 3353 | } |
| 3354 | |
| 3355 | if (iter->cur_sk < iter->end_sk) |
| 3356 | bpf_iter_tcp_put_batch(iter); |
| 3357 | } |
| 3358 | |
| 3359 | static const struct seq_operations bpf_iter_tcp_seq_ops = { |
| 3360 | .show = bpf_iter_tcp_seq_show, |
| 3361 | .start = bpf_iter_tcp_seq_start, |
| 3362 | .next = bpf_iter_tcp_seq_next, |
| 3363 | .stop = bpf_iter_tcp_seq_stop, |
| 3364 | }; |
| 3365 | #endif |
| 3366 | static unsigned short seq_file_family(const struct seq_file *seq) |
| 3367 | { |
| 3368 | const struct tcp_seq_afinfo *afinfo; |
| 3369 | |
| 3370 | #ifdef CONFIG_BPF_SYSCALL |
	/* Iterated from bpf_iter. Let the bpf prog filter instead. */
| 3372 | if (seq->op == &bpf_iter_tcp_seq_ops) |
| 3373 | return AF_UNSPEC; |
| 3374 | #endif |
| 3375 | |
| 3376 | /* Iterated from proc fs */ |
	afinfo = pde_data(file_inode(seq->file));
| 3378 | return afinfo->family; |
| 3379 | } |
| 3380 | |
| 3381 | static const struct seq_operations tcp4_seq_ops = { |
| 3382 | .show = tcp4_seq_show, |
| 3383 | .start = tcp_seq_start, |
| 3384 | .next = tcp_seq_next, |
| 3385 | .stop = tcp_seq_stop, |
| 3386 | }; |
| 3387 | |
| 3388 | static struct tcp_seq_afinfo tcp4_seq_afinfo = { |
| 3389 | .family = AF_INET, |
| 3390 | }; |
| 3391 | |
| 3392 | static int __net_init tcp4_proc_init_net(struct net *net) |
| 3393 | { |
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
| 3396 | return -ENOMEM; |
| 3397 | return 0; |
| 3398 | } |
| 3399 | |
| 3400 | static void __net_exit tcp4_proc_exit_net(struct net *net) |
| 3401 | { |
| 3402 | remove_proc_entry("tcp" , net->proc_net); |
| 3403 | } |
| 3404 | |
| 3405 | static struct pernet_operations tcp4_net_ops = { |
| 3406 | .init = tcp4_proc_init_net, |
| 3407 | .exit = tcp4_proc_exit_net, |
| 3408 | }; |
| 3409 | |
| 3410 | int __init tcp4_proc_init(void) |
| 3411 | { |
| 3412 | return register_pernet_subsys(&tcp4_net_ops); |
| 3413 | } |
| 3414 | |
| 3415 | void tcp4_proc_exit(void) |
| 3416 | { |
| 3417 | unregister_pernet_subsys(&tcp4_net_ops); |
| 3418 | } |
| 3419 | #endif /* CONFIG_PROC_FS */ |
| 3420 | |
| 3421 | /* @wake is one when sk_stream_write_space() calls us. |
 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
| 3423 | * This mimics the strategy used in sock_def_write_space(). |
| 3424 | */ |
| 3425 | bool tcp_stream_memory_free(const struct sock *sk, int wake) |
| 3426 | { |
| 3427 | const struct tcp_sock *tp = tcp_sk(sk); |
| 3428 | u32 notsent_bytes = READ_ONCE(tp->write_seq) - |
| 3429 | READ_ONCE(tp->snd_nxt); |
| 3430 | |
| 3431 | return (notsent_bytes << wake) < tcp_notsent_lowat(tp); |
| 3432 | } |
| 3433 | EXPORT_SYMBOL(tcp_stream_memory_free); |
| 3434 | |
| 3435 | struct proto tcp_prot = { |
| 3436 | .name = "TCP" , |
| 3437 | .owner = THIS_MODULE, |
| 3438 | .close = tcp_close, |
| 3439 | .pre_connect = tcp_v4_pre_connect, |
| 3440 | .connect = tcp_v4_connect, |
| 3441 | .disconnect = tcp_disconnect, |
| 3442 | .accept = inet_csk_accept, |
| 3443 | .ioctl = tcp_ioctl, |
| 3444 | .init = tcp_v4_init_sock, |
| 3445 | .destroy = tcp_v4_destroy_sock, |
| 3446 | .shutdown = tcp_shutdown, |
| 3447 | .setsockopt = tcp_setsockopt, |
| 3448 | .getsockopt = tcp_getsockopt, |
| 3449 | .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, |
| 3450 | .keepalive = tcp_set_keepalive, |
| 3451 | .recvmsg = tcp_recvmsg, |
| 3452 | .sendmsg = tcp_sendmsg, |
| 3453 | .splice_eof = tcp_splice_eof, |
| 3454 | .backlog_rcv = tcp_v4_do_rcv, |
| 3455 | .release_cb = tcp_release_cb, |
| 3456 | .hash = inet_hash, |
| 3457 | .unhash = inet_unhash, |
| 3458 | .get_port = inet_csk_get_port, |
| 3459 | .put_port = inet_put_port, |
| 3460 | #ifdef CONFIG_BPF_SYSCALL |
| 3461 | .psock_update_sk_prot = tcp_bpf_update_proto, |
| 3462 | #endif |
| 3463 | .enter_memory_pressure = tcp_enter_memory_pressure, |
| 3464 | .leave_memory_pressure = tcp_leave_memory_pressure, |
| 3465 | .stream_memory_free = tcp_stream_memory_free, |
| 3466 | .sockets_allocated = &tcp_sockets_allocated, |
| 3467 | |
| 3468 | .memory_allocated = &net_aligned_data.tcp_memory_allocated, |
| 3469 | .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, |
| 3470 | |
| 3471 | .memory_pressure = &tcp_memory_pressure, |
| 3472 | .sysctl_mem = sysctl_tcp_mem, |
| 3473 | .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), |
| 3474 | .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), |
| 3475 | .max_header = MAX_TCP_HEADER, |
| 3476 | .obj_size = sizeof(struct tcp_sock), |
| 3477 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
| 3478 | .twsk_prot = &tcp_timewait_sock_ops, |
| 3479 | .rsk_prot = &tcp_request_sock_ops, |
| 3480 | .h.hashinfo = NULL, |
| 3481 | .no_autobind = true, |
| 3482 | .diag_destroy = tcp_abort, |
| 3483 | }; |
| 3484 | EXPORT_SYMBOL(tcp_prot); |
| 3485 | |
| 3486 | static void __net_exit tcp_sk_exit(struct net *net) |
| 3487 | { |
| 3488 | if (net->ipv4.tcp_congestion_control) |
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
| 3491 | } |
| 3492 | |
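/* Give a child netns its own ehash when the netns creating it has set
 * sysctl_tcp_child_ehash_entries; otherwise, or on allocation failure, fall
 * back to the global tcp_hashinfo.  max_tw_buckets and max_syn_backlog are
 * then scaled from the resulting ehash size.
 */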
| 3493 | static void __net_init tcp_set_hashinfo(struct net *net) |
| 3494 | { |
| 3495 | struct inet_hashinfo *hinfo; |
| 3496 | unsigned int ehash_entries; |
| 3497 | struct net *old_net; |
| 3498 | |
	if (net_eq(net, &init_net))
| 3500 | goto fallback; |
| 3501 | |
| 3502 | old_net = current->nsproxy->net_ns; |
| 3503 | ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); |
| 3504 | if (!ehash_entries) |
| 3505 | goto fallback; |
| 3506 | |
| 3507 | ehash_entries = roundup_pow_of_two(ehash_entries); |
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
| 3509 | if (!hinfo) { |
| 3510 | pr_warn("Failed to allocate TCP ehash (entries: %u) " |
| 3511 | "for a netns, fallback to the global one\n" , |
| 3512 | ehash_entries); |
| 3513 | fallback: |
| 3514 | hinfo = &tcp_hashinfo; |
| 3515 | ehash_entries = tcp_hashinfo.ehash_mask + 1; |
| 3516 | } |
| 3517 | |
| 3518 | net->ipv4.tcp_death_row.hashinfo = hinfo; |
| 3519 | net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; |
| 3520 | net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); |
| 3521 | } |
| 3522 | |
| 3523 | static int __net_init tcp_sk_init(struct net *net) |
| 3524 | { |
| 3525 | net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; |
| 3526 | net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; |
| 3527 | net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; |
| 3528 | net->ipv4.sysctl_tcp_ecn_fallback = 1; |
| 3529 | |
| 3530 | net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; |
| 3531 | net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; |
| 3532 | net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; |
| 3533 | net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; |
| 3534 | net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; |
| 3535 | |
| 3536 | net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; |
| 3537 | net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; |
| 3538 | net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; |
| 3539 | |
| 3540 | net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; |
| 3541 | net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; |
| 3542 | net->ipv4.sysctl_tcp_syncookies = 1; |
| 3543 | net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; |
| 3544 | net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; |
| 3545 | net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; |
| 3546 | net->ipv4.sysctl_tcp_orphan_retries = 0; |
| 3547 | net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; |
| 3548 | net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; |
| 3549 | net->ipv4.sysctl_tcp_tw_reuse = 2; |
| 3550 | net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; |
| 3551 | net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; |
| 3552 | |
	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
| 3554 | tcp_set_hashinfo(net); |
| 3555 | |
| 3556 | net->ipv4.sysctl_tcp_sack = 1; |
| 3557 | net->ipv4.sysctl_tcp_window_scaling = 1; |
| 3558 | net->ipv4.sysctl_tcp_timestamps = 1; |
| 3559 | net->ipv4.sysctl_tcp_early_retrans = 3; |
| 3560 | net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; |
| 3561 | net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ |
| 3562 | net->ipv4.sysctl_tcp_retrans_collapse = 1; |
| 3563 | net->ipv4.sysctl_tcp_max_reordering = 300; |
| 3564 | net->ipv4.sysctl_tcp_dsack = 1; |
| 3565 | net->ipv4.sysctl_tcp_app_win = 31; |
| 3566 | net->ipv4.sysctl_tcp_adv_win_scale = 1; |
| 3567 | net->ipv4.sysctl_tcp_frto = 2; |
| 3568 | net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; |
| 3569 | net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; |
| 3570 | /* This limits the percentage of the congestion window which we |
| 3571 | * will allow a single TSO frame to consume. Building TSO frames |
| 3572 | * which are too large can cause TCP streams to be bursty. |
| 3573 | */ |
| 3574 | net->ipv4.sysctl_tcp_tso_win_divisor = 3; |
| 3575 | /* Default TSQ limit of 4 MB */ |
| 3576 | net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; |
| 3577 | |
| 3578 | /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ |
| 3579 | net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; |
| 3580 | |
| 3581 | net->ipv4.sysctl_tcp_min_tso_segs = 2; |
| 3582 | net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ |
| 3583 | net->ipv4.sysctl_tcp_min_rtt_wlen = 300; |
| 3584 | net->ipv4.sysctl_tcp_autocorking = 1; |
| 3585 | net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; |
| 3586 | net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; |
| 3587 | net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; |
| 3588 | if (net != &init_net) { |
| 3589 | memcpy(net->ipv4.sysctl_tcp_rmem, |
| 3590 | init_net.ipv4.sysctl_tcp_rmem, |
| 3591 | sizeof(init_net.ipv4.sysctl_tcp_rmem)); |
| 3592 | memcpy(net->ipv4.sysctl_tcp_wmem, |
| 3593 | init_net.ipv4.sysctl_tcp_wmem, |
| 3594 | sizeof(init_net.ipv4.sysctl_tcp_wmem)); |
| 3595 | } |
| 3596 | net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; |
| 3597 | net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; |
| 3598 | net->ipv4.sysctl_tcp_comp_sack_nr = 44; |
| 3599 | net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; |
| 3600 | net->ipv4.sysctl_tcp_backlog_ack_defer = 1; |
| 3601 | net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; |
| 3602 | net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; |
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
| 3604 | |
| 3605 | /* Set default values for PLB */ |
| 3606 | net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ |
| 3607 | net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; |
| 3608 | net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; |
| 3609 | net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; |
| 3610 | /* Default congestion threshold for PLB to mark a round is 50% */ |
| 3611 | net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; |
| 3612 | |
| 3613 | /* Reno is always built in */ |
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
| 3617 | net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; |
| 3618 | else |
| 3619 | net->ipv4.tcp_congestion_control = &tcp_reno; |
| 3620 | |
| 3621 | net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; |
| 3622 | net->ipv4.sysctl_tcp_shrink_window = 0; |
| 3623 | |
| 3624 | net->ipv4.sysctl_tcp_pingpong_thresh = 1; |
| 3625 | net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); |
| 3626 | net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; |
| 3627 | |
| 3628 | return 0; |
| 3629 | } |
| 3630 | |
| 3631 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) |
| 3632 | { |
| 3633 | struct net *net; |
| 3634 | |
	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
	 * and the failed setup_net() error-unwinding path are serialized.
	 *
	 * Because tcp_twsk_purge() handles twsk in any dead netns, not just
	 * those in net_exit_list, the thread that dismantles a particular twsk
	 * must do so without another thread progressing to
	 * refcount_dec_and_test() of tcp_death_row.tw_refcount.
	 */
| 3643 | mutex_lock(&tcp_exit_batch_mutex); |
| 3644 | |
| 3645 | tcp_twsk_purge(net_exit_list); |
| 3646 | |
| 3647 | list_for_each_entry(net, net_exit_list, exit_list) { |
		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
| 3649 | WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); |
| 3650 | tcp_fastopen_ctx_destroy(net); |
| 3651 | } |
| 3652 | |
	mutex_unlock(&tcp_exit_batch_mutex);
| 3654 | } |
| 3655 | |
| 3656 | static struct pernet_operations __net_initdata tcp_sk_ops = { |
| 3657 | .init = tcp_sk_init, |
| 3658 | .exit = tcp_sk_exit, |
| 3659 | .exit_batch = tcp_sk_exit_batch, |
| 3660 | }; |
| 3661 | |
| 3662 | #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) |
| 3663 | DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, |
| 3664 | struct sock_common *sk_common, uid_t uid) |
| 3665 | |
| 3666 | #define INIT_BATCH_SZ 16 |
| 3667 | |
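/* Per-iterator setup: initialize the netns-aware seq state and allocate an
 * initial batch of INIT_BATCH_SZ entries.
 */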
| 3668 | static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) |
| 3669 | { |
| 3670 | struct bpf_tcp_iter_state *iter = priv_data; |
| 3671 | int err; |
| 3672 | |
| 3673 | err = bpf_iter_init_seq_net(priv_data, aux); |
| 3674 | if (err) |
| 3675 | return err; |
| 3676 | |
| 3677 | err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); |
| 3678 | if (err) { |
| 3679 | bpf_iter_fini_seq_net(priv_data); |
| 3680 | return err; |
| 3681 | } |
| 3682 | |
| 3683 | return 0; |
| 3684 | } |
| 3685 | |
| 3686 | static void bpf_iter_fini_tcp(void *priv_data) |
| 3687 | { |
| 3688 | struct bpf_tcp_iter_state *iter = priv_data; |
| 3689 | |
| 3690 | bpf_iter_fini_seq_net(priv_data); |
	kvfree(iter->batch);
| 3692 | } |
| 3693 | |
| 3694 | static const struct bpf_iter_seq_info tcp_seq_info = { |
| 3695 | .seq_ops = &bpf_iter_tcp_seq_ops, |
| 3696 | .init_seq_private = bpf_iter_init_tcp, |
| 3697 | .fini_seq_private = bpf_iter_fini_tcp, |
| 3698 | .seq_priv_size = sizeof(struct bpf_tcp_iter_state), |
| 3699 | }; |
| 3700 | |
| 3701 | static const struct bpf_func_proto * |
| 3702 | bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, |
| 3703 | const struct bpf_prog *prog) |
| 3704 | { |
| 3705 | switch (func_id) { |
| 3706 | case BPF_FUNC_setsockopt: |
| 3707 | return &bpf_sk_setsockopt_proto; |
| 3708 | case BPF_FUNC_getsockopt: |
| 3709 | return &bpf_sk_getsockopt_proto; |
| 3710 | default: |
| 3711 | return NULL; |
| 3712 | } |
| 3713 | } |
| 3714 | |
| 3715 | static struct bpf_iter_reg tcp_reg_info = { |
| 3716 | .target = "tcp" , |
| 3717 | .ctx_arg_info_size = 1, |
| 3718 | .ctx_arg_info = { |
| 3719 | { offsetof(struct bpf_iter__tcp, sk_common), |
| 3720 | PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, |
| 3721 | }, |
| 3722 | .get_func_proto = bpf_iter_tcp_get_func_proto, |
| 3723 | .seq_info = &tcp_seq_info, |
| 3724 | }; |
| 3725 | |
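/* Make "tcp" available as a bpf_iter target, so programs attached with
 * SEC("iter/tcp") can walk every TCP socket through this seq_file interface.
 */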
| 3726 | static void __init bpf_iter_register(void) |
| 3727 | { |
| 3728 | tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; |
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
| 3731 | } |
| 3732 | |
| 3733 | #endif |
| 3734 | |
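/* Set up the per-cpu control sockets used when TCP must send RSTs and ACKs
 * without a full socket (SYN-RECV and TIME-WAIT handling), then register the
 * per-netns TCP initialisation.
 */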
| 3735 | void __init tcp_v4_init(void) |
| 3736 | { |
| 3737 | int cpu, res; |
| 3738 | |
| 3739 | for_each_possible_cpu(cpu) { |
| 3740 | struct sock *sk; |
| 3741 | |
		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
| 3744 | if (res) |
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
| 3747 | |
| 3748 | /* Please enforce IP_DF and IPID==0 for RST and |
| 3749 | * ACK sent in SYN-RECV and TIME-WAIT state. |
| 3750 | */ |
| 3751 | inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; |
| 3752 | |
| 3753 | sk->sk_clockid = CLOCK_MONOTONIC; |
| 3754 | |
| 3755 | per_cpu(ipv4_tcp_sk.sock, cpu) = sk; |
| 3756 | } |
| 3757 | if (register_pernet_subsys(&tcp_sk_ops)) |
		panic("Failed to create the TCP control socket.\n");
| 3759 | |
| 3760 | #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) |
| 3761 | bpf_iter_register(); |
| 3762 | #endif |
| 3763 | } |
| 3764 | |