// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink :	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>

#include <uapi/linux/pidfd.h>

#include "dev.h"

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in the
 * user namespace @user_ns when the socket was created, and that the
 * current process has it as well.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in all
 * user namespaces when the socket was created, and that the current
 * process has it as well.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap over
 * the network namespace the socket is a member of when the socket was
 * created, and that the current process has it as well.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
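
/*
 * Example (illustrative, not a caller in this file): a per-protocol
 * handler that wants to gate a privileged option on one of the helpers
 * above would typically write:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * which checks both the socket opener's capabilities at creation time
 * and the current task's capabilities over the socket's netns.
 */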

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX",		x "AF_INET",	  \
  x "AF_AX25",		x "AF_IPX",		x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE",		x "AF_ATMPVC",	  \
  x "AF_X25",		x "AF_INET6",		x "AF_ROSE",	  \
  x "AF_DECnet",	x "AF_NETBEUI",		x "AF_SECURITY",  \
  x "AF_KEY",		x "AF_NETLINK",		x "AF_PACKET",	  \
  x "AF_ASH",		x "AF_ECONET",		x "AF_ATMSVC",	  \
  x "AF_RDS",		x "AF_SNA",		x "AF_IRDA",	  \
  x "AF_PPPOX",		x "AF_WANPIPE",		x "AF_LLC",	  \
  x "27",		x "28",			x "AF_CAN",	  \
  x "AF_TIPC",		x "AF_BLUETOOTH",	x "IUCV",	  \
  x "AF_RXRPC",		x "AF_ISDN",		x "AF_PHONET",	  \
  x "AF_IEEE802154",	x "AF_CAIF",		x "AF_ALG",	  \
  x "AF_NFC",		x "AF_VSOCK",		x "AF_KCM",	  \
  x "AF_QIPCRTR",	x "AF_SMC",		x "AF_XDP",	  \
  x "AF_MCTP",							  \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
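
/*
 * Usage sketch (callers live outside this file, e.g. swap-over-NFS):
 * the two helpers above are paired around the window in which a socket
 * must keep making progress under memory pressure:
 *
 *	sk_set_memalloc(sk);	// socket may dip into emergency reserves
 *	...
 *	sk_clear_memalloc(sk);	// back to normal rmem accounting
 *
 * The caller is assumed to hold a reference on sk for the whole window.
 */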

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
				 tcp_v6_do_rcv,
				 tcp_v4_do_rcv,
				 sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

void sk_error_report(struct sock *sk)
{
	sk->sk_error_report(sk);

	switch (sk->sk_family) {
	case AF_INET:
		fallthrough;
	case AF_INET6:
		trace_inet_sk_error_report(sk);
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL(sk_error_report);

int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}
EXPORT_SYMBOL(sock_get_timeout);
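
/*
 * Worked example for the conversion above, assuming HZ == 1000: a
 * timeout of 2500 jiffies yields tv_sec = 2500 / 1000 = 2 and
 * tv_usec = ((2500 % 1000) * USEC_PER_SEC) / 1000 = 500000, i.e. 2.5
 * seconds. MAX_SCHEDULE_TIMEOUT is reported as {0, 0}, which userspace
 * reads back as "no timeout".
 */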

int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
			   sockptr_t optval, int optlen, bool old_timeval)
{
	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv->tv_sec = tv32.tv_sec;
		tv->tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv->tv_sec = old_tv.tv_sec;
		tv->tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(*tv))
			return -EINVAL;
		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
			return -EFAULT;
	}

	return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);

static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	long val;

	if (err)
		return err;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		WRITE_ONCE(*timeo_p, 0);
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	val = MAX_SCHEDULE_TIMEOUT;
	if ((tv.tv_sec || tv.tv_usec) &&
	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
						    USEC_PER_SEC / HZ);
	WRITE_ONCE(*timeo_p, val);
	return 0;
}
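
/*
 * Worked example for the reverse direction, assuming HZ == 100: a user
 * value of {tv_sec = 1, tv_usec = 500000} becomes
 * 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 100 + 50 = 150
 * jiffies. The usec part rounds up, so a nonzero timeout can never
 * truncate to zero jiffies.
 */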

static bool sk_set_prio_allowed(const struct sock *sk, int val)
{
	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
		sk_drops_inc(sk);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		sk_drops_inc(sk);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
			      enum skb_drop_reason *reason)
{
	enum skb_drop_reason drop_reason;
	int err;

	err = sk_filter_reason(sk, skb, &drop_reason);
	if (err)
		goto out;

	err = __sock_queue_rcv_skb(sk, skb);
	switch (err) {
	case -ENOMEM:
		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
		break;
	case -ENOBUFS:
		drop_reason = SKB_DROP_REASON_PROTO_MEM;
		break;
	default:
		drop_reason = SKB_NOT_DROPPED_YET;
		break;
	}
out:
	if (reason)
		*reason = drop_reason;
	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb_reason);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
	int rc = NET_RX_SUCCESS;
	int err;

	if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
		sk_drops_inc(sk);
		reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
		bh_unlock_sock(sk);
		if (err == -ENOMEM)
			reason = SKB_DROP_REASON_PFMEMALLOC;
		if (err == -ENOBUFS)
			reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		sk_drops_inc(sk);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	sk_skb_reason_drop(sk, skb, reason);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							   u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							    u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && READ_ONCE(dst->obsolete) &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && READ_ONCE(dst->obsolete) &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	/* Paired with all READ_ONCE() done locklessly. */
	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (lock_sk)
		lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	if (lock_sk)
		release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);
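
/*
 * In-kernel usage sketch (hypothetical caller): a tunnel driver that
 * wants its control socket pinned to one netdev could do
 *
 *	err = sock_bindtoindex(sock->sk, dev->ifindex, true);
 *
 * Passing lock_sk == false is only correct when the caller already
 * holds the socket lock.
 */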

static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	sockopt_lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, index);
	sockopt_release_sock(sk);
out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
				sockptr_t optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_sockptr(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

bool sk_mc_loop(const struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	/* IPV6_ADDRFORM can change sk->sk_family under us. */
	switch (READ_ONCE(sk->sk_family)) {
	case AF_INET:
		return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_test_bit(MC6_LOOP, sk);
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_lingertime, 0);
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
	WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
	else
		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sock_set_sndtimeo);
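
/*
 * Example, assuming HZ == 250: sock_set_sndtimeo(sk, 5) stores
 * 5 * 250 = 1250 jiffies in sk->sk_sndtimeo, while secs == 0 (or a
 * value too large for the jiffies range) selects MAX_SCHEDULE_TIMEOUT,
 * i.e. an infinite send timeout.
 */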

static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
	if (val) {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	}
}

void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	switch (optname) {
	case SO_TIMESTAMP_OLD:
		__sock_set_timestamps(sk, valbool, false, false);
		break;
	case SO_TIMESTAMP_NEW:
		__sock_set_timestamps(sk, valbool, true, false);
		break;
	case SO_TIMESTAMPNS_OLD:
		__sock_set_timestamps(sk, valbool, false, true);
		break;
	case SO_TIMESTAMPNS_NEW:
		__sock_set_timestamps(sk, valbool, true, true);
		break;
	}
}

static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
	struct net *net = sock_net(sk);
	struct net_device *dev = NULL;
	bool match = false;
	int *vclock_index;
	int i, num;

	if (sk->sk_bound_dev_if)
		dev = dev_get_by_index(net, sk->sk_bound_dev_if);

	if (!dev) {
		pr_err("%s: socket not bound to a device\n", __func__);
		return -EOPNOTSUPP;
	}

	num = ethtool_get_phc_vclocks(dev, &vclock_index);
	dev_put(dev);

	for (i = 0; i < num; i++) {
		if (*(vclock_index + i) == phc_index) {
			match = true;
			break;
		}
	}

	if (num > 0)
		kfree(vclock_index);

	if (!match)
		return -EINVAL;

	WRITE_ONCE(sk->sk_bind_phc, phc_index);

	return 0;
}

int sock_set_timestamping(struct sock *sk, int optname,
			  struct so_timestamping timestamping)
{
	int val = timestamping.flags;
	int ret;

	if (val & ~SOF_TIMESTAMPING_MASK)
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
	    !(val & SOF_TIMESTAMPING_OPT_ID))
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_OPT_ID &&
	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
		if (sk_is_tcp(sk)) {
			if ((1 << sk->sk_state) &
			    (TCPF_CLOSE | TCPF_LISTEN))
				return -EINVAL;
			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
			else
				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
		} else {
			atomic_set(&sk->sk_tskey, 0);
		}
	}

	if (val & SOF_TIMESTAMPING_OPT_STATS &&
	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_BIND_PHC) {
		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
		if (ret)
			return ret;
	}

	WRITE_ONCE(sk->sk_tsflags, val);
	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));

	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
		sock_enable_timestamp(sk,
				      SOCK_TIMESTAMPING_RX_SOFTWARE);
	else
		sock_disable_timestamp(sk,
				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
	return 0;
}
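
/*
 * Userspace sketch of reaching the logic above through SO_TIMESTAMPING
 * (the bind_phc value is a made-up vclock index):
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 1,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * SOF_TIMESTAMPING_BIND_PHC only succeeds when the socket is bound to
 * a device and bind_phc matches one of that device's PHC vclocks.
 */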

#if defined(CONFIG_CGROUP_BPF)
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
{
	struct bpf_sock_ops_kern sock_ops;

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = op;
	sock_ops.is_fullsock = 1;
	sock_ops.sk = sk;
	bpf_skops_init_skb(&sock_ops, skb, 0);
	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
}
#endif

void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead. Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
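
/*
 * Worked example of the doubling described above: an application that
 * sets SO_RCVBUF to 65536 ends up with sk->sk_rcvbuf == 131072, and a
 * later getsockopt(SO_RCVBUF) reports 131072. The extra headroom pays
 * for struct sk_buff and other per-packet overhead.
 */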

static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		WRITE_ONCE(sk->sk_mark, val);
		sk_dst_reset(sk);
	}
}

void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
	/* Round down bytes to multiple of pages */
	bytes = round_down(bytes, PAGE_SIZE);

	WARN_ON(bytes > sk->sk_reserved_mem);
	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
	sk_mem_reclaim(sk);
}

static int sock_reserve_memory(struct sock *sk, int bytes)
{
	long allocated;
	bool charged;
	int pages;

	if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
		return -EOPNOTSUPP;

	if (!bytes)
		return 0;

	pages = sk_mem_pages(bytes);

	/* pre-charge to memcg */
	charged = mem_cgroup_sk_charge(sk, pages,
				       GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!charged)
		return -ENOMEM;

	if (sk->sk_bypass_prot_mem)
		goto success;

	/* pre-charge to forward_alloc */
	sk_memory_allocated_add(sk, pages);
	allocated = sk_memory_allocated(sk);

	/* If the system goes into memory pressure with this
	 * precharge, give up and return error.
	 */
	if (allocated > sk_prot_mem_limits(sk, 1)) {
		sk_memory_allocated_sub(sk, pages);
		mem_cgroup_sk_uncharge(sk, pages);
		return -ENOMEM;
	}

success:
	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);

	WRITE_ONCE(sk->sk_reserved_mem,
		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));

	return 0;
}
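
/*
 * Accounting example for the helper above, assuming 4 KiB pages: a
 * SO_RESERVE_MEM request of 5000 bytes charges sk_mem_pages(5000) == 2
 * pages, so sk->sk_reserved_mem grows by 2 << PAGE_SHIFT == 8192
 * bytes, not by the literal 5000 that was requested.
 */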

#ifdef CONFIG_PAGE_POOL

/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
 * in 1 syscall. The limit exists to bound the amount of memory the kernel
 * allocates to copy these tokens, and to prevent looping over the frags for
 * too long.
 */
#define MAX_DONTNEED_TOKENS 128
#define MAX_DONTNEED_FRAGS 1024

static noinline_for_stack int
sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
{
	unsigned int num_tokens, i, j, k, netmem_num = 0;
	struct dmabuf_token *tokens;
	int ret = 0, num_frags = 0;
	netmem_ref netmems[16];

	if (!sk_is_tcp(sk))
		return -EBADF;

	if (optlen % sizeof(*tokens) ||
	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
		return -EINVAL;

	num_tokens = optlen / sizeof(*tokens);
	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
	if (!tokens)
		return -ENOMEM;

	if (copy_from_sockptr(tokens, optval, optlen)) {
		kvfree(tokens);
		return -EFAULT;
	}

	xa_lock_bh(&sk->sk_user_frags);
	for (i = 0; i < num_tokens; i++) {
		for (j = 0; j < tokens[i].token_count; j++) {
			if (++num_frags > MAX_DONTNEED_FRAGS)
				goto frag_limit_reached;

			netmem_ref netmem = (__force netmem_ref)__xa_erase(
				&sk->sk_user_frags, tokens[i].token_start + j);

			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
				continue;

			netmems[netmem_num++] = netmem;
			if (netmem_num == ARRAY_SIZE(netmems)) {
				xa_unlock_bh(&sk->sk_user_frags);
				for (k = 0; k < netmem_num; k++)
					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
				netmem_num = 0;
				xa_lock_bh(&sk->sk_user_frags);
			}
			ret++;
		}
	}

frag_limit_reached:
	xa_unlock_bh(&sk->sk_user_frags);
	for (k = 0; k < netmem_num; k++)
		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));

	kvfree(tokens);
	return ret;
}
#endif

void sockopt_lock_sock(struct sock *sk)
{
	/* When current->bpf_ctx is set, the setsockopt is called from
	 * a bpf prog. bpf has ensured the sk lock has been
	 * acquired before calling setsockopt().
	 */
	if (has_current_bpf_ctx())
		return;

	lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);

void sockopt_release_sock(struct sock *sk)
{
	if (has_current_bpf_ctx())
		return;

	release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);

bool sockopt_ns_capable(struct user_namespace *ns, int cap)
{
	return has_current_bpf_ctx() || ns_capable(ns, cap);
}
EXPORT_SYMBOL(sockopt_ns_capable);

bool sockopt_capable(int cap)
{
	return has_current_bpf_ctx() || capable(cap);
}
EXPORT_SYMBOL(sockopt_capable);

static int sockopt_validate_clockid(__kernel_clockid_t value)
{
	switch (value) {
	case CLOCK_REALTIME:
	case CLOCK_MONOTONIC:
	case CLOCK_TAI:
		return 0;
	}
	return -EINVAL;
}

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sk_setsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, unsigned int optlen)
{
	struct so_timestamping timestamping;
	struct socket *sock = sk->sk_socket;
	struct sock_txtime sk_txtime;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	valbool = val ? 1 : 0;

	/* handle options which do not require locking the socket. */
	switch (optname) {
	case SO_PRIORITY:
		if (sk_set_prio_allowed(sk, val)) {
			sock_set_priority(sk, val);
			return 0;
		}
		return -EPERM;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		return -ENOPROTOOPT;
#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		if (val < 0)
			return -EINVAL;
		WRITE_ONCE(sk->sk_ll_usec, val);
		return 0;
	case SO_PREFER_BUSY_POLL:
		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
			return -EPERM;
		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
		return 0;
	case SO_BUSY_POLL_BUDGET:
		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
		    !sockopt_capable(CAP_NET_ADMIN))
			return -EPERM;
		if (val < 0 || val > U16_MAX)
			return -EINVAL;
		WRITE_ONCE(sk->sk_busy_poll_budget, val);
		return 0;
#endif
	case SO_MAX_PACING_RATE:
	{
		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
		unsigned long pacing_rate;

		if (sizeof(ulval) != sizeof(val) &&
		    optlen >= sizeof(ulval) &&
		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
			return -EFAULT;
		}
		if (ulval != ~0UL)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		/* Pairs with READ_ONCE() from sk_getsockopt() */
		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
		if (ulval < pacing_rate)
			WRITE_ONCE(sk->sk_pacing_rate, ulval);
		return 0;
	}
	case SO_TXREHASH:
		if (!sk_is_tcp(sk))
			return -EOPNOTSUPP;
		if (val < -1 || val > 1)
			return -EINVAL;
		if ((u8)val == SOCK_TXREHASH_DEFAULT)
			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
		/* Paired with READ_ONCE() in tcp_rtx_synack()
		 * and sk_getsockopt().
		 */
		WRITE_ONCE(sk->sk_txrehash, (u8)val);
		return 0;
	case SO_PEEK_OFF:
	{
		int (*set_peek_off)(struct sock *sk, int val);

		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
		if (set_peek_off)
			ret = set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		return ret;
	}
#ifdef CONFIG_PAGE_POOL
	case SO_DEVMEM_DONTNEED:
		return sock_devmem_dontneed(sk, optval, optlen);
#endif
	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		return sock_set_timeout(&sk->sk_sndtimeo, optval,
					optlen, optname == SO_SNDTIMEO_OLD);
	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		return sock_set_timeout(&sk->sk_rcvtimeo, optval,
					optlen, optname == SO_RCVTIMEO_OLD);
	}

	sockopt_lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !sockopt_capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		if (valbool && !sk_is_inet(sk))
			ret = -EOPNOTSUPP;
		else
			sk->sk_reuseport = valbool;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this: BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		WRITE_ONCE(sk->sk_sndbuf,
			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!sockopt_capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this: BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
		break;

	case SO_RCVBUFFORCE:
		if (!sockopt_capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff) {
			sock_reset_flag(sk, SOCK_LINGER);
		} else {
			unsigned long t_sec = ling.l_linger;

			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
			else
				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		if (optlen == sizeof(timestamping)) {
			if (copy_from_sockptr(&timestamping, optval,
					      sizeof(timestamping))) {
				ret = -EFAULT;
				break;
			}
		} else {
			memset(&timestamping, 0, sizeof(timestamping));
			timestamping.flags = val;
		}
		ret = sock_set_timestamping(sk, optname, timestamping);
		break;

	case SO_RCVLOWAT:
	{
		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

		if (val < 0)
			val = INT_MAX;
		if (sock)
			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
		if (set_rcvlowat)
			ret = set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
		break;
	}
| 1463 | case SO_ATTACH_FILTER: { |
| 1464 | struct sock_fprog fprog; |
| 1465 | |
| 1466 | ret = copy_bpf_fprog_from_user(dst: &fprog, src: optval, len: optlen); |
| 1467 | if (!ret) |
| 1468 | ret = sk_attach_filter(fprog: &fprog, sk); |
| 1469 | break; |
| 1470 | } |
| 1471 | case SO_ATTACH_BPF: |
| 1472 | ret = -EINVAL; |
| 1473 | if (optlen == sizeof(u32)) { |
| 1474 | u32 ufd; |
| 1475 | |
| 1476 | ret = -EFAULT; |
| 1477 | if (copy_from_sockptr(dst: &ufd, src: optval, size: sizeof(ufd))) |
| 1478 | break; |
| 1479 | |
| 1480 | ret = sk_attach_bpf(ufd, sk); |
| 1481 | } |
| 1482 | break; |
| 1483 | |
| 1484 | case SO_ATTACH_REUSEPORT_CBPF: { |
| 1485 | struct sock_fprog fprog; |
| 1486 | |
| 1487 | ret = copy_bpf_fprog_from_user(dst: &fprog, src: optval, len: optlen); |
| 1488 | if (!ret) |
| 1489 | ret = sk_reuseport_attach_filter(fprog: &fprog, sk); |
| 1490 | break; |
| 1491 | } |
| 1492 | case SO_ATTACH_REUSEPORT_EBPF: |
| 1493 | ret = -EINVAL; |
| 1494 | if (optlen == sizeof(u32)) { |
| 1495 | u32 ufd; |
| 1496 | |
| 1497 | ret = -EFAULT; |
| 1498 | if (copy_from_sockptr(dst: &ufd, src: optval, size: sizeof(ufd))) |
| 1499 | break; |
| 1500 | |
| 1501 | ret = sk_reuseport_attach_bpf(ufd, sk); |
| 1502 | } |
| 1503 | break; |
| 1504 | |
| 1505 | case SO_DETACH_REUSEPORT_BPF: |
| 1506 | ret = reuseport_detach_prog(sk); |
| 1507 | break; |
| 1508 | |
| 1509 | case SO_DETACH_FILTER: |
| 1510 | ret = sk_detach_filter(sk); |
| 1511 | break; |
| 1512 | |
| 1513 | case SO_LOCK_FILTER: |
| 1514 | if (sock_flag(sk, flag: SOCK_FILTER_LOCKED) && !valbool) |
| 1515 | ret = -EPERM; |
| 1516 | else |
| 1517 | sock_valbool_flag(sk, bit: SOCK_FILTER_LOCKED, valbool); |
| 1518 | break; |
| 1519 | |
| 1520 | case SO_MARK: |
| 1521 | if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && |
| 1522 | !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { |
| 1523 | ret = -EPERM; |
| 1524 | break; |
| 1525 | } |
| 1526 | |
| 1527 | __sock_set_mark(sk, val); |
| 1528 | break; |
| 1529 | case SO_RCVMARK: |
| 1530 | sock_valbool_flag(sk, bit: SOCK_RCVMARK, valbool); |
| 1531 | break; |
| 1532 | |
| 1533 | case SO_RCVPRIORITY: |
| 1534 | sock_valbool_flag(sk, bit: SOCK_RCVPRIORITY, valbool); |
| 1535 | break; |
| 1536 | |
| 1537 | case SO_RXQ_OVFL: |
| 1538 | sock_valbool_flag(sk, bit: SOCK_RXQ_OVFL, valbool); |
| 1539 | break; |
| 1540 | |
| 1541 | case SO_WIFI_STATUS: |
| 1542 | sock_valbool_flag(sk, bit: SOCK_WIFI_STATUS, valbool); |
| 1543 | break; |
| 1544 | |
| 1545 | case SO_NOFCS: |
| 1546 | sock_valbool_flag(sk, bit: SOCK_NOFCS, valbool); |
| 1547 | break; |
| 1548 | |
| 1549 | case SO_SELECT_ERR_QUEUE: |
| 1550 | sock_valbool_flag(sk, bit: SOCK_SELECT_ERR_QUEUE, valbool); |
| 1551 | break; |
| 1552 | |
| 1553 | case SO_PASSCRED: |
| 1554 | if (sk_may_scm_recv(sk)) |
| 1555 | sk->sk_scm_credentials = valbool; |
| 1556 | else |
| 1557 | ret = -EOPNOTSUPP; |
| 1558 | break; |
| 1559 | |
| 1560 | case SO_PASSSEC: |
| 1561 | if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) |
| 1562 | sk->sk_scm_security = valbool; |
| 1563 | else |
| 1564 | ret = -EOPNOTSUPP; |
| 1565 | break; |
| 1566 | |
| 1567 | case SO_PASSPIDFD: |
| 1568 | if (sk_is_unix(sk)) |
| 1569 | sk->sk_scm_pidfd = valbool; |
| 1570 | else |
| 1571 | ret = -EOPNOTSUPP; |
| 1572 | break; |
| 1573 | |
| 1574 | case SO_PASSRIGHTS: |
| 1575 | if (sk_is_unix(sk)) |
| 1576 | sk->sk_scm_rights = valbool; |
| 1577 | else |
| 1578 | ret = -EOPNOTSUPP; |
| 1579 | break; |
| 1580 | |
| 1581 | case SO_INCOMING_CPU: |
| 1582 | reuseport_update_incoming_cpu(sk, val); |
| 1583 | break; |
| 1584 | |
| 1585 | case SO_CNX_ADVICE: |
| 1586 | if (val == 1) |
| 1587 | dst_negative_advice(sk); |
| 1588 | break; |
| 1589 | |
| 1590 | case SO_ZEROCOPY: |
| 1591 | if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { |
| 1592 | if (!(sk_is_tcp(sk) || |
| 1593 | (sk->sk_type == SOCK_DGRAM && |
| 1594 | sk->sk_protocol == IPPROTO_UDP))) |
| 1595 | ret = -EOPNOTSUPP; |
| 1596 | } else if (sk->sk_family != PF_RDS) { |
| 1597 | ret = -EOPNOTSUPP; |
| 1598 | } |
| 1599 | if (!ret) { |
| 1600 | if (val < 0 || val > 1) |
| 1601 | ret = -EINVAL; |
| 1602 | else |
| 1603 | sock_valbool_flag(sk, bit: SOCK_ZEROCOPY, valbool); |
| 1604 | } |
| 1605 | break; |
| 1606 | |
| 1607 | case SO_TXTIME: |
| 1608 | if (optlen != sizeof(struct sock_txtime)) { |
| 1609 | ret = -EINVAL; |
| 1610 | break; |
| 1611 | } else if (copy_from_sockptr(dst: &sk_txtime, src: optval, |
| 1612 | size: sizeof(struct sock_txtime))) { |
| 1613 | ret = -EFAULT; |
| 1614 | break; |
| 1615 | } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { |
| 1616 | ret = -EINVAL; |
| 1617 | break; |
| 1618 | } |
| 1619 | /* CLOCK_MONOTONIC is only used by sch_fq, and this packet |
| 1620 | * scheduler has enough safe guards. |
| 1621 | */ |
| 1622 | if (sk_txtime.clockid != CLOCK_MONOTONIC && |
| 1623 | !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { |
| 1624 | ret = -EPERM; |
| 1625 | break; |
| 1626 | } |
| 1627 | |
| 1628 | ret = sockopt_validate_clockid(value: sk_txtime.clockid); |
| 1629 | if (ret) |
| 1630 | break; |
| 1631 | |
| 1632 | sock_valbool_flag(sk, bit: SOCK_TXTIME, valbool: true); |
| 1633 | sk->sk_clockid = sk_txtime.clockid; |
| 1634 | sk->sk_txtime_deadline_mode = |
| 1635 | !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); |
| 1636 | sk->sk_txtime_report_errors = |
| 1637 | !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); |
| 1638 | break; |
| 1639 | |
| 1640 | case SO_BINDTOIFINDEX: |
| 1641 | ret = sock_bindtoindex_locked(sk, ifindex: val); |
| 1642 | break; |
| 1643 | |
| 1644 | case SO_BUF_LOCK: |
| 1645 | if (val & ~SOCK_BUF_LOCK_MASK) { |
| 1646 | ret = -EINVAL; |
| 1647 | break; |
| 1648 | } |
| 1649 | sk->sk_userlocks = val | (sk->sk_userlocks & |
| 1650 | ~SOCK_BUF_LOCK_MASK); |
| 1651 | break; |
| 1652 | |
| 1653 | case SO_RESERVE_MEM: |
| 1654 | { |
| 1655 | int delta; |
| 1656 | |
| 1657 | if (val < 0) { |
| 1658 | ret = -EINVAL; |
| 1659 | break; |
| 1660 | } |
| 1661 | |
| 1662 | delta = val - sk->sk_reserved_mem; |
| 1663 | if (delta < 0) |
			sock_release_reserved_memory(sk, -delta);
		else
			ret = sock_reserve_memory(sk, delta);
| 1667 | break; |
| 1668 | } |
| 1669 | |
| 1670 | default: |
| 1671 | ret = -ENOPROTOOPT; |
| 1672 | break; |
| 1673 | } |
| 1674 | sockopt_release_sock(sk); |
| 1675 | return ret; |
| 1676 | } |
| 1677 | |
| 1678 | int sock_setsockopt(struct socket *sock, int level, int optname, |
| 1679 | sockptr_t optval, unsigned int optlen) |
| 1680 | { |
	return sk_setsockopt(sock->sk, level, optname,
| 1682 | optval, optlen); |
| 1683 | } |
| 1684 | EXPORT_SYMBOL(sock_setsockopt); |
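
/*
 * Illustrative in-kernel sketch (an assumption, not a caller in this
 * file): kernel code wraps its value in KERNEL_SOCKPTR() so the
 * sockptr_t is tagged as a kernel pointer:
 *
 *	int one = 1;
 *
 *	sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 *			KERNEL_SOCKPTR(&one), sizeof(one));
 */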
| 1685 | |
| 1686 | static const struct cred *sk_get_peer_cred(struct sock *sk) |
| 1687 | { |
| 1688 | const struct cred *cred; |
| 1689 | |
	spin_lock(&sk->sk_peer_lock);
	cred = get_cred(sk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
| 1693 | |
| 1694 | return cred; |
| 1695 | } |
| 1696 | |
| 1697 | static void cred_to_ucred(struct pid *pid, const struct cred *cred, |
| 1698 | struct ucred *ucred) |
| 1699 | { |
| 1700 | ucred->pid = pid_vnr(pid); |
| 1701 | ucred->uid = ucred->gid = -1; |
| 1702 | if (cred) { |
| 1703 | struct user_namespace *current_ns = current_user_ns(); |
| 1704 | |
		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
| 1707 | } |
| 1708 | } |
| 1709 | |
| 1710 | static int groups_to_user(sockptr_t dst, const struct group_info *src) |
| 1711 | { |
| 1712 | struct user_namespace *user_ns = current_user_ns(); |
| 1713 | int i; |
| 1714 | |
| 1715 | for (i = 0; i < src->ngroups; i++) { |
		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);

		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
| 1719 | return -EFAULT; |
| 1720 | } |
| 1721 | |
| 1722 | return 0; |
| 1723 | } |
| 1724 | |
| 1725 | int sk_getsockopt(struct sock *sk, int level, int optname, |
| 1726 | sockptr_t optval, sockptr_t optlen) |
| 1727 | { |
| 1728 | struct socket *sock = sk->sk_socket; |
| 1729 | |
| 1730 | union { |
| 1731 | int val; |
| 1732 | u64 val64; |
| 1733 | unsigned long ulval; |
| 1734 | struct linger ling; |
| 1735 | struct old_timeval32 tm32; |
| 1736 | struct __kernel_old_timeval tm; |
| 1737 | struct __kernel_sock_timeval stm; |
| 1738 | struct sock_txtime txtime; |
| 1739 | struct so_timestamping timestamping; |
| 1740 | } v; |
| 1741 | |
| 1742 | int lv = sizeof(int); |
| 1743 | int len; |
| 1744 | |
	if (copy_from_sockptr(&len, optlen, sizeof(int)))
| 1746 | return -EFAULT; |
| 1747 | if (len < 0) |
| 1748 | return -EINVAL; |
| 1749 | |
| 1750 | memset(&v, 0, sizeof(v)); |
| 1751 | |
| 1752 | switch (optname) { |
| 1753 | case SO_DEBUG: |
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
| 1763 | break; |
| 1764 | |
| 1765 | case SO_SNDBUF: |
| 1766 | v.val = READ_ONCE(sk->sk_sndbuf); |
| 1767 | break; |
| 1768 | |
| 1769 | case SO_RCVBUF: |
| 1770 | v.val = READ_ONCE(sk->sk_rcvbuf); |
| 1771 | break; |
| 1772 | |
| 1773 | case SO_REUSEADDR: |
| 1774 | v.val = sk->sk_reuse; |
| 1775 | break; |
| 1776 | |
| 1777 | case SO_REUSEPORT: |
| 1778 | v.val = sk->sk_reuseport; |
| 1779 | break; |
| 1780 | |
| 1781 | case SO_KEEPALIVE: |
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
| 1783 | break; |
| 1784 | |
| 1785 | case SO_TYPE: |
| 1786 | v.val = sk->sk_type; |
| 1787 | break; |
| 1788 | |
| 1789 | case SO_PROTOCOL: |
| 1790 | v.val = sk->sk_protocol; |
| 1791 | break; |
| 1792 | |
| 1793 | case SO_DOMAIN: |
| 1794 | v.val = sk->sk_family; |
| 1795 | break; |
| 1796 | |
| 1797 | case SO_ERROR: |
| 1798 | v.val = -sock_error(sk); |
| 1799 | if (v.val == 0) |
| 1800 | v.val = xchg(&sk->sk_err_soft, 0); |
| 1801 | break; |
| 1802 | |
| 1803 | case SO_OOBINLINE: |
		v.val = sock_flag(sk, SOCK_URGINLINE);
| 1805 | break; |
| 1806 | |
| 1807 | case SO_NO_CHECK: |
| 1808 | v.val = sk->sk_no_check_tx; |
| 1809 | break; |
| 1810 | |
| 1811 | case SO_PRIORITY: |
| 1812 | v.val = READ_ONCE(sk->sk_priority); |
| 1813 | break; |
| 1814 | |
| 1815 | case SO_LINGER: |
| 1816 | lv = sizeof(v.ling); |
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
| 1818 | v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; |
| 1819 | break; |
| 1820 | |
| 1821 | case SO_BSDCOMPAT: |
| 1822 | break; |
| 1823 | |
| 1824 | case SO_TIMESTAMP_OLD: |
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_TSTAMP_NEW) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
| 1828 | break; |
| 1829 | |
| 1830 | case SO_TIMESTAMPNS_OLD: |
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
| 1840 | break; |
| 1841 | |
| 1842 | case SO_TIMESTAMPING_OLD: |
| 1843 | case SO_TIMESTAMPING_NEW: |
| 1844 | lv = sizeof(v.timestamping); |
		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
		 * returning the flags when they were set through the same option.
		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
		 */
		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
| 1850 | v.timestamping.flags = READ_ONCE(sk->sk_tsflags); |
| 1851 | v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); |
| 1852 | } |
| 1853 | break; |
| 1854 | |
| 1855 | case SO_RCVTIMEO_OLD: |
| 1856 | case SO_RCVTIMEO_NEW: |
| 1857 | lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, |
| 1858 | SO_RCVTIMEO_OLD == optname); |
| 1859 | break; |
| 1860 | |
| 1861 | case SO_SNDTIMEO_OLD: |
| 1862 | case SO_SNDTIMEO_NEW: |
| 1863 | lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, |
| 1864 | SO_SNDTIMEO_OLD == optname); |
| 1865 | break; |
| 1866 | |
| 1867 | case SO_RCVLOWAT: |
| 1868 | v.val = READ_ONCE(sk->sk_rcvlowat); |
| 1869 | break; |
| 1870 | |
| 1871 | case SO_SNDLOWAT: |
| 1872 | v.val = 1; |
| 1873 | break; |
| 1874 | |
| 1875 | case SO_PASSCRED: |
| 1876 | if (!sk_may_scm_recv(sk)) |
| 1877 | return -EOPNOTSUPP; |
| 1878 | |
| 1879 | v.val = sk->sk_scm_credentials; |
| 1880 | break; |
| 1881 | |
| 1882 | case SO_PASSPIDFD: |
| 1883 | if (!sk_is_unix(sk)) |
| 1884 | return -EOPNOTSUPP; |
| 1885 | |
| 1886 | v.val = sk->sk_scm_pidfd; |
| 1887 | break; |
| 1888 | |
| 1889 | case SO_PASSRIGHTS: |
| 1890 | if (!sk_is_unix(sk)) |
| 1891 | return -EOPNOTSUPP; |
| 1892 | |
| 1893 | v.val = sk->sk_scm_rights; |
| 1894 | break; |
| 1895 | |
| 1896 | case SO_PEERCRED: |
| 1897 | { |
| 1898 | struct ucred peercred; |
| 1899 | if (len > sizeof(peercred)) |
| 1900 | len = sizeof(peercred); |
| 1901 | |
		spin_lock(&sk->sk_peer_lock);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		spin_unlock(&sk->sk_peer_lock);

		if (copy_to_sockptr(optval, &peercred, len))
| 1907 | return -EFAULT; |
| 1908 | goto lenout; |
| 1909 | } |
| 1910 | |
| 1911 | case SO_PEERPIDFD: |
| 1912 | { |
| 1913 | struct pid *peer_pid; |
| 1914 | struct file *pidfd_file = NULL; |
| 1915 | unsigned int flags = 0; |
| 1916 | int pidfd; |
| 1917 | |
| 1918 | if (len > sizeof(pidfd)) |
| 1919 | len = sizeof(pidfd); |
| 1920 | |
		spin_lock(&sk->sk_peer_lock);
		peer_pid = get_pid(sk->sk_peer_pid);
		spin_unlock(&sk->sk_peer_lock);
| 1924 | |
| 1925 | if (!peer_pid) |
| 1926 | return -ENODATA; |
| 1927 | |
		/* The use of PIDFD_STALE requires stashing of struct pid
		 * on pidfs with pidfs_register_pid(), and only AF_UNIX
		 * is prepared for this.
		 */
| 1932 | if (sk->sk_family == AF_UNIX) |
| 1933 | flags = PIDFD_STALE; |
| 1934 | |
		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
		put_pid(peer_pid);
| 1937 | if (pidfd < 0) |
| 1938 | return pidfd; |
| 1939 | |
		if (copy_to_sockptr(optval, &pidfd, len) ||
		    copy_to_sockptr(optlen, &len, sizeof(int))) {
			put_unused_fd(pidfd);
			fput(pidfd_file);

			return -EFAULT;
		}

		fd_install(pidfd, pidfd_file);
| 1949 | return 0; |
| 1950 | } |
| 1951 | |
| 1952 | case SO_PEERGROUPS: |
| 1953 | { |
| 1954 | const struct cred *cred; |
| 1955 | int ret, n; |
| 1956 | |
| 1957 | cred = sk_get_peer_cred(sk); |
| 1958 | if (!cred) |
| 1959 | return -ENODATA; |
| 1960 | |
| 1961 | n = cred->group_info->ngroups; |
| 1962 | if (len < n * sizeof(gid_t)) { |
| 1963 | len = n * sizeof(gid_t); |
| 1964 | put_cred(cred); |
			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
| 1966 | } |
| 1967 | len = n * sizeof(gid_t); |
| 1968 | |
		ret = groups_to_user(optval, cred->group_info);
| 1970 | put_cred(cred); |
| 1971 | if (ret) |
| 1972 | return ret; |
| 1973 | goto lenout; |
| 1974 | } |
| 1975 | |
| 1976 | case SO_PEERNAME: |
| 1977 | { |
| 1978 | struct sockaddr_storage address; |
| 1979 | |
| 1980 | lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); |
| 1981 | if (lv < 0) |
| 1982 | return -ENOTCONN; |
| 1983 | if (lv < len) |
| 1984 | return -EINVAL; |
		if (copy_to_sockptr(optval, &address, len))
| 1986 | return -EFAULT; |
| 1987 | goto lenout; |
| 1988 | } |
| 1989 | |
| 1990 | /* Dubious BSD thing... Probably nobody even uses it, but |
| 1991 | * the UNIX standard wants it for whatever reason... -DaveM |
| 1992 | */ |
| 1993 | case SO_ACCEPTCONN: |
| 1994 | v.val = sk->sk_state == TCP_LISTEN; |
| 1995 | break; |
| 1996 | |
| 1997 | case SO_PASSSEC: |
| 1998 | if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) |
| 1999 | return -EOPNOTSUPP; |
| 2000 | |
| 2001 | v.val = sk->sk_scm_security; |
| 2002 | break; |
| 2003 | |
| 2004 | case SO_PEERSEC: |
| 2005 | return security_socket_getpeersec_stream(sock, |
| 2006 | optval, optlen, len); |
| 2007 | |
| 2008 | case SO_MARK: |
| 2009 | v.val = READ_ONCE(sk->sk_mark); |
| 2010 | break; |
| 2011 | |
| 2012 | case SO_RCVMARK: |
		v.val = sock_flag(sk, SOCK_RCVMARK);
		break;

	case SO_RCVPRIORITY:
		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
| 2026 | break; |
| 2027 | |
| 2028 | case SO_PEEK_OFF: |
| 2029 | if (!READ_ONCE(sock->ops)->set_peek_off) |
| 2030 | return -EOPNOTSUPP; |
| 2031 | |
| 2032 | v.val = READ_ONCE(sk->sk_peek_off); |
| 2033 | break; |
| 2034 | case SO_NOFCS: |
		v.val = sock_flag(sk, SOCK_NOFCS);
| 2036 | break; |
| 2037 | |
| 2038 | case SO_BINDTODEVICE: |
| 2039 | return sock_getbindtodevice(sk, optval, optlen, len); |
| 2040 | |
| 2041 | case SO_GET_FILTER: |
| 2042 | len = sk_get_filter(sk, optval, len); |
| 2043 | if (len < 0) |
| 2044 | return len; |
| 2045 | |
| 2046 | goto lenout; |
| 2047 | |
| 2048 | case SO_LOCK_FILTER: |
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
| 2050 | break; |
| 2051 | |
| 2052 | case SO_BPF_EXTENSIONS: |
| 2053 | v.val = bpf_tell_extensions(); |
| 2054 | break; |
| 2055 | |
| 2056 | case SO_SELECT_ERR_QUEUE: |
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
| 2058 | break; |
| 2059 | |
| 2060 | #ifdef CONFIG_NET_RX_BUSY_POLL |
| 2061 | case SO_BUSY_POLL: |
| 2062 | v.val = READ_ONCE(sk->sk_ll_usec); |
| 2063 | break; |
| 2064 | case SO_PREFER_BUSY_POLL: |
| 2065 | v.val = READ_ONCE(sk->sk_prefer_busy_poll); |
| 2066 | break; |
| 2067 | #endif |
| 2068 | |
| 2069 | case SO_MAX_PACING_RATE: |
		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
| 2071 | if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { |
| 2072 | lv = sizeof(v.ulval); |
| 2073 | v.ulval = READ_ONCE(sk->sk_max_pacing_rate); |
| 2074 | } else { |
| 2075 | /* 32bit version */ |
| 2076 | v.val = min_t(unsigned long, ~0U, |
| 2077 | READ_ONCE(sk->sk_max_pacing_rate)); |
| 2078 | } |
| 2079 | break; |
| 2080 | |
| 2081 | case SO_INCOMING_CPU: |
| 2082 | v.val = READ_ONCE(sk->sk_incoming_cpu); |
| 2083 | break; |
| 2084 | |
| 2085 | case SO_MEMINFO: |
| 2086 | { |
| 2087 | u32 meminfo[SK_MEMINFO_VARS]; |
| 2088 | |
| 2089 | sk_get_meminfo(sk, meminfo); |
| 2090 | |
| 2091 | len = min_t(unsigned int, len, sizeof(meminfo)); |
		if (copy_to_sockptr(optval, &meminfo, len))
| 2093 | return -EFAULT; |
| 2094 | |
| 2095 | goto lenout; |
| 2096 | } |
| 2097 | |
| 2098 | #ifdef CONFIG_NET_RX_BUSY_POLL |
| 2099 | case SO_INCOMING_NAPI_ID: |
| 2100 | v.val = READ_ONCE(sk->sk_napi_id); |
| 2101 | |
| 2102 | /* aggregate non-NAPI IDs down to 0 */ |
		if (!napi_id_valid(v.val))
| 2104 | v.val = 0; |
| 2105 | |
| 2106 | break; |
| 2107 | #endif |
| 2108 | |
| 2109 | case SO_COOKIE: |
| 2110 | lv = sizeof(u64); |
| 2111 | if (len < lv) |
| 2112 | return -EINVAL; |
| 2113 | v.val64 = sock_gen_cookie(sk); |
| 2114 | break; |
| 2115 | |
| 2116 | case SO_ZEROCOPY: |
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
| 2118 | break; |
| 2119 | |
| 2120 | case SO_TXTIME: |
| 2121 | lv = sizeof(v.txtime); |
| 2122 | v.txtime.clockid = sk->sk_clockid; |
| 2123 | v.txtime.flags |= sk->sk_txtime_deadline_mode ? |
| 2124 | SOF_TXTIME_DEADLINE_MODE : 0; |
| 2125 | v.txtime.flags |= sk->sk_txtime_report_errors ? |
| 2126 | SOF_TXTIME_REPORT_ERRORS : 0; |
| 2127 | break; |
| 2128 | |
| 2129 | case SO_BINDTOIFINDEX: |
| 2130 | v.val = READ_ONCE(sk->sk_bound_dev_if); |
| 2131 | break; |
| 2132 | |
| 2133 | case SO_NETNS_COOKIE: |
| 2134 | lv = sizeof(u64); |
| 2135 | if (len != lv) |
| 2136 | return -EINVAL; |
| 2137 | v.val64 = sock_net(sk)->net_cookie; |
| 2138 | break; |
| 2139 | |
| 2140 | case SO_BUF_LOCK: |
| 2141 | v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; |
| 2142 | break; |
| 2143 | |
| 2144 | case SO_RESERVE_MEM: |
| 2145 | v.val = READ_ONCE(sk->sk_reserved_mem); |
| 2146 | break; |
| 2147 | |
| 2148 | case SO_TXREHASH: |
| 2149 | if (!sk_is_tcp(sk)) |
| 2150 | return -EOPNOTSUPP; |
| 2151 | |
| 2152 | /* Paired with WRITE_ONCE() in sk_setsockopt() */ |
| 2153 | v.val = READ_ONCE(sk->sk_txrehash); |
| 2154 | break; |
| 2155 | |
| 2156 | default: |
| 2157 | /* We implement the SO_SNDLOWAT etc to not be settable |
| 2158 | * (1003.1g 7). |
| 2159 | */ |
| 2160 | return -ENOPROTOOPT; |
| 2161 | } |
| 2162 | |
| 2163 | if (len > lv) |
| 2164 | len = lv; |
	if (copy_to_sockptr(optval, &v, len))
		return -EFAULT;
lenout:
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
| 2169 | return -EFAULT; |
| 2170 | return 0; |
| 2171 | } |
| 2172 | |
| 2173 | /* |
| 2174 | * Initialize an sk_lock. |
| 2175 | * |
| 2176 | * (We also register the sk_lock with the lock validator.) |
| 2177 | */ |
| 2178 | static inline void sock_lock_init(struct sock *sk) |
| 2179 | { |
| 2180 | sk_owner_clear(sk); |
| 2181 | |
| 2182 | if (sk->sk_kern_sock) |
| 2183 | sock_lock_init_class_and_name( |
| 2184 | sk, |
| 2185 | af_family_kern_slock_key_strings[sk->sk_family], |
| 2186 | af_family_kern_slock_keys + sk->sk_family, |
| 2187 | af_family_kern_key_strings[sk->sk_family], |
| 2188 | af_family_kern_keys + sk->sk_family); |
| 2189 | else |
| 2190 | sock_lock_init_class_and_name( |
| 2191 | sk, |
| 2192 | af_family_slock_key_strings[sk->sk_family], |
| 2193 | af_family_slock_keys + sk->sk_family, |
| 2194 | af_family_key_strings[sk->sk_family], |
| 2195 | af_family_keys + sk->sk_family); |
| 2196 | } |
| 2197 | |
| 2198 | /* |
| 2199 | * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, |
| 2200 | * even temporarily, because of RCU lookups. sk_node should also be left as is. |
| 2201 | * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end |
| 2202 | */ |
| 2203 | static void sock_copy(struct sock *nsk, const struct sock *osk) |
| 2204 | { |
| 2205 | const struct proto *prot = READ_ONCE(osk->sk_prot); |
| 2206 | #ifdef CONFIG_SECURITY_NETWORK |
| 2207 | void *sptr = nsk->sk_security; |
| 2208 | #endif |
| 2209 | |
| 2210 | /* If we move sk_tx_queue_mapping out of the private section, |
| 2211 | * we must check if sk_tx_queue_clear() is called after |
| 2212 | * sock_copy() in sk_clone_lock(). |
| 2213 | */ |
| 2214 | BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < |
| 2215 | offsetof(struct sock, sk_dontcopy_begin) || |
| 2216 | offsetof(struct sock, sk_tx_queue_mapping) >= |
| 2217 | offsetof(struct sock, sk_dontcopy_end)); |
| 2218 | |
| 2219 | memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); |
| 2220 | |
| 2221 | unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, |
| 2222 | prot->obj_size - offsetof(struct sock, sk_dontcopy_end), |
| 2223 | /* alloc is larger than struct, see sk_prot_alloc() */); |
| 2224 | |
| 2225 | #ifdef CONFIG_SECURITY_NETWORK |
| 2226 | nsk->sk_security = sptr; |
	security_sk_clone(osk, nsk);
| 2228 | #endif |
| 2229 | } |
| 2230 | |
| 2231 | static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, |
| 2232 | int family) |
| 2233 | { |
| 2234 | struct sock *sk; |
| 2235 | struct kmem_cache *slab; |
| 2236 | |
| 2237 | slab = prot->slab; |
| 2238 | if (slab != NULL) { |
| 2239 | sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); |
| 2240 | if (!sk) |
| 2241 | return sk; |
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
| 2244 | } else |
| 2245 | sk = kmalloc(prot->obj_size, priority); |
| 2246 | |
| 2247 | if (sk != NULL) { |
| 2248 | if (security_sk_alloc(sk, family, priority)) |
| 2249 | goto out_free; |
| 2250 | |
		if (!try_module_get(prot->owner))
| 2252 | goto out_free_sec; |
| 2253 | } |
| 2254 | |
| 2255 | return sk; |
| 2256 | |
| 2257 | out_free_sec: |
| 2258 | security_sk_free(sk); |
| 2259 | out_free: |
| 2260 | if (slab != NULL) |
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
| 2264 | return NULL; |
| 2265 | } |
| 2266 | |
| 2267 | static void sk_prot_free(struct proto *prot, struct sock *sk) |
| 2268 | { |
| 2269 | struct kmem_cache *slab; |
| 2270 | struct module *owner; |
| 2271 | |
| 2272 | owner = prot->owner; |
| 2273 | slab = prot->slab; |
| 2274 | |
	cgroup_sk_free(&sk->sk_cgrp_data);
| 2276 | mem_cgroup_sk_free(sk); |
| 2277 | security_sk_free(sk); |
| 2278 | |
| 2279 | sk_owner_put(sk); |
| 2280 | |
| 2281 | if (slab != NULL) |
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
| 2286 | } |
| 2287 | |
| 2288 | /** |
| 2289 | * sk_alloc - All socket objects are allocated here |
| 2290 | * @net: the applicable net namespace |
| 2291 | * @family: protocol family |
| 2292 | * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) |
| 2293 | * @prot: struct proto associated with this new sock instance |
| 2294 | * @kern: is this to be a kernel socket? |
| 2295 | */ |
| 2296 | struct sock *sk_alloc(struct net *net, int family, gfp_t priority, |
| 2297 | struct proto *prot, int kern) |
| 2298 | { |
| 2299 | struct sock *sk; |
| 2300 | |
	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
| 2302 | if (sk) { |
| 2303 | sk->sk_family = family; |
| 2304 | /* |
| 2305 | * See comment in struct sock definition to understand |
| 2306 | * why we need sk_prot_creator -acme |
| 2307 | */ |
| 2308 | sk->sk_prot = sk->sk_prot_creator = prot; |
| 2309 | |
| 2310 | if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) |
| 2311 | sk->sk_bypass_prot_mem = 1; |
| 2312 | |
| 2313 | sk->sk_kern_sock = kern; |
| 2314 | sock_lock_init(sk); |
| 2315 | |
| 2316 | sk->sk_net_refcnt = kern ? 0 : 1; |
| 2317 | if (likely(sk->sk_net_refcnt)) { |
			get_net_track(net, &sk->ns_tracker, priority);
			sock_inuse_add(net, 1);
		} else {
			net_passive_inc(net);
			__netns_tracker_alloc(net, &sk->ns_tracker,
					      false, priority);
| 2324 | } |
| 2325 | |
| 2326 | sock_net_set(sk, net); |
		refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
| 2328 | |
| 2329 | mem_cgroup_sk_alloc(sk); |
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
| 2333 | sk_tx_queue_clear(sk); |
| 2334 | } |
| 2335 | |
| 2336 | return sk; |
| 2337 | } |
| 2338 | EXPORT_SYMBOL(sk_alloc); |
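
/*
 * Illustrative sketch of the usual create path in an address family
 * (PF_EXAMPLE and example_proto are placeholders, not symbols from
 * this file):
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */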
| 2339 | |
| 2340 | /* Sockets having SOCK_RCU_FREE will call this function after one RCU |
| 2341 | * grace period. This is the case for UDP sockets and TCP listeners. |
| 2342 | */ |
| 2343 | static void __sk_destruct(struct rcu_head *head) |
| 2344 | { |
| 2345 | struct sock *sk = container_of(head, struct sock, sk_rcu); |
| 2346 | struct net *net = sock_net(sk); |
| 2347 | struct sk_filter *filter; |
| 2348 | |
| 2349 | if (sk->sk_destruct) |
| 2350 | sk->sk_destruct(sk); |
| 2351 | |
| 2352 | filter = rcu_dereference_check(sk->sk_filter, |
| 2353 | refcount_read(&sk->sk_wmem_alloc) == 0); |
| 2354 | if (filter) { |
		sk_filter_uncharge(sk, filter);
| 2356 | RCU_INIT_POINTER(sk->sk_filter, NULL); |
| 2357 | } |
| 2358 | |
| 2359 | sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); |
| 2360 | |
| 2361 | #ifdef CONFIG_BPF_SYSCALL |
| 2362 | bpf_sk_storage_free(sk); |
| 2363 | #endif |
| 2364 | |
	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
| 2367 | __func__, atomic_read(&sk->sk_omem_alloc)); |
| 2368 | |
| 2369 | if (sk->sk_frag.page) { |
		put_page(sk->sk_frag.page);
| 2371 | sk->sk_frag.page = NULL; |
| 2372 | } |
| 2373 | |
| 2374 | /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ |
	put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
| 2377 | |
| 2378 | if (likely(sk->sk_net_refcnt)) { |
		put_net_track(net, &sk->ns_tracker);
	} else {
		__netns_tracker_free(net, &sk->ns_tracker, false);
		net_passive_dec(net);
	}
	sk_prot_free(sk->sk_prot_creator, sk);
| 2385 | } |
| 2386 | |
| 2387 | void sk_net_refcnt_upgrade(struct sock *sk) |
| 2388 | { |
| 2389 | struct net *net = sock_net(sk); |
| 2390 | |
| 2391 | WARN_ON_ONCE(sk->sk_net_refcnt); |
	__netns_tracker_free(net, &sk->ns_tracker, false);
	net_passive_dec(net);
	sk->sk_net_refcnt = 1;
	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
	sock_inuse_add(net, 1);
| 2397 | } |
| 2398 | EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); |
| 2399 | |
| 2400 | void sk_destruct(struct sock *sk) |
| 2401 | { |
	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
| 2403 | |
| 2404 | if (rcu_access_pointer(sk->sk_reuseport_cb)) { |
| 2405 | reuseport_detach_sock(sk); |
| 2406 | use_call_rcu = true; |
| 2407 | } |
| 2408 | |
| 2409 | if (use_call_rcu) |
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
| 2413 | } |
| 2414 | |
| 2415 | static void __sk_free(struct sock *sk) |
| 2416 | { |
| 2417 | if (likely(sk->sk_net_refcnt)) |
		sock_inuse_add(sock_net(sk), -1);
| 2419 | |
| 2420 | if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) |
| 2421 | sock_diag_broadcast_destroy(sk); |
| 2422 | else |
| 2423 | sk_destruct(sk); |
| 2424 | } |
| 2425 | |
| 2426 | void sk_free(struct sock *sk) |
| 2427 | { |
| 2428 | /* |
| 2429 | * We subtract one from sk_wmem_alloc and can know if |
| 2430 | * some packets are still in some tx queue. |
| 2431 | * If not null, sock_wfree() will call __sk_free(sk) later |
| 2432 | */ |
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
| 2434 | __sk_free(sk); |
| 2435 | } |
| 2436 | EXPORT_SYMBOL(sk_free); |
| 2437 | |
| 2438 | static void sk_init_common(struct sock *sk) |
| 2439 | { |
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
| 2443 | |
| 2444 | rwlock_init(&sk->sk_callback_lock); |
| 2445 | lockdep_set_class_and_name(&sk->sk_receive_queue.lock, |
| 2446 | af_rlock_keys + sk->sk_family, |
| 2447 | af_family_rlock_key_strings[sk->sk_family]); |
| 2448 | lockdep_set_class_and_name(&sk->sk_write_queue.lock, |
| 2449 | af_wlock_keys + sk->sk_family, |
| 2450 | af_family_wlock_key_strings[sk->sk_family]); |
| 2451 | lockdep_set_class_and_name(&sk->sk_error_queue.lock, |
| 2452 | af_elock_keys + sk->sk_family, |
| 2453 | af_family_elock_key_strings[sk->sk_family]); |
| 2454 | if (sk->sk_kern_sock) |
| 2455 | lockdep_set_class_and_name(&sk->sk_callback_lock, |
| 2456 | af_kern_callback_keys + sk->sk_family, |
| 2457 | af_family_kern_clock_key_strings[sk->sk_family]); |
| 2458 | else |
| 2459 | lockdep_set_class_and_name(&sk->sk_callback_lock, |
| 2460 | af_callback_keys + sk->sk_family, |
| 2461 | af_family_clock_key_strings[sk->sk_family]); |
| 2462 | } |
| 2463 | |
| 2464 | /** |
| 2465 | * sk_clone - clone a socket |
| 2466 | * @sk: the socket to clone |
| 2467 | * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) |
| 2468 | * @lock: if true, lock the cloned sk |
| 2469 | * |
| 2470 | * If @lock is true, the clone is locked by bh_lock_sock(), and |
| 2471 | * caller must unlock socket even in error path by bh_unlock_sock(). |
| 2472 | */ |
| 2473 | struct sock *sk_clone(const struct sock *sk, const gfp_t priority, |
| 2474 | bool lock) |
| 2475 | { |
| 2476 | struct proto *prot = READ_ONCE(sk->sk_prot); |
| 2477 | struct sk_filter *filter; |
| 2478 | bool is_charged = true; |
| 2479 | struct sock *newsk; |
| 2480 | |
	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);
| 2486 | |
| 2487 | newsk->sk_prot_creator = prot; |
| 2488 | |
| 2489 | /* SANITY */ |
| 2490 | if (likely(newsk->sk_net_refcnt)) { |
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
| 2493 | } else { |
| 2494 | /* Kernel sockets are not elevating the struct net refcount. |
| 2495 | * Instead, use a tracker to more easily detect if a layer |
| 2496 | * is not properly dismantling its kernel sockets at netns |
| 2497 | * destroy time. |
| 2498 | */ |
		net_passive_inc(sock_net(newsk));
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
| 2502 | } |
| 2503 | |
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
| 2506 | |
| 2507 | if (lock) |
| 2508 | bh_lock_sock(newsk); |
| 2509 | |
| 2510 | newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; |
| 2511 | newsk->sk_backlog.len = 0; |
| 2512 | |
	atomic_set(&newsk->sk_rmem_alloc, 0);

	refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);
| 2519 | |
| 2520 | newsk->sk_dst_cache = NULL; |
| 2521 | newsk->sk_dst_pending_confirm = 0; |
| 2522 | newsk->sk_wmem_queued = 0; |
| 2523 | newsk->sk_forward_alloc = 0; |
| 2524 | newsk->sk_reserved_mem = 0; |
| 2525 | DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); |
	sk_drops_reset(newsk);
| 2527 | newsk->sk_send_head = NULL; |
| 2528 | newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; |
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);
| 2532 | |
| 2533 | #ifdef CONFIG_MEMCG |
| 2534 | /* sk->sk_memcg will be populated at accept() time */ |
| 2535 | newsk->sk_memcg = NULL; |
| 2536 | #endif |
| 2537 | |
	cgroup_sk_clone(&newsk->sk_cgrp_data);
| 2539 | |
| 2540 | rcu_read_lock(); |
| 2541 | filter = rcu_dereference(sk->sk_filter); |
| 2542 | if (filter != NULL) |
| 2543 | /* though it's an empty new sock, the charging may fail |
		 * if sysctl_optmem_max was changed between creation of the
		 * original socket and cloning
| 2546 | */ |
		is_charged = sk_filter_charge(newsk, filter);
| 2548 | RCU_INIT_POINTER(newsk->sk_filter, filter); |
| 2549 | rcu_read_unlock(); |
| 2550 | |
| 2551 | if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { |
| 2552 | /* We need to make sure that we don't uncharge the new |
| 2553 | * socket if we couldn't charge it in the first place |
| 2554 | * as otherwise we uncharge the parent's filter. |
| 2555 | */ |
| 2556 | if (!is_charged) |
| 2557 | RCU_INIT_POINTER(newsk->sk_filter, NULL); |
| 2558 | |
| 2559 | goto free; |
| 2560 | } |
| 2561 | |
| 2562 | RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); |
| 2563 | |
| 2564 | if (bpf_sk_storage_clone(sk, newsk)) |
| 2565 | goto free; |
| 2566 | |
| 2567 | /* Clear sk_user_data if parent had the pointer tagged |
| 2568 | * as not suitable for copying when cloning. |
| 2569 | */ |
	if (sk_user_data_is_nocopy(newsk))
| 2571 | newsk->sk_user_data = NULL; |
| 2572 | |
| 2573 | newsk->sk_err = 0; |
| 2574 | newsk->sk_err_soft = 0; |
| 2575 | newsk->sk_priority = 0; |
| 2576 | newsk->sk_incoming_cpu = raw_smp_processor_id(); |
| 2577 | |
| 2578 | /* Before updating sk_refcnt, we must commit prior changes to memory |
| 2579 | * (Documentation/RCU/rculist_nulls.rst for details) |
| 2580 | */ |
| 2581 | smp_wmb(); |
	refcount_set(&newsk->sk_refcnt, 2);

	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
| 2586 | RCU_INIT_POINTER(newsk->sk_wq, NULL); |
| 2587 | |
| 2588 | if (newsk->sk_prot->sockets_allocated) |
		sk_sockets_allocated_inc(newsk);
| 2590 | |
| 2591 | if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) |
| 2592 | net_enable_timestamp(); |
| 2593 | out: |
| 2594 | return newsk; |
| 2595 | free: |
	/* It is still a raw copy of the parent, so invalidate
	 * its destructor and do a plain sk_free()
	 */
| 2599 | newsk->sk_destruct = NULL; |
| 2600 | if (lock) |
| 2601 | bh_unlock_sock(newsk); |
| 2602 | sk_free(newsk); |
| 2603 | newsk = NULL; |
| 2604 | goto out; |
| 2605 | } |
| 2606 | EXPORT_SYMBOL_GPL(sk_clone); |
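
/*
 * Illustrative sketch (an assumption, not a caller in this file): a
 * passive-open path clones under BH and must honour the unlock rule
 * from the kernel-doc above even when it later fails:
 *
 *	newsk = sk_clone(sk, GFP_ATOMIC, true);
 *	if (newsk) {
 *		...
 *		bh_unlock_sock(newsk);
 *	}
 */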
| 2607 | |
| 2608 | static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) |
| 2609 | { |
| 2610 | bool is_ipv6 = false; |
| 2611 | u32 max_size; |
| 2612 | |
| 2613 | #if IS_ENABLED(CONFIG_IPV6) |
| 2614 | is_ipv6 = (sk->sk_family == AF_INET6 && |
		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
| 2616 | #endif |
| 2617 | /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ |
| 2618 | max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : |
| 2619 | READ_ONCE(dev->gso_ipv4_max_size); |
| 2620 | if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) |
| 2621 | max_size = GSO_LEGACY_MAX_SIZE; |
| 2622 | |
| 2623 | return max_size - (MAX_TCP_HEADER + 1); |
| 2624 | } |
| 2625 | |
| 2626 | void sk_setup_caps(struct sock *sk, struct dst_entry *dst) |
| 2627 | { |
| 2628 | const struct net_device *dev; |
| 2629 | u32 max_segs = 1; |
| 2630 | |
| 2631 | rcu_read_lock(); |
| 2632 | dev = dst_dev_rcu(dst); |
| 2633 | sk->sk_route_caps = dev->features; |
| 2634 | if (sk_is_tcp(sk)) { |
| 2635 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 2636 | |
| 2637 | sk->sk_route_caps |= NETIF_F_GSO; |
| 2638 | icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); |
| 2639 | } |
| 2640 | if (sk->sk_route_caps & NETIF_F_GSO) |
| 2641 | sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; |
| 2642 | if (unlikely(sk->sk_gso_disabled)) |
| 2643 | sk->sk_route_caps &= ~NETIF_F_GSO_MASK; |
| 2644 | if (sk_can_gso(sk)) { |
| 2645 | if (dst->header_len && !xfrm_dst_offload_ok(dst)) { |
| 2646 | sk->sk_route_caps &= ~NETIF_F_GSO_MASK; |
| 2647 | } else { |
| 2648 | sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; |
| 2649 | sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); |
| 2650 | /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ |
| 2651 | max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); |
| 2652 | } |
| 2653 | } |
| 2654 | sk->sk_gso_max_segs = max_segs; |
| 2655 | sk_dst_set(sk, dst); |
| 2656 | rcu_read_unlock(); |
| 2657 | } |
| 2658 | EXPORT_SYMBOL_GPL(sk_setup_caps); |
| 2659 | |
| 2660 | /* |
| 2661 | * Simple resource managers for sockets. |
| 2662 | */ |
| 2663 | |
| 2664 | |
| 2665 | /* |
| 2666 | * Write buffer destructor automatically called from kfree_skb. |
| 2667 | */ |
| 2668 | void sock_wfree(struct sk_buff *skb) |
| 2669 | { |
| 2670 | unsigned int len = skb->truesize; |
| 2671 | struct sock *sk = skb->sk; |
| 2672 | bool free; |
| 2673 | int old; |
| 2674 | |
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
						       &old);
			sock_def_write_space_wfree(sk, old - len);
| 2682 | rcu_read_unlock(); |
| 2683 | if (unlikely(free)) |
| 2684 | __sk_free(sk); |
| 2685 | return; |
| 2686 | } |
| 2687 | |
| 2688 | /* |
| 2689 | * Keep a reference on sk_wmem_alloc, this will be released |
| 2690 | * after sk_write_space() call |
| 2691 | */ |
| 2692 | WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); |
| 2693 | sk->sk_write_space(sk); |
| 2694 | len = 1; |
| 2695 | } |
| 2696 | /* |
| 2697 | * if sk_wmem_alloc reaches 0, we must finish what sk_free() |
| 2698 | * could not do because of in-flight packets |
| 2699 | */ |
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
| 2701 | __sk_free(sk); |
| 2702 | } |
| 2703 | EXPORT_SYMBOL(sock_wfree); |
| 2704 | |
| 2705 | /* This variant of sock_wfree() is used by TCP, |
| 2706 | * since it sets SOCK_USE_WRITE_QUEUE. |
| 2707 | */ |
| 2708 | void __sock_wfree(struct sk_buff *skb) |
| 2709 | { |
| 2710 | struct sock *sk = skb->sk; |
| 2711 | |
	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
| 2713 | __sk_free(sk); |
| 2714 | } |
| 2715 | |
| 2716 | void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) |
| 2717 | { |
| 2718 | int old_wmem; |
| 2719 | |
| 2720 | skb_orphan(skb); |
| 2721 | #ifdef CONFIG_INET |
| 2722 | if (unlikely(!sk_fullsock(sk))) |
| 2723 | return skb_set_owner_edemux(skb, sk); |
| 2724 | #endif |
| 2725 | skb->sk = sk; |
| 2726 | skb->destructor = sock_wfree; |
| 2727 | skb_set_hash_from_sk(skb, sk); |
| 2728 | /* |
| 2729 | * We used to take a refcount on sk, but following operation |
| 2730 | * is enough to guarantee sk_free() won't free this sock until |
| 2731 | * all in-flight packets are completed |
| 2732 | */ |
	__refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
| 2734 | |
| 2735 | /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket |
| 2736 | * is in a host queue (qdisc, NIC queue). |
| 2737 | * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue |
| 2738 | * based on XPS for better performance. |
| 2739 | * Otherwise clear ooo_okay to not risk Out Of Order delivery. |
| 2740 | */ |
| 2741 | skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); |
| 2742 | } |
| 2743 | EXPORT_SYMBOL(skb_set_owner_w); |
| 2744 | |
| 2745 | static bool can_skb_orphan_partial(const struct sk_buff *skb) |
| 2746 | { |
| 2747 | /* Drivers depend on in-order delivery for crypto offload, |
| 2748 | * partial orphan breaks out-of-order-OK logic. |
| 2749 | */ |
| 2750 | if (skb_is_decrypted(skb)) |
| 2751 | return false; |
| 2752 | |
| 2753 | return (skb->destructor == sock_wfree || |
| 2754 | (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); |
| 2755 | } |
| 2756 | |
| 2757 | /* This helper is used by netem, as it can hold packets in its |
| 2758 | * delay queue. We want to allow the owner socket to send more |
| 2759 | * packets, as if they were already TX completed by a typical driver. |
| 2760 | * But we also want to keep skb->sk set because some packet schedulers |
| 2761 | * rely on it (sch_fq for example). |
| 2762 | */ |
| 2763 | void skb_orphan_partial(struct sk_buff *skb) |
| 2764 | { |
| 2765 | if (skb_is_tcp_pure_ack(skb)) |
| 2766 | return; |
| 2767 | |
	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
| 2769 | return; |
| 2770 | |
| 2771 | skb_orphan(skb); |
| 2772 | } |
| 2773 | EXPORT_SYMBOL(skb_orphan_partial); |
| 2774 | |
| 2775 | /* |
| 2776 | * Read buffer destructor automatically called from kfree_skb. |
| 2777 | */ |
| 2778 | void sock_rfree(struct sk_buff *skb) |
| 2779 | { |
| 2780 | struct sock *sk = skb->sk; |
| 2781 | unsigned int len = skb->truesize; |
| 2782 | |
	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
| 2785 | } |
| 2786 | EXPORT_SYMBOL(sock_rfree); |
| 2787 | |
| 2788 | /* |
| 2789 | * Buffer destructor for skbs that are not used directly in read or write |
| 2790 | * path, e.g. for error handler skbs. Automatically called from kfree_skb. |
| 2791 | */ |
| 2792 | void sock_efree(struct sk_buff *skb) |
| 2793 | { |
	sock_put(skb->sk);
| 2795 | } |
| 2796 | EXPORT_SYMBOL(sock_efree); |
| 2797 | |
| 2798 | /* Buffer destructor for prefetch/receive path where reference count may |
| 2799 | * not be held, e.g. for listen sockets. |
| 2800 | */ |
| 2801 | #ifdef CONFIG_INET |
| 2802 | void sock_pfree(struct sk_buff *skb) |
| 2803 | { |
| 2804 | struct sock *sk = skb->sk; |
| 2805 | |
| 2806 | if (!sk_is_refcounted(sk)) |
| 2807 | return; |
| 2808 | |
| 2809 | if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { |
| 2810 | inet_reqsk(sk)->rsk_listener = NULL; |
		reqsk_free(inet_reqsk(sk));
| 2812 | return; |
| 2813 | } |
| 2814 | |
| 2815 | sock_gen_put(sk); |
| 2816 | } |
| 2817 | EXPORT_SYMBOL(sock_pfree); |
| 2818 | #endif /* CONFIG_INET */ |
| 2819 | |
| 2820 | /* |
| 2821 | * Allocate a skb from the socket's send buffer. |
| 2822 | */ |
| 2823 | struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, |
| 2824 | gfp_t priority) |
| 2825 | { |
| 2826 | if (force || |
	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
| 2828 | struct sk_buff *skb = alloc_skb(size, priority); |
| 2829 | |
| 2830 | if (skb) { |
| 2831 | skb_set_owner_w(skb, sk); |
| 2832 | return skb; |
| 2833 | } |
| 2834 | } |
| 2835 | return NULL; |
| 2836 | } |
| 2837 | EXPORT_SYMBOL(sock_wmalloc); |
| 2838 | |
| 2839 | static void sock_ofree(struct sk_buff *skb) |
| 2840 | { |
| 2841 | struct sock *sk = skb->sk; |
| 2842 | |
	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
| 2844 | } |
| 2845 | |
| 2846 | struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, |
| 2847 | gfp_t priority) |
| 2848 | { |
| 2849 | struct sk_buff *skb; |
| 2850 | |
| 2851 | /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ |
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
| 2853 | READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) |
| 2854 | return NULL; |
| 2855 | |
| 2856 | skb = alloc_skb(size, priority); |
| 2857 | if (!skb) |
| 2858 | return NULL; |
| 2859 | |
	atomic_add(skb->truesize, &sk->sk_omem_alloc);
| 2861 | skb->sk = sk; |
| 2862 | skb->destructor = sock_ofree; |
| 2863 | return skb; |
| 2864 | } |
| 2865 | |
| 2866 | /* |
| 2867 | * Allocate a memory block from the socket's option memory buffer. |
| 2868 | */ |
| 2869 | void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) |
| 2870 | { |
| 2871 | int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); |
| 2872 | |
| 2873 | if ((unsigned int)size <= optmem_max && |
	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
| 2875 | void *mem; |
| 2876 | /* First do the add, to avoid the race if kmalloc |
| 2877 | * might sleep. |
| 2878 | */ |
		atomic_add(size, &sk->sk_omem_alloc);
| 2880 | mem = kmalloc(size, priority); |
| 2881 | if (mem) |
| 2882 | return mem; |
		atomic_sub(size, &sk->sk_omem_alloc);
| 2884 | } |
| 2885 | return NULL; |
| 2886 | } |
| 2887 | EXPORT_SYMBOL(sock_kmalloc); |
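
/*
 * Illustrative sketch: option memory must be released with
 * sock_kfree_s()/sock_kzfree_s() passing the same size, so that
 * sk_omem_alloc stays balanced:
 *
 *	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, len);
 */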
| 2888 | |
| 2889 | /* |
| 2890 | * Duplicate the input "src" memory block using the socket's |
| 2891 | * option memory buffer. |
| 2892 | */ |
| 2893 | void *sock_kmemdup(struct sock *sk, const void *src, |
| 2894 | int size, gfp_t priority) |
| 2895 | { |
| 2896 | void *mem; |
| 2897 | |
| 2898 | mem = sock_kmalloc(sk, size, priority); |
| 2899 | if (mem) |
| 2900 | memcpy(mem, src, size); |
| 2901 | return mem; |
| 2902 | } |
| 2903 | EXPORT_SYMBOL(sock_kmemdup); |
| 2904 | |
| 2905 | /* Free an option memory block. Note, we actually want the inline |
| 2906 | * here as this allows gcc to detect the nullify and fold away the |
| 2907 | * condition entirely. |
| 2908 | */ |
| 2909 | static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, |
| 2910 | const bool nullify) |
| 2911 | { |
| 2912 | if (WARN_ON_ONCE(!mem)) |
| 2913 | return; |
| 2914 | if (nullify) |
		kfree_sensitive(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
| 2919 | } |
| 2920 | |
| 2921 | void sock_kfree_s(struct sock *sk, void *mem, int size) |
| 2922 | { |
	__sock_kfree_s(sk, mem, size, false);
| 2924 | } |
| 2925 | EXPORT_SYMBOL(sock_kfree_s); |
| 2926 | |
| 2927 | void sock_kzfree_s(struct sock *sk, void *mem, int size) |
| 2928 | { |
	__sock_kfree_s(sk, mem, size, true);
| 2930 | } |
| 2931 | EXPORT_SYMBOL(sock_kzfree_s); |
| 2932 | |
| 2933 | /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. |
   I think these locks should be removed for datagram sockets.
| 2935 | */ |
| 2936 | static long sock_wait_for_wmem(struct sock *sk, long timeo) |
| 2937 | { |
| 2938 | DEFINE_WAIT(wait); |
| 2939 | |
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
| 2941 | for (;;) { |
| 2942 | if (!timeo) |
| 2943 | break; |
| 2944 | if (signal_pending(current)) |
| 2945 | break; |
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
| 2949 | break; |
| 2950 | if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) |
| 2951 | break; |
| 2952 | if (READ_ONCE(sk->sk_err)) |
| 2953 | break; |
		timeo = schedule_timeout(timeo);
| 2955 | } |
	finish_wait(sk_sleep(sk), &wait);
| 2957 | return timeo; |
| 2958 | } |
| 2959 | |
| 2960 | |
| 2961 | /* |
| 2962 | * Generic send/receive buffer handlers |
| 2963 | */ |
| 2964 | |
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
| 2966 | unsigned long data_len, int noblock, |
| 2967 | int *errcode, int max_page_order) |
| 2968 | { |
| 2969 | struct sk_buff *skb; |
| 2970 | long timeo; |
| 2971 | int err; |
| 2972 | |
| 2973 | timeo = sock_sndtimeo(sk, noblock); |
| 2974 | for (;;) { |
| 2975 | err = sock_error(sk); |
| 2976 | if (err != 0) |
| 2977 | goto failure; |
| 2978 | |
| 2979 | err = -EPIPE; |
| 2980 | if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) |
| 2981 | goto failure; |
| 2982 | |
| 2983 | if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) |
| 2984 | break; |
| 2985 | |
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
| 2988 | err = -EAGAIN; |
| 2989 | if (!timeo) |
| 2990 | goto failure; |
| 2991 | if (signal_pending(current)) |
| 2992 | goto interrupted; |
| 2993 | timeo = sock_wait_for_wmem(sk, timeo); |
| 2994 | } |
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
| 2997 | if (skb) |
| 2998 | skb_set_owner_w(skb, sk); |
| 2999 | return skb; |
| 3000 | |
| 3001 | interrupted: |
| 3002 | err = sock_intr_errno(timeo); |
| 3003 | failure: |
| 3004 | *errcode = err; |
| 3005 | return NULL; |
| 3006 | } |
| 3007 | EXPORT_SYMBOL(sock_alloc_send_pskb); |
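
/*
 * Illustrative sketch: callers that only need a linear skb pass a zero
 * data_len and page order:
 *
 *	skb = sock_alloc_send_pskb(sk, len, 0, noblock, &err, 0);
 *	if (!skb)
 *		return err;
 */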
| 3008 | |
| 3009 | int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, |
| 3010 | struct sockcm_cookie *sockc) |
| 3011 | { |
| 3012 | u32 tsflags; |
| 3013 | |
| 3014 | BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); |
| 3015 | |
| 3016 | switch (cmsg->cmsg_type) { |
| 3017 | case SO_MARK: |
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
| 3020 | return -EPERM; |
| 3021 | if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) |
| 3022 | return -EINVAL; |
| 3023 | sockc->mark = *(u32 *)CMSG_DATA(cmsg); |
| 3024 | break; |
| 3025 | case SO_TIMESTAMPING_OLD: |
| 3026 | case SO_TIMESTAMPING_NEW: |
| 3027 | if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) |
| 3028 | return -EINVAL; |
| 3029 | |
| 3030 | tsflags = *(u32 *)CMSG_DATA(cmsg); |
| 3031 | if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) |
| 3032 | return -EINVAL; |
| 3033 | |
| 3034 | sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; |
| 3035 | sockc->tsflags |= tsflags; |
| 3036 | break; |
| 3037 | case SCM_TXTIME: |
		if (!sock_flag(sk, SOCK_TXTIME))
| 3039 | return -EINVAL; |
| 3040 | if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) |
| 3041 | return -EINVAL; |
| 3042 | sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); |
| 3043 | break; |
| 3044 | case SCM_TS_OPT_ID: |
| 3045 | if (sk_is_tcp(sk)) |
| 3046 | return -EINVAL; |
| 3047 | tsflags = READ_ONCE(sk->sk_tsflags); |
| 3048 | if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) |
| 3049 | return -EINVAL; |
| 3050 | if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) |
| 3051 | return -EINVAL; |
| 3052 | sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); |
| 3053 | sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; |
| 3054 | break; |
| 3055 | /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ |
| 3056 | case SCM_RIGHTS: |
| 3057 | case SCM_CREDENTIALS: |
| 3058 | break; |
| 3059 | case SO_PRIORITY: |
| 3060 | if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) |
| 3061 | return -EINVAL; |
		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
| 3063 | return -EPERM; |
| 3064 | sockc->priority = *(u32 *)CMSG_DATA(cmsg); |
| 3065 | break; |
| 3066 | case SCM_DEVMEM_DMABUF: |
| 3067 | if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) |
| 3068 | return -EINVAL; |
| 3069 | sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); |
| 3070 | break; |
| 3071 | default: |
| 3072 | return -EINVAL; |
| 3073 | } |
| 3074 | return 0; |
| 3075 | } |
| 3076 | EXPORT_SYMBOL(__sock_cmsg_send); |
| 3077 | |
| 3078 | int sock_cmsg_send(struct sock *sk, struct msghdr *msg, |
| 3079 | struct sockcm_cookie *sockc) |
| 3080 | { |
| 3081 | struct cmsghdr *cmsg; |
| 3082 | int ret; |
| 3083 | |
| 3084 | for_each_cmsghdr(cmsg, msg) { |
| 3085 | if (!CMSG_OK(msg, cmsg)) |
| 3086 | return -EINVAL; |
| 3087 | if (cmsg->cmsg_level != SOL_SOCKET) |
| 3088 | continue; |
| 3089 | ret = __sock_cmsg_send(sk, cmsg, sockc); |
| 3090 | if (ret) |
| 3091 | return ret; |
| 3092 | } |
| 3093 | return 0; |
| 3094 | } |
| 3095 | EXPORT_SYMBOL(sock_cmsg_send); |
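
/*
 * Illustrative sketch of the usual sendmsg pattern (an assumption, not
 * a caller in this file): seed the cookie from the socket, then let
 * control messages override it per call:
 *
 *	struct sockcm_cookie sockc = { .tsflags = READ_ONCE(sk->sk_tsflags) };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (err)
 *			goto out;
 *	}
 */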
| 3096 | |
| 3097 | static void sk_enter_memory_pressure(struct sock *sk) |
| 3098 | { |
| 3099 | if (!sk->sk_prot->enter_memory_pressure) |
| 3100 | return; |
| 3101 | |
| 3102 | sk->sk_prot->enter_memory_pressure(sk); |
| 3103 | } |
| 3104 | |
| 3105 | static void sk_leave_memory_pressure(struct sock *sk) |
| 3106 | { |
| 3107 | if (sk->sk_prot->leave_memory_pressure) { |
| 3108 | INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, |
| 3109 | tcp_leave_memory_pressure, sk); |
| 3110 | } else { |
| 3111 | unsigned long *memory_pressure = sk->sk_prot->memory_pressure; |
| 3112 | |
| 3113 | if (memory_pressure && READ_ONCE(*memory_pressure)) |
| 3114 | WRITE_ONCE(*memory_pressure, 0); |
| 3115 | } |
| 3116 | } |
| 3117 | |
| 3118 | DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); |
| 3119 | |
| 3120 | /** |
| 3121 | * skb_page_frag_refill - check that a page_frag contains enough room |
| 3122 | * @sz: minimum size of the fragment we want to get |
| 3123 | * @pfrag: pointer to page_frag |
| 3124 | * @gfp: priority for memory allocation |
| 3125 | * |
| 3126 | * Note: While this allocator tries to use high order pages, there is |
| 3127 | * no guarantee that allocations succeed. Therefore, @sz MUST be |
 * less than or equal to PAGE_SIZE.
| 3129 | */ |
| 3130 | bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) |
| 3131 | { |
| 3132 | if (pfrag->page) { |
		if (page_ref_count(pfrag->page) == 1) {
| 3134 | pfrag->offset = 0; |
| 3135 | return true; |
| 3136 | } |
| 3137 | if (pfrag->offset + sz <= pfrag->size) |
| 3138 | return true; |
		put_page(pfrag->page);
| 3140 | } |
| 3141 | |
| 3142 | pfrag->offset = 0; |
| 3143 | if (SKB_FRAG_PAGE_ORDER && |
| 3144 | !static_branch_unlikely(&net_high_order_alloc_disable_key)) { |
| 3145 | /* Avoid direct reclaim but allow kswapd to wake */ |
| 3146 | pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | |
| 3147 | __GFP_COMP | __GFP_NOWARN | |
| 3148 | __GFP_NORETRY, |
| 3149 | SKB_FRAG_PAGE_ORDER); |
| 3150 | if (likely(pfrag->page)) { |
| 3151 | pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; |
| 3152 | return true; |
| 3153 | } |
| 3154 | } |
| 3155 | pfrag->page = alloc_page(gfp); |
| 3156 | if (likely(pfrag->page)) { |
| 3157 | pfrag->size = PAGE_SIZE; |
| 3158 | return true; |
| 3159 | } |
| 3160 | return false; |
| 3161 | } |
| 3162 | EXPORT_SYMBOL(skb_page_frag_refill); |
| 3163 | |
| 3164 | bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) |
| 3165 | { |
| 3166 | if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) |
| 3167 | return true; |
| 3168 | |
| 3169 | if (!sk->sk_bypass_prot_mem) |
| 3170 | sk_enter_memory_pressure(sk); |
| 3171 | |
| 3172 | sk_stream_moderate_sndbuf(sk); |
| 3173 | |
| 3174 | return false; |
| 3175 | } |
| 3176 | EXPORT_SYMBOL(sk_page_frag_refill); |
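
/*
 * Illustrative sketch of the typical sendmsg copy loop (an assumption,
 * modeled on stream protocols, not a caller in this file):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy user data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */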
| 3177 | |
| 3178 | void __lock_sock(struct sock *sk) |
| 3179 | __releases(&sk->sk_lock.slock) |
| 3180 | __acquires(&sk->sk_lock.slock) |
| 3181 | { |
| 3182 | DEFINE_WAIT(wait); |
| 3183 | |
| 3184 | for (;;) { |
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
| 3190 | if (!sock_owned_by_user(sk)) |
| 3191 | break; |
| 3192 | } |
	finish_wait(&sk->sk_lock.wq, &wait);
| 3194 | } |
| 3195 | |
| 3196 | void __release_sock(struct sock *sk) |
| 3197 | __releases(&sk->sk_lock.slock) |
| 3198 | __acquires(&sk->sk_lock.slock) |
| 3199 | { |
| 3200 | struct sk_buff *skb, *next; |
| 3201 | int nb = 0; |
| 3202 | |
| 3203 | while ((skb = sk->sk_backlog.head) != NULL) { |
| 3204 | sk->sk_backlog.head = sk->sk_backlog.tail = NULL; |
| 3205 | |
		spin_unlock_bh(&sk->sk_lock.slock);
| 3207 | |
| 3208 | while (1) { |
| 3209 | next = skb->next; |
| 3210 | prefetch(next); |
| 3211 | DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); |
| 3212 | skb_mark_not_on_list(skb); |
| 3213 | sk_backlog_rcv(sk, skb); |
| 3214 | |
| 3215 | skb = next; |
| 3216 | if (!skb) |
| 3217 | break; |
| 3218 | |
| 3219 | if (!(++nb & 15)) |
| 3220 | cond_resched(); |
| 3221 | } |
| 3222 | |
		spin_lock_bh(&sk->sk_lock.slock);
| 3224 | } |
| 3225 | |
| 3226 | /* |
	 * Doing the zeroing here guarantees we cannot loop forever
| 3228 | * while a wild producer attempts to flood us. |
| 3229 | */ |
| 3230 | sk->sk_backlog.len = 0; |
| 3231 | } |
| 3232 | |
| 3233 | void __sk_flush_backlog(struct sock *sk) |
| 3234 | { |
	spin_lock_bh(&sk->sk_lock.slock);
| 3236 | __release_sock(sk); |
| 3237 | |
| 3238 | if (sk->sk_prot->release_cb) |
| 3239 | INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, |
| 3240 | tcp_release_cb, sk); |
| 3241 | |
	spin_unlock_bh(&sk->sk_lock.slock);
| 3243 | } |
| 3244 | EXPORT_SYMBOL_GPL(__sk_flush_backlog); |
| 3245 | |
| 3246 | /** |
| 3247 | * sk_wait_data - wait for data to arrive at sk_receive_queue |
| 3248 | * @sk: sock to wait on |
| 3249 | * @timeo: for how long |
| 3250 | * @skb: last skb seen on sk_receive_queue |
| 3251 | * |
| 3252 | * Now socket state including sk->sk_err is changed only under lock, |
| 3253 | * hence we may omit checks after joining wait queue. |
| 3254 | * We check receive queue before schedule() only as optimization; |
| 3255 | * it is very likely that release_sock() added new data. |
| 3256 | */ |
| 3257 | int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) |
| 3258 | { |
| 3259 | DEFINE_WAIT_FUNC(wait, woken_wake_function); |
| 3260 | int rc; |
| 3261 | |
	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
| 3267 | return rc; |
| 3268 | } |
| 3269 | EXPORT_SYMBOL(sk_wait_data); |
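
/*
 * Illustrative sketch (an assumption): a blocking recvmsg path waits
 * for new data relative to the last skb it has already seen:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	if (skb_queue_empty(&sk->sk_receive_queue))
 *		sk_wait_data(sk, &timeo, NULL);
 */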
| 3270 | |
| 3271 | /** |
| 3272 | * __sk_mem_raise_allocated - increase memory_allocated |
| 3273 | * @sk: socket |
| 3274 | * @size: memory size to allocate |
| 3275 | * @amt: pages to allocate |
| 3276 | * @kind: allocation type |
| 3277 | * |
| 3278 | * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. |
| 3279 | * |
 * Unlike the globally shared limits among the sockets under the same protocol,
 * consuming the budget of a memcg won't have a direct effect on other ones.
| 3282 | * So be optimistic about memcg's tolerance, and leave the callers to decide |
| 3283 | * whether or not to raise allocated through sk_under_memory_pressure() or |
| 3284 | * its variants. |
| 3285 | */ |
| 3286 | int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) |
| 3287 | { |
| 3288 | bool memcg_enabled = false, charged = false; |
| 3289 | struct proto *prot = sk->sk_prot; |
| 3290 | long allocated = 0; |
| 3291 | |
| 3292 | if (!sk->sk_bypass_prot_mem) { |
		sk_memory_allocated_add(sk, amt);
| 3294 | allocated = sk_memory_allocated(sk); |
| 3295 | } |
| 3296 | |
| 3297 | if (mem_cgroup_sk_enabled(sk)) { |
| 3298 | memcg_enabled = true; |
		charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
| 3300 | if (!charged) |
| 3301 | goto suppress_allocation; |
| 3302 | } |
| 3303 | |
| 3304 | if (!allocated) |
| 3305 | return 1; |
| 3306 | |
| 3307 | /* Under limit. */ |
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
| 3309 | sk_leave_memory_pressure(sk); |
| 3310 | return 1; |
| 3311 | } |
| 3312 | |
| 3313 | /* Under pressure. */ |
	if (allocated > sk_prot_mem_limits(sk, 1))
| 3315 | sk_enter_memory_pressure(sk); |
| 3316 | |
| 3317 | /* Over hard limit. */ |
	if (allocated > sk_prot_mem_limits(sk, 2))
| 3319 | goto suppress_allocation; |
| 3320 | |
| 3321 | /* Guarantee a minimum buffer size under pressure (either global
| 3322 | * or memcg) to make sure the features described in RFC 7323 (TCP
| 3323 | * Extensions for High Performance) work properly.
| 3324 | *
| 3325 | * This rule does NOT hold when usage exceeds the global or memcg
| 3326 | * hard limit, or else a DoS attack could be mounted by spawning
| 3327 | * lots of sockets whose usage stays just under the minimum buffer size.
| 3328 | */ |
| 3329 | if (kind == SK_MEM_RECV) { |
| 3330 | if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
| 3331 | return 1;
| 3332 |
| 3333 | } else { /* SK_MEM_SEND */
| 3334 | int wmem0 = sk_get_wmem0(sk, prot);
| 3335 |
| 3336 | if (sk->sk_type == SOCK_STREAM) {
| 3337 | if (sk->sk_wmem_queued < wmem0)
| 3338 | return 1;
| 3339 | } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
| 3340 | return 1; |
| 3341 | } |
| 3342 | } |
| 3343 | |
| 3344 | if (sk_has_memory_pressure(sk)) { |
| 3345 | u64 alloc; |
| 3346 | |
| 3347 | /* The following 'average' heuristic is within the |
| 3348 | * scope of global accounting, so it only makes |
| 3349 | * sense for global memory pressure. |
| 3350 | */ |
| 3351 | if (!sk_under_global_memory_pressure(sk)) |
| 3352 | return 1; |
| 3353 | |
| 3354 | /* Try to be fair among all the sockets under global
| 3355 | * pressure by allowing the ones with below-average
| 3356 | * usage to raise their allocations.
| 3357 | */
| 3358 | alloc = sk_sockets_allocated_read_positive(sk);
| 3359 | if (sk_prot_mem_limits(sk, 2) > alloc *
| 3360 | sk_mem_pages(sk->sk_wmem_queued +
| 3361 | atomic_read(&sk->sk_rmem_alloc) +
| 3362 | sk->sk_forward_alloc))
| 3363 | return 1; |
| 3364 | } |
| 3365 | |
| 3366 | suppress_allocation: |
| 3367 | |
| 3368 | if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { |
| 3369 | sk_stream_moderate_sndbuf(sk); |
| 3370 | |
| 3371 | /* Fail only if the socket is _under_ its sndbuf.
| 3372 | * In this case we cannot block, so we have to fail.
| 3373 | */
| 3374 | if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
| 3375 | /* Force the charge with __GFP_NOFAIL */
| 3376 | if (memcg_enabled && !charged)
| 3377 | mem_cgroup_sk_charge(sk, amt,
| 3378 | gfp_memcg_charge() | __GFP_NOFAIL);
| 3379 | return 1; |
| 3380 | } |
| 3381 | } |
| 3382 | |
| 3383 | trace_sock_exceed_buf_limit(sk, prot, allocated, kind); |
| 3384 | |
| 3385 | if (allocated) |
| 3386 | sk_memory_allocated_sub(sk, amt);
| 3387 |
| 3388 | if (charged)
| 3389 | mem_cgroup_sk_uncharge(sk, amt);
| 3390 | |
| 3391 | return 0; |
| 3392 | } |
| 3393 | |
| 3394 | /** |
| 3395 | * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated |
| 3396 | * @sk: socket |
| 3397 | * @size: memory size to allocate |
| 3398 | * @kind: allocation type |
| 3399 | * |
| 3400 | * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means |
| 3401 | * rmem allocation. This function assumes that protocols which have |
| 3402 | * memory_pressure use sk_wmem_queued as write buffer accounting. |
| 3403 | */ |
| 3404 | int __sk_mem_schedule(struct sock *sk, int size, int kind) |
| 3405 | { |
| 3406 | int ret, amt = sk_mem_pages(size);
| 3407 |
| 3408 | sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
| 3409 | ret = __sk_mem_raise_allocated(sk, size, amt, kind);
| 3410 | if (!ret)
| 3411 | sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
| 3412 | return ret; |
| 3413 | } |
| 3414 | EXPORT_SYMBOL(__sk_mem_schedule); |
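|      |
|      | /* Illustrative sketch (editor's addition): the usual receive-path pattern
|      |  * built on top of __sk_mem_schedule() via the sk_rmem_schedule() wrapper.
|      |  * Real callers (e.g. tcp_data_queue()) add more fallbacks; the function
|      |  * name here is hypothetical.
|      |  */
|      | static int __maybe_unused example_charge_rmem(struct sock *sk,
|      |                                               struct sk_buff *skb)
|      | {
|      |         if (!sk_rmem_schedule(sk, skb, skb->truesize))
|      |                 return -ENOBUFS;        /* over limits, no exemption */
|      |
|      |         /* Takes the rmem charge and consumes the forward allocation. */
|      |         skb_set_owner_r(skb, sk);
|      |         __skb_queue_tail(&sk->sk_receive_queue, skb);
|      |         return 0;
|      | }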
| 3415 | |
| 3416 | /** |
| 3417 | * __sk_mem_reduce_allocated - reclaim memory_allocated |
| 3418 | * @sk: socket |
| 3419 | * @amount: number of quanta |
| 3420 | * |
| 3421 | * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc |
| 3422 | */ |
| 3423 | void __sk_mem_reduce_allocated(struct sock *sk, int amount) |
| 3424 | { |
| 3425 | if (mem_cgroup_sk_enabled(sk)) |
| 3426 | mem_cgroup_sk_uncharge(sk, amount);
| 3427 |
| 3428 | if (sk->sk_bypass_prot_mem)
| 3429 | return;
| 3430 |
| 3431 | sk_memory_allocated_sub(sk, amount);
| 3432 |
| 3433 | if (sk_under_global_memory_pressure(sk) &&
| 3434 | (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
| 3435 | sk_leave_memory_pressure(sk); |
| 3436 | } |
| 3437 | |
| 3438 | /** |
| 3439 | * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated |
| 3440 | * @sk: socket |
| 3441 | * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) |
| 3442 | */ |
| 3443 | void __sk_mem_reclaim(struct sock *sk, int amount) |
| 3444 | { |
| 3445 | amount >>= PAGE_SHIFT; |
| 3446 | sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
| 3447 | __sk_mem_reduce_allocated(sk, amount); |
| 3448 | } |
| 3449 | EXPORT_SYMBOL(__sk_mem_reclaim); |
| 3450 | |
| 3451 | void __sk_charge(struct sock *sk, gfp_t gfp) |
| 3452 | { |
| 3453 | int amt; |
| 3454 | |
| 3455 | gfp |= __GFP_NOFAIL; |
| 3456 | if (mem_cgroup_from_sk(sk)) { |
| 3457 | /* The socket has not been accepted yet, no need |
| 3458 | * to look at newsk->sk_wmem_queued. |
| 3459 | */ |
| 3460 | amt = sk_mem_pages(sk->sk_forward_alloc +
| 3461 | atomic_read(&sk->sk_rmem_alloc));
| 3462 | if (amt)
| 3463 | mem_cgroup_sk_charge(sk, amt, gfp);
| 3464 | }
| 3465 |
| 3466 | kmem_cache_charge(sk, gfp);
| 3467 | } |
| 3468 | |
| 3469 | int sk_set_peek_off(struct sock *sk, int val) |
| 3470 | { |
| 3471 | WRITE_ONCE(sk->sk_peek_off, val); |
| 3472 | return 0; |
| 3473 | } |
| 3474 | EXPORT_SYMBOL_GPL(sk_set_peek_off); |
| 3475 | |
| 3476 | /* |
| 3477 | * Set of default routines for initialising struct proto_ops when |
| 3478 | * the protocol does not support a particular function. In certain |
| 3479 | * cases where it makes no sense for a protocol to have a "do nothing" |
| 3480 | * function, some default processing is provided. |
| 3481 | */ |
| 3482 | |
| 3483 | int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) |
| 3484 | { |
| 3485 | return -EOPNOTSUPP; |
| 3486 | } |
| 3487 | EXPORT_SYMBOL(sock_no_bind); |
| 3488 | |
| 3489 | int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, |
| 3490 | int len, int flags) |
| 3491 | { |
| 3492 | return -EOPNOTSUPP; |
| 3493 | } |
| 3494 | EXPORT_SYMBOL(sock_no_connect); |
| 3495 | |
| 3496 | int sock_no_socketpair(struct socket *sock1, struct socket *sock2) |
| 3497 | { |
| 3498 | return -EOPNOTSUPP; |
| 3499 | } |
| 3500 | EXPORT_SYMBOL(sock_no_socketpair); |
| 3501 | |
| 3502 | int sock_no_accept(struct socket *sock, struct socket *newsock, |
| 3503 | struct proto_accept_arg *arg) |
| 3504 | { |
| 3505 | return -EOPNOTSUPP; |
| 3506 | } |
| 3507 | EXPORT_SYMBOL(sock_no_accept); |
| 3508 | |
| 3509 | int sock_no_getname(struct socket *sock, struct sockaddr *saddr, |
| 3510 | int peer) |
| 3511 | { |
| 3512 | return -EOPNOTSUPP; |
| 3513 | } |
| 3514 | EXPORT_SYMBOL(sock_no_getname); |
| 3515 | |
| 3516 | int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) |
| 3517 | { |
| 3518 | return -EOPNOTSUPP; |
| 3519 | } |
| 3520 | EXPORT_SYMBOL(sock_no_ioctl); |
| 3521 | |
| 3522 | int sock_no_listen(struct socket *sock, int backlog) |
| 3523 | { |
| 3524 | return -EOPNOTSUPP; |
| 3525 | } |
| 3526 | EXPORT_SYMBOL(sock_no_listen); |
| 3527 | |
| 3528 | int sock_no_shutdown(struct socket *sock, int how) |
| 3529 | { |
| 3530 | return -EOPNOTSUPP; |
| 3531 | } |
| 3532 | EXPORT_SYMBOL(sock_no_shutdown); |
| 3533 | |
| 3534 | int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) |
| 3535 | { |
| 3536 | return -EOPNOTSUPP; |
| 3537 | } |
| 3538 | EXPORT_SYMBOL(sock_no_sendmsg); |
| 3539 | |
| 3540 | int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) |
| 3541 | { |
| 3542 | return -EOPNOTSUPP; |
| 3543 | } |
| 3544 | EXPORT_SYMBOL(sock_no_sendmsg_locked); |
| 3545 | |
| 3546 | int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, |
| 3547 | int flags) |
| 3548 | { |
| 3549 | return -EOPNOTSUPP; |
| 3550 | } |
| 3551 | EXPORT_SYMBOL(sock_no_recvmsg); |
| 3552 | |
| 3553 | int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) |
| 3554 | { |
| 3555 | /* Mirror missing mmap method error code */ |
| 3556 | return -ENODEV; |
| 3557 | } |
| 3558 | EXPORT_SYMBOL(sock_no_mmap); |
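|      |
|      | /* Illustrative sketch (editor's addition): a datagram-only family could
|      |  * plug the stubs above into its proto_ops like this. The variable name and
|      |  * field subset are hypothetical; PF_UNSPEC is a placeholder.
|      |  */
|      | static const struct proto_ops example_dgram_ops __maybe_unused = {
|      |         .family     = PF_UNSPEC,
|      |         .owner      = THIS_MODULE,
|      |         .connect    = sock_no_connect,
|      |         .socketpair = sock_no_socketpair,
|      |         .accept     = sock_no_accept,
|      |         .listen     = sock_no_listen,
|      |         .shutdown   = sock_no_shutdown,
|      |         .mmap       = sock_no_mmap,
|      | };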
| 3559 | |
| 3560 | /* |
| 3561 | * When a file is received (via SCM_RIGHTS, etc), we must bump the |
| 3562 | * various sock-based usage counts. |
| 3563 | */ |
| 3564 | void __receive_sock(struct file *file) |
| 3565 | { |
| 3566 | struct socket *sock; |
| 3567 | |
| 3568 | sock = sock_from_file(file); |
| 3569 | if (sock) { |
| 3570 | sock_update_netprioidx(&sock->sk->sk_cgrp_data);
| 3571 | sock_update_classid(&sock->sk->sk_cgrp_data);
| 3572 | } |
| 3573 | } |
| 3574 | |
| 3575 | /* |
| 3576 | * Default Socket Callbacks |
| 3577 | */ |
| 3578 | |
| 3579 | static void sock_def_wakeup(struct sock *sk) |
| 3580 | { |
| 3581 | struct socket_wq *wq; |
| 3582 | |
| 3583 | rcu_read_lock(); |
| 3584 | wq = rcu_dereference(sk->sk_wq); |
| 3585 | if (skwq_has_sleeper(wq)) |
| 3586 | wake_up_interruptible_all(&wq->wait); |
| 3587 | rcu_read_unlock(); |
| 3588 | } |
| 3589 | |
| 3590 | static void sock_def_error_report(struct sock *sk) |
| 3591 | { |
| 3592 | struct socket_wq *wq; |
| 3593 | |
| 3594 | rcu_read_lock(); |
| 3595 | wq = rcu_dereference(sk->sk_wq); |
| 3596 | if (skwq_has_sleeper(wq)) |
| 3597 | wake_up_interruptible_poll(&wq->wait, EPOLLERR); |
| 3598 | sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
| 3599 | rcu_read_unlock(); |
| 3600 | } |
| 3601 | |
| 3602 | void sock_def_readable(struct sock *sk) |
| 3603 | { |
| 3604 | struct socket_wq *wq; |
| 3605 | |
| 3606 | trace_sk_data_ready(sk); |
| 3607 | |
| 3608 | rcu_read_lock(); |
| 3609 | wq = rcu_dereference(sk->sk_wq); |
| 3610 | if (skwq_has_sleeper(wq)) |
| 3611 | wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | |
| 3612 | EPOLLRDNORM | EPOLLRDBAND); |
| 3613 | sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
| 3614 | rcu_read_unlock(); |
| 3615 | } |
| 3616 | |
| 3617 | static void sock_def_write_space(struct sock *sk) |
| 3618 | { |
| 3619 | struct socket_wq *wq; |
| 3620 | |
| 3621 | rcu_read_lock(); |
| 3622 | |
| 3623 | /* Do not wake up a writer until it can make "significant"
| 3624 | * progress. --DaveM |
| 3625 | */ |
| 3626 | if (sock_writeable(sk)) { |
| 3627 | wq = rcu_dereference(sk->sk_wq); |
| 3628 | if (skwq_has_sleeper(wq)) |
| 3629 | wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | |
| 3630 | EPOLLWRNORM | EPOLLWRBAND); |
| 3631 | |
| 3632 | /* Should agree with poll, otherwise some programs break */ |
| 3633 | sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
| 3634 | } |
| 3635 | |
| 3636 | rcu_read_unlock(); |
| 3637 | } |
| 3638 | |
| 3639 | /* An optimised version of sock_def_write_space(); should only be called
| 3640 | * for SOCK_RCU_FREE sockets, inside an RCU read-side section, and after
| 3641 | * putting ->sk_wmem_alloc.
| 3642 | */ |
| 3643 | static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) |
| 3644 | { |
| 3645 | /* Do not wake up a writer until it can make "significant"
| 3646 | * progress. --DaveM |
| 3647 | */ |
| 3648 | if (__sock_writeable(sk, wmem_alloc)) { |
| 3649 | struct socket_wq *wq = rcu_dereference(sk->sk_wq); |
| 3650 | |
| 3651 | /* rely on refcount_sub from sock_wfree() */ |
| 3652 | smp_mb__after_atomic(); |
| 3653 | if (wq && waitqueue_active(&wq->wait))
| 3654 | wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
| 3655 | EPOLLWRNORM | EPOLLWRBAND);
| 3656 |
| 3657 | /* Should agree with poll, otherwise some programs break */
| 3658 | sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
| 3659 | } |
| 3660 | } |
| 3661 | |
| 3662 | static void sock_def_destruct(struct sock *sk) |
| 3663 | { |
| 3664 | } |
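|      |
|      | /* Illustrative sketch (editor's addition): layered users (kTLS, sk_psock)
|      |  * override sk_data_ready and chain back to the default callback; the
|      |  * function name here is hypothetical.
|      |  */
|      | static void __maybe_unused example_data_ready(struct sock *sk)
|      | {
|      |         /* ... inspect or strip protocol framing first ... */
|      |
|      |         /* Then let the default callback wake sleepers and async waiters. */
|      |         sock_def_readable(sk);
|      | }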
| 3665 | |
| 3666 | void sk_send_sigurg(struct sock *sk) |
| 3667 | { |
| 3668 | if (sk->sk_socket && sk->sk_socket->file) |
| 3669 | if (send_sigurg(sk->sk_socket->file))
| 3670 | sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
| 3671 | } |
| 3672 | EXPORT_SYMBOL(sk_send_sigurg); |
| 3673 | |
| 3674 | void sk_reset_timer(struct sock *sk, struct timer_list *timer,
| 3675 | unsigned long expires) |
| 3676 | { |
| 3677 | if (!mod_timer(timer, expires)) |
| 3678 | sock_hold(sk); |
| 3679 | } |
| 3680 | EXPORT_SYMBOL(sk_reset_timer); |
| 3681 | |
| 3682 | void sk_stop_timer(struct sock *sk, struct timer_list *timer)
| 3683 | { |
| 3684 | if (timer_delete(timer)) |
| 3685 | __sock_put(sk); |
| 3686 | } |
| 3687 | EXPORT_SYMBOL(sk_stop_timer); |
| 3688 | |
| 3689 | void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) |
| 3690 | { |
| 3691 | if (timer_delete_sync(timer)) |
| 3692 | __sock_put(sk); |
| 3693 | } |
| 3694 | EXPORT_SYMBOL(sk_stop_timer_sync); |
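|      |
|      | /* Illustrative sketch (editor's addition): sk_reset_timer() holds one sock
|      |  * reference while the timer is pending and sk_stop_timer() drops it; when
|      |  * the timer fires instead, the handler owns that reference and must either
|      |  * rearm via sk_reset_timer() or put it. Hypothetical callback name.
|      |  */
|      | static void __maybe_unused example_sk_timer(struct timer_list *t)
|      | {
|      |         struct sock *sk = container_of(t, struct sock, sk_timer);
|      |
|      |         /* ... periodic protocol work ... */
|      |
|      |         sock_put(sk);   /* release the reference sk_reset_timer() took */
|      | }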
| 3695 | |
| 3696 | void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) |
| 3697 | { |
| 3698 | sk_init_common(sk); |
| 3699 | sk->sk_send_head = NULL; |
| 3700 | |
| 3701 | timer_setup(&sk->sk_timer, NULL, 0); |
| 3702 | |
| 3703 | sk->sk_allocation = GFP_KERNEL; |
| 3704 | sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); |
| 3705 | sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); |
| 3706 | sk->sk_state = TCP_CLOSE; |
| 3707 | sk->sk_use_task_frag = true; |
| 3708 | sk_set_socket(sk, sock); |
| 3709 | |
| 3710 | sock_set_flag(sk, SOCK_ZAPPED);
| 3711 | |
| 3712 | if (sock) { |
| 3713 | sk->sk_type = sock->type; |
| 3714 | RCU_INIT_POINTER(sk->sk_wq, &sock->wq); |
| 3715 | sock->sk = sk; |
| 3716 | } else { |
| 3717 | RCU_INIT_POINTER(sk->sk_wq, NULL); |
| 3718 | } |
| 3719 | sk->sk_uid = uid; |
| 3720 | |
| 3721 | sk->sk_state_change = sock_def_wakeup; |
| 3722 | sk->sk_data_ready = sock_def_readable; |
| 3723 | sk->sk_write_space = sock_def_write_space; |
| 3724 | sk->sk_error_report = sock_def_error_report; |
| 3725 | sk->sk_destruct = sock_def_destruct; |
| 3726 | |
| 3727 | sk->sk_frag.page = NULL; |
| 3728 | sk->sk_frag.offset = 0; |
| 3729 | sk->sk_peek_off = -1; |
| 3730 | |
| 3731 | sk->sk_peer_pid = NULL; |
| 3732 | sk->sk_peer_cred = NULL; |
| 3733 | spin_lock_init(&sk->sk_peer_lock); |
| 3734 | |
| 3735 | sk->sk_write_pending = 0; |
| 3736 | sk->sk_rcvlowat = 1; |
| 3737 | sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; |
| 3738 | sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; |
| 3739 | |
| 3740 | sk->sk_stamp = SK_DEFAULT_STAMP; |
| 3741 | #if BITS_PER_LONG==32 |
| 3742 | seqlock_init(&sk->sk_stamp_seq); |
| 3743 | #endif |
| 3744 | atomic_set(&sk->sk_zckey, 0);
| 3745 | |
| 3746 | #ifdef CONFIG_NET_RX_BUSY_POLL |
| 3747 | sk->sk_napi_id = 0; |
| 3748 | sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); |
| 3749 | #endif |
| 3750 | |
| 3751 | sk->sk_max_pacing_rate = ~0UL; |
| 3752 | sk->sk_pacing_rate = ~0UL; |
| 3753 | WRITE_ONCE(sk->sk_pacing_shift, 10); |
| 3754 | sk->sk_incoming_cpu = -1; |
| 3755 | |
| 3756 | sk_rx_queue_clear(sk); |
| 3757 | /* |
| 3758 | * Before updating sk_refcnt, we must commit prior changes to memory |
| 3759 | * (Documentation/RCU/rculist_nulls.rst for details) |
| 3760 | */ |
| 3761 | smp_wmb(); |
| 3762 | refcount_set(&sk->sk_refcnt, 1);
| 3763 | sk_drops_reset(sk); |
| 3764 | } |
| 3765 | EXPORT_SYMBOL(sock_init_data_uid); |
| 3766 | |
| 3767 | void sock_init_data(struct socket *sock, struct sock *sk) |
| 3768 | { |
| 3769 | kuid_t uid = sock ?
| 3770 | SOCK_INODE(sock)->i_uid :
| 3771 | make_kuid(sock_net(sk)->user_ns, 0);
| 3772 | |
| 3773 | sock_init_data_uid(sock, sk, uid); |
| 3774 | } |
| 3775 | EXPORT_SYMBOL(sock_init_data); |
| 3776 | |
| 3777 | void lock_sock_nested(struct sock *sk, int subclass) |
| 3778 | { |
| 3779 | /* The sk_lock has mutex_lock() semantics here. */ |
| 3780 | mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); |
| 3781 | |
| 3782 | might_sleep(); |
| 3783 | spin_lock_bh(&sk->sk_lock.slock);
| 3784 | if (sock_owned_by_user_nocheck(sk))
| 3785 | __lock_sock(sk);
| 3786 | sk->sk_lock.owned = 1;
| 3787 | spin_unlock_bh(&sk->sk_lock.slock);
| 3788 | } |
| 3789 | EXPORT_SYMBOL(lock_sock_nested); |
| 3790 | |
| 3791 | void release_sock(struct sock *sk) |
| 3792 | { |
| 3793 | spin_lock_bh(&sk->sk_lock.slock);
| 3794 | if (sk->sk_backlog.tail)
| 3795 | __release_sock(sk);
| 3796 |
| 3797 | if (sk->sk_prot->release_cb)
| 3798 | INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
| 3799 | tcp_release_cb, sk);
| 3800 |
| 3801 | sock_release_ownership(sk);
| 3802 | if (waitqueue_active(&sk->sk_lock.wq))
| 3803 | wake_up(&sk->sk_lock.wq);
| 3804 | spin_unlock_bh(&sk->sk_lock.slock);
| 3805 | } |
| 3806 | EXPORT_SYMBOL(release_sock); |
| 3807 | |
| 3808 | bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) |
| 3809 | { |
| 3810 | might_sleep(); |
| 3811 | spin_lock_bh(&sk->sk_lock.slock);
| 3812 | |
| 3813 | if (!sock_owned_by_user_nocheck(sk)) { |
| 3814 | /* |
| 3815 | * Fast path return with bottom halves disabled and |
| 3816 | * sock::sk_lock.slock held. |
| 3817 | * |
| 3818 | * The 'mutex' is not contended, and holding
| 3819 | * sock::sk_lock.slock prevents all other lockers from
| 3820 | * proceeding, so the corresponding unlock_sock_fast() can
| 3821 | * avoid the slow path of release_sock() completely and
| 3822 | * just release slock.
| 3823 | *
| 3824 | * From a semantic POV this is equivalent to 'acquiring'
| 3825 | * the 'mutex', hence the corresponding lockdep |
| 3826 | * mutex_release() has to happen in the fast path of |
| 3827 | * unlock_sock_fast(). |
| 3828 | */ |
| 3829 | return false; |
| 3830 | } |
| 3831 | |
| 3832 | __lock_sock(sk); |
| 3833 | sk->sk_lock.owned = 1; |
| 3834 | __acquire(&sk->sk_lock.slock); |
| 3835 | spin_unlock_bh(&sk->sk_lock.slock);
| 3836 | return true; |
| 3837 | } |
| 3838 | EXPORT_SYMBOL(__lock_sock_fast); |
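|      |
|      | /* Illustrative sketch (editor's addition): the intended pairing for the
|      |  * fast-path lock. The token returned by lock_sock_fast() records whether
|      |  * the slow path was taken and must be handed to unlock_sock_fast().
|      |  */
|      | static void __maybe_unused example_fast_locked_op(struct sock *sk)
|      | {
|      |         bool slow = lock_sock_fast(sk);
|      |
|      |         /* ... a short critical section, e.g. purging a queue ... */
|      |
|      |         unlock_sock_fast(sk, slow);
|      | }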
| 3839 | |
| 3840 | int sock_gettstamp(struct socket *sock, void __user *userstamp, |
| 3841 | bool timeval, bool time32) |
| 3842 | { |
| 3843 | struct sock *sk = sock->sk; |
| 3844 | struct timespec64 ts; |
| 3845 | |
| 3846 | sock_enable_timestamp(sk, SOCK_TIMESTAMP);
| 3847 | ts = ktime_to_timespec64(sock_read_timestamp(sk)); |
| 3848 | if (ts.tv_sec == -1) |
| 3849 | return -ENOENT; |
| 3850 | if (ts.tv_sec == 0) { |
| 3851 | ktime_t kt = ktime_get_real(); |
| 3852 | sock_write_timestamp(sk, kt); |
| 3853 | ts = ktime_to_timespec64(kt); |
| 3854 | } |
| 3855 | |
| 3856 | if (timeval) |
| 3857 | ts.tv_nsec /= 1000; |
| 3858 | |
| 3859 | #ifdef CONFIG_COMPAT_32BIT_TIME |
| 3860 | if (time32) |
| 3861 | return put_old_timespec32(&ts, userstamp); |
| 3862 | #endif |
| 3863 | #ifdef CONFIG_SPARC64 |
| 3864 | /* beware of padding in sparc64 timeval */ |
| 3865 | if (timeval && !in_compat_syscall()) { |
| 3866 | struct __kernel_old_timeval __user tv = { |
| 3867 | .tv_sec = ts.tv_sec, |
| 3868 | .tv_usec = ts.tv_nsec, |
| 3869 | }; |
| 3870 | if (copy_to_user(userstamp, &tv, sizeof(tv))) |
| 3871 | return -EFAULT; |
| 3872 | return 0; |
| 3873 | } |
| 3874 | #endif |
| 3875 | return put_timespec64(&ts, userstamp);
| 3876 | } |
| 3877 | EXPORT_SYMBOL(sock_gettstamp); |
| 3878 | |
| 3879 | void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) |
| 3880 | { |
| 3881 | if (!sock_flag(sk, flag)) { |
| 3882 | unsigned long previous_flags = sk->sk_flags; |
| 3883 | |
| 3884 | sock_set_flag(sk, flag); |
| 3885 | /* |
| 3886 | * we just set one of the two flags which require net |
| 3887 | * time stamping, but time stamping might have been on |
| 3888 | * already because of the other one |
| 3889 | */ |
| 3890 | if (sock_needs_netstamp(sk) && |
| 3891 | !(previous_flags & SK_FLAGS_TIMESTAMP)) |
| 3892 | net_enable_timestamp(); |
| 3893 | } |
| 3894 | } |
| 3895 | |
| 3896 | int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, |
| 3897 | int level, int type) |
| 3898 | { |
| 3899 | struct sock_extended_err ee; |
| 3900 | struct sk_buff *skb; |
| 3901 | int copied, err; |
| 3902 | |
| 3903 | err = -EAGAIN; |
| 3904 | skb = sock_dequeue_err_skb(sk); |
| 3905 | if (skb == NULL) |
| 3906 | goto out; |
| 3907 | |
| 3908 | copied = skb->len; |
| 3909 | if (copied > len) { |
| 3910 | msg->msg_flags |= MSG_TRUNC; |
| 3911 | copied = len; |
| 3912 | } |
| 3913 | err = skb_copy_datagram_msg(skb, 0, msg, copied);
| 3914 | if (err) |
| 3915 | goto out_free_skb; |
| 3916 | |
| 3917 | sock_recv_timestamp(msg, sk, skb); |
| 3918 | |
| 3919 | /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ |
| 3920 | ee = SKB_EXT_ERR(skb)->ee; |
| 3921 | put_cmsg(msg, level, type, sizeof(ee), &ee);
| 3922 | |
| 3923 | msg->msg_flags |= MSG_ERRQUEUE; |
| 3924 | err = copied; |
| 3925 | |
| 3926 | out_free_skb: |
| 3927 | kfree_skb(skb); |
| 3928 | out: |
| 3929 | return err; |
| 3930 | } |
| 3931 | EXPORT_SYMBOL(sock_recv_errqueue); |
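|      |
|      | /* Illustrative sketch (editor's addition): recvmsg() implementations divert
|      |  * MSG_ERRQUEUE requests here before touching the normal receive queue;
|      |  * af_packet, for instance, passes SOL_PACKET/PACKET_TX_TIMESTAMP as
|      |  * level/type. The function name is hypothetical.
|      |  */
|      | static int __maybe_unused example_recvmsg(struct sock *sk, struct msghdr *msg,
|      |                                           size_t len, int flags,
|      |                                           int level, int type)
|      | {
|      |         if (flags & MSG_ERRQUEUE)
|      |                 return sock_recv_errqueue(sk, msg, len, level, type);
|      |
|      |         /* ... normal receive path elided in this sketch ... */
|      |         return -EAGAIN;
|      | }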
| 3932 | |
| 3933 | /* |
| 3934 | * Get a socket option on a socket.
| 3935 | * |
| 3936 | * FIX: POSIX 1003.1g is very ambiguous here. It states that |
| 3937 | * asynchronous errors should be reported by getsockopt. We assume |
| 3938 | * this means if you specify SO_ERROR (otherwise what is the point of it). |
| 3939 | */ |
| 3940 | int sock_common_getsockopt(struct socket *sock, int level, int optname, |
| 3941 | char __user *optval, int __user *optlen) |
| 3942 | { |
| 3943 | struct sock *sk = sock->sk; |
| 3944 | |
| 3945 | /* IPV6_ADDRFORM can change sk->sk_prot under us. */ |
| 3946 | return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); |
| 3947 | } |
| 3948 | EXPORT_SYMBOL(sock_common_getsockopt); |
| 3949 | |
| 3950 | int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, |
| 3951 | int flags) |
| 3952 | { |
| 3953 | struct sock *sk = sock->sk; |
| 3954 | int addr_len = 0; |
| 3955 | int err; |
| 3956 | |
| 3957 | err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); |
| 3958 | if (err >= 0) |
| 3959 | msg->msg_namelen = addr_len; |
| 3960 | return err; |
| 3961 | } |
| 3962 | EXPORT_SYMBOL(sock_common_recvmsg); |
| 3963 | |
| 3964 | /* |
| 3965 | * Set socket options on an inet socket. |
| 3966 | */ |
| 3967 | int sock_common_setsockopt(struct socket *sock, int level, int optname, |
| 3968 | sockptr_t optval, unsigned int optlen) |
| 3969 | { |
| 3970 | struct sock *sk = sock->sk; |
| 3971 | |
| 3972 | /* IPV6_ADDRFORM can change sk->sk_prot under us. */ |
| 3973 | return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); |
| 3974 | } |
| 3975 | EXPORT_SYMBOL(sock_common_setsockopt); |
| 3976 | |
| 3977 | void sk_common_release(struct sock *sk) |
| 3978 | { |
| 3979 | if (sk->sk_prot->destroy) |
| 3980 | sk->sk_prot->destroy(sk); |
| 3981 | |
| 3982 | /* |
| 3983 | * Observation: when sk_common_release() is called, processes have
| 3984 | * no access to the socket, but the network stack still does.
| 3985 | * Step one, detach it from networking: |
| 3986 | * |
| 3987 | * A. Remove from hash tables. |
| 3988 | */ |
| 3989 | |
| 3990 | sk->sk_prot->unhash(sk); |
| 3991 | |
| 3992 | /* |
| 3993 | * At this point the socket cannot receive new packets, but it is possible
| 3994 | * that some packets are in flight, because some CPU ran the receiver and
| 3995 | * did the hash table lookup before we unhashed the socket. They will reach
| 3996 | * the receive queue and will be purged by the socket destructor.
| 3997 | *
| 3998 | * Also we still have packets pending on the receive queue and, probably,
| 3999 | * our own packets waiting in device queues. sock_destroy will drain the
| 4000 | * receive queue, but transmitted packets will delay socket destruction
| 4001 | * until the last reference is released.
| 4002 | */ |
| 4003 | |
| 4004 | sock_orphan(sk); |
| 4005 | |
| 4006 | xfrm_sk_free_policy(sk); |
| 4007 | |
| 4008 | sock_put(sk); |
| 4009 | } |
| 4010 | EXPORT_SYMBOL(sk_common_release); |
| 4011 | |
| 4012 | void sk_get_meminfo(const struct sock *sk, u32 *mem) |
| 4013 | { |
| 4014 | memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); |
| 4015 | |
| 4016 | mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); |
| 4017 | mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); |
| 4018 | mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); |
| 4019 | mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); |
| 4020 | mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); |
| 4021 | mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); |
| 4022 | mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
| 4023 | mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); |
| 4024 | mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); |
| 4025 | } |
| 4026 | |
| 4027 | #ifdef CONFIG_PROC_FS |
| 4028 | static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); |
| 4029 | |
| 4030 | int sock_prot_inuse_get(struct net *net, struct proto *prot) |
| 4031 | { |
| 4032 | int cpu, idx = prot->inuse_idx; |
| 4033 | int res = 0; |
| 4034 | |
| 4035 | for_each_possible_cpu(cpu) |
| 4036 | res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; |
| 4037 | |
| 4038 | return res >= 0 ? res : 0; |
| 4039 | } |
| 4040 | EXPORT_SYMBOL_GPL(sock_prot_inuse_get); |
| 4041 | |
| 4042 | int sock_inuse_get(struct net *net) |
| 4043 | { |
| 4044 | int cpu, res = 0; |
| 4045 | |
| 4046 | for_each_possible_cpu(cpu) |
| 4047 | res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; |
| 4048 | |
| 4049 | return res; |
| 4050 | }
| 4052 | EXPORT_SYMBOL_GPL(sock_inuse_get); |
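|      |
|      | /* Illustrative sketch (editor's addition): these counters are fed from the
|      |  * protocols' hash()/unhash() callbacks via sock_prot_inuse_add(); the
|      |  * helper name below is hypothetical.
|      |  */
|      | static void __maybe_unused example_account_hashed(struct sock *sk, bool hashed)
|      | {
|      |         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, hashed ? 1 : -1);
|      | }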
| 4053 | |
| 4054 | static int __net_init sock_inuse_init_net(struct net *net) |
| 4055 | { |
| 4056 | net->core.prot_inuse = alloc_percpu(struct prot_inuse); |
| 4057 | if (net->core.prot_inuse == NULL) |
| 4058 | return -ENOMEM; |
| 4059 | return 0; |
| 4060 | } |
| 4061 | |
| 4062 | static void __net_exit sock_inuse_exit_net(struct net *net) |
| 4063 | { |
| 4064 | free_percpu(net->core.prot_inuse);
| 4065 | } |
| 4066 | |
| 4067 | static struct pernet_operations net_inuse_ops = { |
| 4068 | .init = sock_inuse_init_net, |
| 4069 | .exit = sock_inuse_exit_net, |
| 4070 | }; |
| 4071 | |
| 4072 | static __init int net_inuse_init(void) |
| 4073 | { |
| 4074 | if (register_pernet_subsys(&net_inuse_ops)) |
| 4075 | panic("Cannot initialize net inuse counters");
| 4076 | |
| 4077 | return 0; |
| 4078 | } |
| 4079 | |
| 4080 | core_initcall(net_inuse_init); |
| 4081 | |
| 4082 | static int assign_proto_idx(struct proto *prot) |
| 4083 | { |
| 4084 | prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
| 4085 |
| 4086 | if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
| 4087 | pr_err("PROTO_INUSE_NR exhausted\n");
| 4088 | return -ENOSPC;
| 4089 | }
| 4090 |
| 4091 | set_bit(prot->inuse_idx, proto_inuse_idx);
| 4092 | return 0; |
| 4093 | } |
| 4094 | |
| 4095 | static void release_proto_idx(struct proto *prot) |
| 4096 | { |
| 4097 | if (prot->inuse_idx != PROTO_INUSE_NR) |
| 4098 | clear_bit(prot->inuse_idx, proto_inuse_idx);
| 4099 | } |
| 4100 | #else |
| 4101 | static inline int assign_proto_idx(struct proto *prot) |
| 4102 | { |
| 4103 | return 0; |
| 4104 | } |
| 4105 | |
| 4106 | static inline void release_proto_idx(struct proto *prot) |
| 4107 | { |
| 4108 | } |
| 4109 | |
| 4110 | #endif |
| 4111 | |
| 4112 | static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) |
| 4113 | { |
| 4114 | if (!twsk_prot) |
| 4115 | return; |
| 4116 | kfree(twsk_prot->twsk_slab_name);
| 4117 | twsk_prot->twsk_slab_name = NULL;
| 4118 | kmem_cache_destroy(twsk_prot->twsk_slab);
| 4119 | twsk_prot->twsk_slab = NULL; |
| 4120 | } |
| 4121 | |
| 4122 | static int tw_prot_init(const struct proto *prot) |
| 4123 | { |
| 4124 | struct timewait_sock_ops *twsk_prot = prot->twsk_prot; |
| 4125 | |
| 4126 | if (!twsk_prot) |
| 4127 | return 0; |
| 4128 | |
| 4129 | twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
| 4130 | prot->name); |
| 4131 | if (!twsk_prot->twsk_slab_name) |
| 4132 | return -ENOMEM; |
| 4133 | |
| 4134 | twsk_prot->twsk_slab = |
| 4135 | kmem_cache_create(twsk_prot->twsk_slab_name, |
| 4136 | twsk_prot->twsk_obj_size, 0, |
| 4137 | SLAB_ACCOUNT | prot->slab_flags, |
| 4138 | NULL); |
| 4139 | if (!twsk_prot->twsk_slab) { |
| 4140 | pr_crit("%s: Can't create timewait sock SLAB cache!\n",
| 4141 | prot->name); |
| 4142 | return -ENOMEM; |
| 4143 | } |
| 4144 | |
| 4145 | return 0; |
| 4146 | } |
| 4147 | |
| 4148 | static void req_prot_cleanup(struct request_sock_ops *rsk_prot) |
| 4149 | { |
| 4150 | if (!rsk_prot) |
| 4151 | return; |
| 4152 | kfree(rsk_prot->slab_name);
| 4153 | rsk_prot->slab_name = NULL;
| 4154 | kmem_cache_destroy(rsk_prot->slab);
| 4155 | rsk_prot->slab = NULL; |
| 4156 | } |
| 4157 | |
| 4158 | static int req_prot_init(const struct proto *prot) |
| 4159 | { |
| 4160 | struct request_sock_ops *rsk_prot = prot->rsk_prot; |
| 4161 | |
| 4162 | if (!rsk_prot) |
| 4163 | return 0; |
| 4164 | |
| 4165 | rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
| 4166 | prot->name); |
| 4167 | if (!rsk_prot->slab_name) |
| 4168 | return -ENOMEM; |
| 4169 | |
| 4170 | rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, |
| 4171 | rsk_prot->obj_size, 0, |
| 4172 | SLAB_ACCOUNT | prot->slab_flags, |
| 4173 | NULL); |
| 4174 | |
| 4175 | if (!rsk_prot->slab) { |
| 4176 | pr_crit("%s: Can't create request sock SLAB cache!\n",
| 4177 | prot->name); |
| 4178 | return -ENOMEM; |
| 4179 | } |
| 4180 | return 0; |
| 4181 | } |
| 4182 | |
| 4183 | int proto_register(struct proto *prot, int alloc_slab) |
| 4184 | { |
| 4185 | int ret = -ENOBUFS; |
| 4186 | |
| 4187 | if (prot->memory_allocated && !prot->sysctl_mem) { |
| 4188 | pr_err("%s: missing sysctl_mem\n", prot->name);
| 4189 | return -EINVAL;
| 4190 | }
| 4191 | if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
| 4192 | pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
| 4193 | return -EINVAL; |
| 4194 | } |
| 4195 | if (alloc_slab) { |
| 4196 | prot->slab = kmem_cache_create_usercopy(prot->name,
| 4197 | prot->obj_size, 0,
| 4198 | SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
| 4199 | prot->slab_flags,
| 4200 | prot->useroffset, prot->usersize,
| 4201 | NULL);
| 4202 |
| 4203 | if (prot->slab == NULL) {
| 4204 | pr_crit("%s: Can't create sock SLAB cache!\n",
| 4205 | prot->name); |
| 4206 | goto out; |
| 4207 | } |
| 4208 | |
| 4209 | if (req_prot_init(prot)) |
| 4210 | goto out_free_request_sock_slab; |
| 4211 | |
| 4212 | if (tw_prot_init(prot)) |
| 4213 | goto out_free_timewait_sock_slab; |
| 4214 | } |
| 4215 | |
| 4216 | mutex_lock(&proto_list_mutex); |
| 4217 | ret = assign_proto_idx(prot); |
| 4218 | if (ret) { |
| 4219 | mutex_unlock(&proto_list_mutex);
| 4220 | goto out_free_timewait_sock_slab;
| 4221 | }
| 4222 | list_add(&prot->node, &proto_list);
| 4223 | mutex_unlock(&proto_list_mutex);
| 4224 | return ret;
| 4225 |
| 4226 | out_free_timewait_sock_slab:
| 4227 | if (alloc_slab)
| 4228 | tw_prot_cleanup(prot->twsk_prot);
| 4229 | out_free_request_sock_slab: |
| 4230 | if (alloc_slab) { |
| 4231 | req_prot_cleanup(prot->rsk_prot);
| 4232 |
| 4233 | kmem_cache_destroy(prot->slab);
| 4234 | prot->slab = NULL; |
| 4235 | } |
| 4236 | out: |
| 4237 | return ret; |
| 4238 | } |
| 4239 | EXPORT_SYMBOL(proto_register); |
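|      |
|      | /* Illustrative sketch (editor's addition): the minimum a protocol supplies
|      |  * before registering. With alloc_slab == 1 the core creates the sock slab
|      |  * from .obj_size. "example_proto" is hypothetical, and the typical module
|      |  * pairing (error handling elided) is:
|      |  *
|      |  *      proto_register(&example_proto, 1);
|      |  *      ...
|      |  *      proto_unregister(&example_proto);
|      |  */
|      | static struct proto example_proto __maybe_unused = {
|      |         .name     = "EXAMPLE",
|      |         .owner    = THIS_MODULE,
|      |         .obj_size = sizeof(struct sock),
|      | };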
| 4240 | |
| 4241 | void proto_unregister(struct proto *prot) |
| 4242 | { |
| 4243 | mutex_lock(&proto_list_mutex); |
| 4244 | release_proto_idx(prot); |
| 4245 | list_del(&prot->node);
| 4246 | mutex_unlock(&proto_list_mutex);
| 4247 |
| 4248 | kmem_cache_destroy(prot->slab);
| 4249 | prot->slab = NULL;
| 4250 |
| 4251 | req_prot_cleanup(prot->rsk_prot);
| 4252 | tw_prot_cleanup(prot->twsk_prot);
| 4253 | } |
| 4254 | EXPORT_SYMBOL(proto_unregister); |
| 4255 | |
| 4256 | int sock_load_diag_module(int family, int protocol) |
| 4257 | { |
| 4258 | if (!protocol) { |
| 4259 | if (!sock_is_registered(family)) |
| 4260 | return -ENOENT; |
| 4261 | |
| 4262 | return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
| 4263 | NETLINK_SOCK_DIAG, family); |
| 4264 | } |
| 4265 | |
| 4266 | #ifdef CONFIG_INET |
| 4267 | if (family == AF_INET && |
| 4268 | protocol != IPPROTO_RAW && |
| 4269 | protocol < MAX_INET_PROTOS && |
| 4270 | !rcu_access_pointer(inet_protos[protocol])) |
| 4271 | return -ENOENT; |
| 4272 | #endif |
| 4273 | |
| 4274 | return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
| 4275 | NETLINK_SOCK_DIAG, family, protocol); |
| 4276 | } |
| 4277 | EXPORT_SYMBOL(sock_load_diag_module); |
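|      |
|      | /* Illustrative note (editor's addition): sock diag modules satisfy these
|      |  * requests by declaring matching aliases; inet_diag, for instance, carries
|      |  *
|      |  *      MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2);
|      |  *
|      |  * where 2 is AF_INET, so the request_module() calls above resolve to the
|      |  * right module.
|      |  */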
| 4278 | |
| 4279 | #ifdef CONFIG_PROC_FS |
| 4280 | static void *proto_seq_start(struct seq_file *seq, loff_t *pos) |
| 4281 | __acquires(proto_list_mutex) |
| 4282 | { |
| 4283 | mutex_lock(&proto_list_mutex); |
| 4284 | return seq_list_start_head(&proto_list, *pos);
| 4285 | } |
| 4286 | |
| 4287 | static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
| 4288 | { |
| 4289 | return seq_list_next(v, &proto_list, pos);
| 4290 | } |
| 4291 | |
| 4292 | static void proto_seq_stop(struct seq_file *seq, void *v) |
| 4293 | __releases(proto_list_mutex) |
| 4294 | { |
| 4295 | mutex_unlock(&proto_list_mutex);
| 4296 | } |
| 4297 | |
| 4298 | static char proto_method_implemented(const void *method) |
| 4299 | { |
| 4300 | return method == NULL ? 'n' : 'y'; |
| 4301 | } |
| 4302 | static long sock_prot_memory_allocated(struct proto *proto) |
| 4303 | { |
| 4304 | return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
| 4305 | } |
| 4306 | |
| 4307 | static const char *sock_prot_memory_pressure(struct proto *proto) |
| 4308 | { |
| 4309 | return proto->memory_pressure != NULL ?
| 4310 | proto_memory_pressure(proto) ? "yes" : "no" : "NI";
| 4311 | } |
| 4312 | |
| 4313 | static void proto_seq_printf(struct seq_file *seq, struct proto *proto) |
| 4314 | { |
| 4315 | |
| 4316 | seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
| 4317 | "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
| 4318 | proto->name,
| 4319 | proto->obj_size,
| 4320 | sock_prot_inuse_get(seq_file_net(seq), proto),
| 4321 | sock_prot_memory_allocated(proto),
| 4322 | sock_prot_memory_pressure(proto),
| 4323 | proto->max_header,
| 4324 | proto->slab == NULL ? "no" : "yes",
| 4325 | module_name(proto->owner),
| 4326 | proto_method_implemented(proto->close),
| 4327 | proto_method_implemented(proto->connect),
| 4328 | proto_method_implemented(proto->disconnect),
| 4329 | proto_method_implemented(proto->accept),
| 4330 | proto_method_implemented(proto->ioctl),
| 4331 | proto_method_implemented(proto->init),
| 4332 | proto_method_implemented(proto->destroy),
| 4333 | proto_method_implemented(proto->shutdown),
| 4334 | proto_method_implemented(proto->setsockopt),
| 4335 | proto_method_implemented(proto->getsockopt),
| 4336 | proto_method_implemented(proto->sendmsg),
| 4337 | proto_method_implemented(proto->recvmsg),
| 4338 | proto_method_implemented(proto->bind),
| 4339 | proto_method_implemented(proto->backlog_rcv),
| 4340 | proto_method_implemented(proto->hash),
| 4341 | proto_method_implemented(proto->unhash),
| 4342 | proto_method_implemented(proto->get_port),
| 4343 | proto_method_implemented(proto->enter_memory_pressure));
| 4344 | } |
| 4345 | |
| 4346 | static int proto_seq_show(struct seq_file *seq, void *v) |
| 4347 | { |
| 4348 | if (v == &proto_list) |
| 4349 | seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
| 4350 | "protocol",
| 4351 | "size",
| 4352 | "sockets",
| 4353 | "memory",
| 4354 | "press",
| 4355 | "maxhdr",
| 4356 | "slab",
| 4357 | "module",
| 4358 | "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
| 4359 | else |
| 4360 | proto_seq_printf(seq, list_entry(v, struct proto, node)); |
| 4361 | return 0; |
| 4362 | } |
| 4363 | |
| 4364 | static const struct seq_operations proto_seq_ops = { |
| 4365 | .start = proto_seq_start, |
| 4366 | .next = proto_seq_next, |
| 4367 | .stop = proto_seq_stop, |
| 4368 | .show = proto_seq_show, |
| 4369 | }; |
| 4370 | |
| 4371 | static __net_init int proto_init_net(struct net *net) |
| 4372 | { |
| 4373 | if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
| 4374 | sizeof(struct seq_net_private))) |
| 4375 | return -ENOMEM; |
| 4376 | |
| 4377 | return 0; |
| 4378 | } |
| 4379 | |
| 4380 | static __net_exit void proto_exit_net(struct net *net) |
| 4381 | { |
| 4382 | remove_proc_entry("protocols", net->proc_net);
| 4383 | } |
| 4384 | |
| 4385 | |
| 4386 | static __net_initdata struct pernet_operations proto_net_ops = { |
| 4387 | .init = proto_init_net, |
| 4388 | .exit = proto_exit_net, |
| 4389 | }; |
| 4390 | |
| 4391 | static int __init proto_init(void) |
| 4392 | { |
| 4393 | return register_pernet_subsys(&proto_net_ops); |
| 4394 | } |
| 4395 | |
| 4396 | subsys_initcall(proto_init); |
| 4397 | |
| 4398 | #endif /* PROC_FS */ |
| 4399 | |
| 4400 | #ifdef CONFIG_NET_RX_BUSY_POLL |
| 4401 | bool sk_busy_loop_end(void *p, unsigned long start_time) |
| 4402 | { |
| 4403 | struct sock *sk = p; |
| 4404 | |
| 4405 | if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
| 4406 | return true;
| 4407 |
| 4408 | if (sk_is_udp(sk) &&
| 4409 | !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
| 4410 | return true; |
| 4411 | |
| 4412 | return sk_busy_loop_timeout(sk, start_time); |
| 4413 | } |
| 4414 | EXPORT_SYMBOL(sk_busy_loop_end); |
| 4415 | #endif /* CONFIG_NET_RX_BUSY_POLL */ |
| 4416 | |
| 4417 | int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) |
| 4418 | { |
| 4419 | if (!sk->sk_prot->bind_add) |
| 4420 | return -EOPNOTSUPP; |
| 4421 | return sk->sk_prot->bind_add(sk, addr, addr_len); |
| 4422 | } |
| 4423 | EXPORT_SYMBOL(sock_bind_add); |
| 4424 | |
| 4425 | /* Copy 'size' bytes in from userspace and, on success, copy 'size' bytes back to userspace */
| 4426 | int sock_ioctl_inout(struct sock *sk, unsigned int cmd, |
| 4427 | void __user *arg, void *karg, size_t size) |
| 4428 | { |
| 4429 | int ret; |
| 4430 | |
| 4431 | if (copy_from_user(karg, arg, size))
| 4432 | return -EFAULT;
| 4433 |
| 4434 | ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
| 4435 | if (ret)
| 4436 | return ret;
| 4437 |
| 4438 | if (copy_to_user(arg, karg, size))
| 4439 | return -EFAULT; |
| 4440 | |
| 4441 | return 0; |
| 4442 | } |
| 4443 | EXPORT_SYMBOL(sock_ioctl_inout); |
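|      |
|      | /* Illustrative sketch (editor's addition): a family-level dispatcher such
|      |  * as ipmr_sk_ioctl() sizes a kernel bounce buffer per command and lets
|      |  * sock_ioctl_inout() do both copies around sk_prot->ioctl(). The struct
|      |  * and function names are hypothetical.
|      |  */
|      | struct example_ioctl_req {
|      |         int ifindex;
|      |         int flags;
|      | };
|      |
|      | static int __maybe_unused example_dispatch_ioctl(struct sock *sk,
|      |                                                  unsigned int cmd,
|      |                                                  void __user *arg)
|      | {
|      |         struct example_ioctl_req req;
|      |
|      |         return sock_ioctl_inout(sk, cmd, arg, &req, sizeof(req));
|      | }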
| 4444 | |
| 4445 | /* This is the most common ioctl prep function: the result (4 bytes) is
| 4446 | * copied back to userspace if the ioctl() returns successfully, and no
| 4447 | * input argument is copied from userspace.
| 4448 | */ |
| 4449 | static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) |
| 4450 | { |
| 4451 | int ret, karg = 0; |
| 4452 | |
| 4453 | ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); |
| 4454 | if (ret) |
| 4455 | return ret; |
| 4456 | |
| 4457 | return put_user(karg, (int __user *)arg); |
| 4458 | } |
| 4459 | |
| 4460 | /* A wrapper around sock ioctls, which copies the data from userspace |
| 4461 | * (depending on the protocol/ioctl), and copies back the result to userspace. |
| 4462 | * The main motivation for this function is to pass kernel memory to the |
| 4463 | * protocol ioctl callbacks, instead of userspace memory. |
| 4464 | */ |
| 4465 | int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) |
| 4466 | { |
| 4467 | int rc = 1; |
| 4468 | |
| 4469 | if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) |
| 4470 | rc = ipmr_sk_ioctl(sk, cmd, arg); |
| 4471 | else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) |
| 4472 | rc = ip6mr_sk_ioctl(sk, cmd, arg); |
| 4473 | else if (sk_is_phonet(sk)) |
| 4474 | rc = phonet_sk_ioctl(sk, cmd, arg); |
| 4475 | |
| 4476 | /* If ioctl was processed, returns its value */ |
| 4477 | if (rc <= 0) |
| 4478 | return rc; |
| 4479 | |
| 4480 | /* Otherwise call the default handler */ |
| 4481 | return sock_ioctl_out(sk, cmd, arg); |
| 4482 | } |
| 4483 | EXPORT_SYMBOL(sk_ioctl); |
| 4484 | |
| 4485 | static int __init sock_struct_check(void) |
| 4486 | { |
| 4487 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); |
| 4488 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); |
| 4489 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); |
| 4490 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); |
| 4491 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); |
| 4492 | |
| 4493 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); |
| 4494 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); |
| 4495 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); |
| 4496 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); |
| 4497 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); |
| 4498 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); |
| 4499 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); |
| 4500 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); |
| 4501 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); |
| 4502 | |
| 4503 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); |
| 4504 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); |
| 4505 | #ifdef CONFIG_MEMCG |
| 4506 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); |
| 4507 | #endif |
| 4508 | |
| 4509 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); |
| 4510 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); |
| 4511 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); |
| 4512 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); |
| 4513 | |
| 4514 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
| 4516 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); |
| 4517 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); |
| 4518 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); |
| 4519 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); |
| 4520 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); |
| 4521 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); |
| 4522 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); |
| 4523 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); |
| 4524 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); |
| 4525 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); |
| 4526 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); |
| 4527 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); |
| 4528 | |
| 4529 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); |
| 4530 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); |
| 4531 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); |
| 4532 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); |
| 4533 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); |
| 4534 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); |
| 4535 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); |
| 4536 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); |
| 4537 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); |
| 4538 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); |
| 4539 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); |
| 4540 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); |
| 4541 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); |
| 4542 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); |
| 4543 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); |
| 4544 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); |
| 4545 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); |
| 4546 | CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); |
| 4547 | return 0; |
| 4548 | } |
| 4549 | |
| 4550 | core_initcall(sock_struct_check); |
| 4551 | |