namei.c source code [linux/fs/namei.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/fs/namei.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	/*
9	* Some corrections by tytso.
10	*/
11
/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */
17
18	#include <linux/init.h>
19	#include <linux/export.h>
20	#include <linux/slab.h>
21	#include <linux/wordpart.h>
22	#include <linux/fs.h>
23	#include <linux/filelock.h>
24	#include <linux/namei.h>
25	#include <linux/pagemap.h>
26	#include <linux/sched/mm.h>
27	#include <linux/fsnotify.h>
28	#include <linux/personality.h>
29	#include <linux/security.h>
30	#include <linux/syscalls.h>
31	#include <linux/mount.h>
32	#include <linux/audit.h>
33	#include <linux/capability.h>
34	#include <linux/file.h>
35	#include <linux/fcntl.h>
36	#include <linux/device_cgroup.h>
37	#include <linux/fs_struct.h>
38	#include <linux/posix_acl.h>
39	#include <linux/hash.h>
40	#include <linux/bitops.h>
41	#include <linux/init_task.h>
42	#include <linux/uaccess.h>
43
44	#include "internal.h"
45	#include "mount.h"
46
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now
 * replaced with a single function lookup_dentry() that can handle all
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not completely iterative.
 * There is still a significant amount of tail- and mid-recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */
74
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */
91
/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */
97
/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
112	/*
113	* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
114	* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
115	* any extra contention...
116	*/
117
/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
125
126	#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
127
128	static inline void initname(struct filename name, const* char __user *uptr)
129	{
130	name->uptr = uptr;
131	name->aname = NULL;
132	atomic_set(v: &name->refcnt, i: `1`);
133	}
134
135	struct filename *
136	getname_flags(const char __user filename, int* flags)
137	{
138	struct filename *result;
139	char *kname;
140	int len;
141
142	result = audit_reusename(name: filename);
143	if (result)
144	return result;
145
146	result = __getname();
147	if (unlikely(!result))
148	return ERR_PTR(error: -ENOMEM);
149
150	/*
151	* First, try to embed the struct filename inside the names_cache
152	* allocation
153	*/
154	kname = (char *)result->iname;
155	result->name = kname;
156
157	len = strncpy_from_user(dst: kname, src: filename, EMBEDDED_NAME_MAX);
158	/*
159	* Handle both empty path and copy failure in one go.
160	*/
161	if (unlikely(len <= `0`)) {
162	if (unlikely(len < `0`)) {
163	__putname(result);
164	return ERR_PTR(error: len);
165	}
166
167	/ The empty path is special. /
168	if (!(flags & LOOKUP_EMPTY)) {
169	__putname(result);
170	return ERR_PTR(error: -ENOENT);
171	}
172	}
173
174	/*
175	* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
176	* separate struct filename so we can dedicate the entire
177	* names_cache allocation for the pathname, and re-do the copy from
178	* userland.
179	*/
180	if (unlikely(len == EMBEDDED_NAME_MAX)) {
181	const size_t size = offsetof(struct filename, iname[`1`]);
182	kname = (char *)result;
183
184	/*
185	* size is chosen that way we to guarantee that
186	* result->iname[0] is within the same object and that
187	* kname can't be equal to result->iname, no matter what.
188	*/
189	result = kzalloc(size, GFP_KERNEL);
190	if (unlikely(!result)) {
191	__putname(kname);
192	return ERR_PTR(error: -ENOMEM);
193	}
194	result->name = kname;
195	len = strncpy_from_user(dst: kname, src: filename, PATH_MAX);
196	if (unlikely(len < `0`)) {
197	__putname(kname);
198	kfree(objp: result);
199	return ERR_PTR(error: len);
200	}
201	/ The empty path is special. /
202	if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) {
203	__putname(kname);
204	kfree(objp: result);
205	return ERR_PTR(error: -ENOENT);
206	}
207	if (unlikely(len == PATH_MAX)) {
208	__putname(kname);
209	kfree(objp: result);
210	return ERR_PTR(error: -ENAMETOOLONG);
211	}
212	}
213	initname(name: result, uptr: filename);
214	audit_getname(name: result);
215	return result;
216	}
217
218	struct filename getname_uflags(const* char __user filename, int* uflags)
219	{
220	int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : `0`;
221
222	return getname_flags(filename, flags);
223	}
224
225	struct filename __getname_maybe_null(const* char __user *pathname)
226	{
227	struct filename *name;
228	char c;
229
230	/ try to save on allocations; loss on um, though /
231	if (get_user(c, pathname))
232	return ERR_PTR(error: -EFAULT);
233	if (!c)
234	return NULL;
235
236	name = getname_flags(filename: pathname, LOOKUP_EMPTY);
237	if (!IS_ERR(ptr: name) && !(name->name[`0`])) {
238	putname(name);
239	name = NULL;
240	}
241	return name;
242	}
243
244	struct filename getname_kernel(const* char * filename)
245	{
246	struct filename *result;
247	int len = strlen(filename) + `1`;
248
249	result = __getname();
250	if (unlikely(!result))
251	return ERR_PTR(error: -ENOMEM);
252
253	if (len <= EMBEDDED_NAME_MAX) {
254	result->name = (char *)result->iname;
255	} else if (len <= PATH_MAX) {
256	const size_t size = offsetof(struct filename, iname[`1`]);
257	struct filename *tmp;
258
259	tmp = kmalloc(size, GFP_KERNEL);
260	if (unlikely(!tmp)) {
261	__putname(result);
262	return ERR_PTR(error: -ENOMEM);
263	}
264	tmp->name = (char *)result;
265	result = tmp;
266	} else {
267	__putname(result);
268	return ERR_PTR(error: -ENAMETOOLONG);
269	}
270	memcpy((char *)result->name, filename, len);
271	initname(name: result, NULL);
272	audit_getname(name: result);
273	return result;
274	}
275	EXPORT_SYMBOL(getname_kernel);
276
277	void putname(struct filename *name)
278	{
279	int refcnt;
280
281	if (IS_ERR_OR_NULL(ptr: name))
282	return;
283
284	refcnt = atomic_read(v: &name->refcnt);
285	if (unlikely(refcnt != `1`)) {
286	if (WARN_ON_ONCE(!refcnt))
287	return;
288
289	if (!atomic_dec_and_test(v: &name->refcnt))
290	return;
291	}
292
293	if (unlikely(name->name != name->iname)) {
294	__putname(name->name);
295	kfree(objp: name);
296	} else
297	__putname(name);
298	}
299	EXPORT_SYMBOL(putname);
300
301	/**
302	* check_acl - perform ACL permission checking
303	* @idmap: idmap of the mount the inode was found from
304	* @inode: inode to check permissions on
305	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
306	*
307	* This function performs the ACL permission checking. Since this function
308	* retrieve POSIX acls it needs to know whether it is called from a blocking or
309	* non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
310	*
311	* If the inode has been found through an idmapped mount the idmap of
312	* the vfsmount must be passed through @idmap. This function will then take
313	* care to map the inode according to @idmap before checking permissions.
314	* On non-idmapped mounts or if permission checking is to be performed on the
315	* raw inode simply pass @nop_mnt_idmap.
316	*/
317	static int check_acl(struct mnt_idmap *idmap,
318	struct inode inode, int* mask)
319	{
320	#ifdef CONFIG_FS_POSIX_ACL
321	struct posix_acl *acl;
322
323	if (mask & MAY_NOT_BLOCK) {
324	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
325	if (!acl)
326	return -EAGAIN;
327	/ no ->get_inode_acl() calls in RCU mode... /
328	if (is_uncached_acl(acl))
329	return -ECHILD;
330	return posix_acl_permission(idmap, inode, acl, mask);
331	}
332
333	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
334	if (IS_ERR(ptr: acl))
335	return PTR_ERR(ptr: acl);
336	if (acl) {
337	int error = posix_acl_permission(idmap, inode, acl, mask);
338	posix_acl_release(acl);
339	return error;
340	}
341	#endif
342
343	return -EAGAIN;
344	}
345
/*
 * Cheap, optimistic test for "this inode is known to carry no ACL".
 *
 * Only ACL_TYPE_ACCESS is considered, and only the case where the
 * absence of an ACL has already been cached.
 *
 * A true result means there definitely are no ACLs.  A false result
 * is inconclusive: there may still be none (for instance in the
 * is_uncached_acl() case).
 */
static inline bool no_acl_inode(struct inode *inode)
{
#ifndef CONFIG_FS_POSIX_ACL
	/* Without POSIX ACL support there is never anything to find. */
	return true;
#else
	return likely(!READ_ONCE(inode->i_acl));
#endif
}
364
365	/**
366	* acl_permission_check - perform basic UNIX permission checking
367	* @idmap: idmap of the mount the inode was found from
368	* @inode: inode to check permissions on
369	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
370	*
371	* This function performs the basic UNIX permission checking. Since this
372	* function may retrieve POSIX acls it needs to know whether it is called from a
373	* blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
374	*
375	* If the inode has been found through an idmapped mount the idmap of
376	* the vfsmount must be passed through @idmap. This function will then take
377	* care to map the inode according to @idmap before checking permissions.
378	* On non-idmapped mounts or if permission checking is to be performed on the
379	* raw inode simply pass @nop_mnt_idmap.
380	*/
381	static int acl_permission_check(struct mnt_idmap *idmap,
382	struct inode inode, int* mask)
383	{
384	unsigned int mode = inode->i_mode;
385	vfsuid_t vfsuid;
386
387	/*
388	* Common cheap case: everybody has the requested
389	* rights, and there are no ACLs to check. No need
390	* to do any owner/group checks in that case.
391	*
392	* - 'mask&7' is the requested permission bit set
393	* - multiplying by 0111 spreads them out to all of ugo
394	* - '& ~mode' looks for missing inode permission bits
395	* - the '!' is for "no missing permissions"
396	*
397	* After that, we just need to check that there are no
398	* ACL's on the inode - do the 'IS_POSIXACL()' check last
399	* because it will dereference the ->i_sb pointer and we
400	* want to avoid that if at all possible.
401	*/
402	if (!((mask & `7`) * `0111` & ~mode)) {
403	if (no_acl_inode(inode))
404	return `0`;
405	if (!IS_POSIXACL(inode))
406	return `0`;
407	}
408
409	/ Are we the owner? If so, ACL's don't matter /
410	vfsuid = i_uid_into_vfsuid(idmap, inode);
411	if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
412	mask &= `7`;
413	mode >>= `6`;
414	return (mask & ~mode) ? -EACCES : `0`;
415	}
416
417	/ Do we have ACL's? /
418	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
419	int error = check_acl(idmap, inode, mask);
420	if (error != -EAGAIN)
421	return error;
422	}
423
424	/ Only RWX matters for group/other mode bits /
425	mask &= `7`;
426
427	/*
428	* Are the group permissions different from
429	* the other permissions in the bits we care
430	* about? Need to check group ownership if so.
431	*/
432	if (mask & (mode ^ (mode >> `3`))) {
433	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
434	if (vfsgid_in_group_p(vfsgid))
435	mode >>= `3`;
436	}
437
438	/ Bits in 'mode' clear that we require? /
439	return (mask & ~mode) ? -EACCES : `0`;
440	}
441
442	/**
443	* generic_permission - check for access rights on a Posix-like filesystem
444	* @idmap: idmap of the mount the inode was found from
445	* @inode: inode to check access rights for
446	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
447	* %MAY_NOT_BLOCK ...)
448	*
449	* Used to check for read/write/execute permissions on a file.
450	* We use "fsuid" for this, letting us set arbitrary permissions
451	* for filesystem access without changing the "normal" uids which
452	* are used for other things.
453	*
454	* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
455	* request cannot be satisfied (eg. requires blocking or too much complexity).
456	* It would then be called again in ref-walk mode.
457	*
458	* If the inode has been found through an idmapped mount the idmap of
459	* the vfsmount must be passed through @idmap. This function will then take
460	* care to map the inode according to @idmap before checking permissions.
461	* On non-idmapped mounts or if permission checking is to be performed on the
462	* raw inode simply pass @nop_mnt_idmap.
463	*/
464	int generic_permission(struct mnt_idmap idmap, struct* inode *inode,
465	int mask)
466	{
467	int ret;
468
469	/*
470	* Do the basic permission checks.
471	*/
472	ret = acl_permission_check(idmap, inode, mask);
473	if (ret != -EACCES)
474	return ret;
475
476	if (S_ISDIR(inode->i_mode)) {
477	/ DACs are overridable for directories /
478	if (!(mask & MAY_WRITE))
479	if (capable_wrt_inode_uidgid(idmap, inode,
480	CAP_DAC_READ_SEARCH))
481	return `0`;
482	if (capable_wrt_inode_uidgid(idmap, inode,
483	CAP_DAC_OVERRIDE))
484	return `0`;
485	return -EACCES;
486	}
487
488	/*
489	* Searching includes executable on directories, else just read.
490	*/
491	mask &= MAY_READ \| MAY_WRITE \| MAY_EXEC;
492	if (mask == MAY_READ)
493	if (capable_wrt_inode_uidgid(idmap, inode,
494	CAP_DAC_READ_SEARCH))
495	return `0`;
496	/*
497	* Read/write DACs are always overridable.
498	* Executable DACs are overridable when there is
499	* at least one exec bit set.
500	*/
501	if (!(mask & MAY_EXEC) \|\| (inode->i_mode & S_IXUGO))
502	if (capable_wrt_inode_uidgid(idmap, inode,
503	CAP_DAC_OVERRIDE))
504	return `0`;
505
506	return -EACCES;
507	}
508	EXPORT_SYMBOL(generic_permission);
509
510	/**
511	* do_inode_permission - UNIX permission checking
512	* @idmap: idmap of the mount the inode was found from
513	* @inode: inode to check permissions on
514	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
515	*
516	* We _really_ want to just do "generic_permission()" without
517	* even looking at the inode->i_op values. So we keep a cache
518	* flag in inode->i_opflags, that says "this has not special
519	* permission function, use the fast case".
520	*/
521	static inline int do_inode_permission(struct mnt_idmap *idmap,
522	struct inode inode, int* mask)
523	{
524	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
525	if (likely(inode->i_op->permission))
526	return inode->i_op->permission(idmap, inode, mask);
527
528	/ This gets set once for the inode lifetime /
529	spin_lock(lock: &inode->i_lock);
530	inode->i_opflags \|= IOP_FASTPERM;
531	spin_unlock(lock: &inode->i_lock);
532	}
533	return generic_permission(idmap, inode, mask);
534	}
535
536	/**
537	* sb_permission - Check superblock-level permissions
538	* @sb: Superblock of inode to check permission on
539	* @inode: Inode to check permission on
540	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
541	*
542	* Separate out file-system wide checks from inode-specific permission checks.
543	*
544	* Note: lookup_inode_permission_may_exec() does not call here. If you add
545	* MAY_EXEC checks, adjust it.
546	*/
547	static int sb_permission(struct super_block sb, struct* inode inode, int* mask)
548	{
549	if (mask & MAY_WRITE) {
550	umode_t mode = inode->i_mode;
551
552	/ Nobody gets write access to a read-only fs. /
553	if (sb_rdonly(sb) && (S_ISREG(mode) \|\| S_ISDIR(mode) \|\| S_ISLNK(mode)))
554	return -EROFS;
555	}
556	return `0`;
557	}
558
559	/**
560	* inode_permission - Check for access rights to a given inode
561	* @idmap: idmap of the mount the inode was found from
562	* @inode: Inode to check permission on
563	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
564	*
565	* Check for read/write/execute permissions on an inode. We use fs[ug]id for
566	* this, letting us set arbitrary permissions for filesystem access without
567	* changing the "normal" UIDs which are used for other things.
568	*
569	* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
570	*/
571	int inode_permission(struct mnt_idmap *idmap,
572	struct inode inode, int* mask)
573	{
574	int retval;
575
576	retval = sb_permission(sb: inode->i_sb, inode, mask);
577	if (unlikely(retval))
578	return retval;
579
580	if (mask & MAY_WRITE) {
581	/*
582	* Nobody gets write access to an immutable file.
583	*/
584	if (unlikely(IS_IMMUTABLE(inode)))
585	return -EPERM;
586
587	/*
588	* Updating mtime will likely cause i_uid and i_gid to be
589	* written back improperly if their true value is unknown
590	* to the vfs.
591	*/
592	if (unlikely(HAS_UNMAPPED_ID(idmap, inode)))
593	return -EACCES;
594	}
595
596	retval = do_inode_permission(idmap, inode, mask);
597	if (unlikely(retval))
598	return retval;
599
600	retval = devcgroup_inode_permission(inode, mask);
601	if (unlikely(retval))
602	return retval;
603
604	return security_inode_permission(inode, mask);
605	}
606	EXPORT_SYMBOL(inode_permission);
607
608	/*
609	* lookup_inode_permission_may_exec - Check traversal right for given inode
610	*
611	* This is a special case routine for may_lookup() making assumptions specific
612	* to path traversal. Use inode_permission() if you are doing something else.
613	*
614	* Work is shaved off compared to inode_permission() as follows:
615	* - we know for a fact there is no MAY_WRITE to worry about
616	* - it is an invariant the inode is a directory
617	*
618	* Since majority of real-world traversal happens on inodes which grant it for
619	* everyone, we check it upfront and only resort to more expensive work if it
620	* fails.
621	*
622	* Filesystems which have their own ->permission hook and consequently miss out
623	* on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
624	* on their directory inodes.
625	*/
626	static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
627	struct inode inode, int* mask)
628	{
629	/ Lookup already checked this to return -ENOTDIR /
630	VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
631	VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != `0`);
632
633	mask \|= MAY_EXEC;
634
635	if (unlikely(!(inode->i_opflags & (IOP_FASTPERM \| IOP_FASTPERM_MAY_EXEC))))
636	return inode_permission(idmap, inode, mask);
637
638	if (unlikely(((inode->i_mode & `0111`) != `0111`) \|\| !no_acl_inode(inode)))
639	return inode_permission(idmap, inode, mask);
640
641	return security_inode_permission(inode, mask);
642	}
643
644	/**
645	* path_get - get a reference to a path
646	* @path: path to get the reference to
647	*
648	* Given a path increment the reference count to the dentry and the vfsmount.
649	*/
650	void path_get(const struct path *path)
651	{
652	mntget(mnt: path->mnt);
653	dget(dentry: path->dentry);
654	}
655	EXPORT_SYMBOL(path_get);
656
657	/**
658	* path_put - put a reference to a path
659	* @path: path to put the reference to
660	*
661	* Given a path decrement the reference count to the dentry and the vfsmount.
662	*/
663	void path_put(const struct path *path)
664	{
665	dput(path->dentry);
666	mntput(mnt: path->mnt);
667	}
668	EXPORT_SYMBOL(path_put);
669
670	#define EMBEDDED_LEVELS 2
671	struct nameidata {
672	struct path path;
673	struct qstr last;
674	struct path root;
675	struct inode inode; /* path.dentry.d_inode /
676	unsigned int flags, state;
677	unsigned seq, next_seq, m_seq, r_seq;
678	int last_type;
679	unsigned depth;
680	int total_link_count;
681	struct saved {
682	struct path link;
683	struct delayed_call done;
684	const char *name;
685	unsigned seq;
686	} *stack, internal[EMBEDDED_LEVELS];
687	struct filename *name;
688	const char *pathname;
689	struct nameidata *saved;
690	unsigned root_seq;
691	int dfd;
692	vfsuid_t dir_vfsuid;
693	umode_t dir_mode;
694	} __randomize_layout;
695
696	#define ND_ROOT_PRESET 1
697	#define ND_ROOT_GRABBED 2
698	#define ND_JUMPED 4
699
700	static void __set_nameidata(struct nameidata p, int* dfd, struct filename *name)
701	{
702	struct nameidata *old = current->nameidata;
703	p->stack = p->internal;
704	p->depth = `0`;
705	p->dfd = dfd;
706	p->name = name;
707	p->pathname = likely(name) ? name->name : "";
708	p->path.mnt = NULL;
709	p->path.dentry = NULL;
710	p->total_link_count = old ? old->total_link_count : `0`;
711	p->saved = old;
712	current->nameidata = p;
713	}
714
715	static inline void set_nameidata(struct nameidata p, int* dfd, struct filename *name,
716	const struct path *root)
717	{
718	__set_nameidata(p, dfd, name);
719	p->state = `0`;
720	if (unlikely(root)) {
721	p->state = ND_ROOT_PRESET;
722	p->root = *root;
723	}
724	}
725
726	static void restore_nameidata(void)
727	{
728	struct nameidata now = current->nameidata, old = now->saved;
729
730	current->nameidata = old;
731	if (old)
732	old->total_link_count = now->total_link_count;
733	if (now->stack != now->internal)
734	kfree(objp: now->stack);
735	}
736
737	static bool nd_alloc_stack(struct nameidata *nd)
738	{
739	struct saved *p;
740
741	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
742	nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
743	if (unlikely(!p))
744	return false;
745	memcpy(p, nd->internal, sizeof(nd->internal));
746	nd->stack = p;
747	return true;
748	}
749
750	/**
751	* path_connected - Verify that a dentry is below mnt.mnt_root
752	* @mnt: The mountpoint to check.
753	* @dentry: The dentry to check.
754	*
755	* Rename can sometimes move a file or directory outside of a bind
756	* mount, path_connected allows those cases to be detected.
757	*/
758	static bool path_connected(struct vfsmount mnt, struct* dentry *dentry)
759	{
760	struct super_block *sb = mnt->mnt_sb;
761
762	/ Bind mounts can have disconnected paths /
763	if (mnt->mnt_root == sb->s_root)
764	return true;
765
766	return is_subdir(dentry, mnt->mnt_root);
767	}
768
769	static void drop_links(struct nameidata *nd)
770	{
771	int i = nd->depth;
772	while (i--) {
773	struct saved *last = nd->stack + i;
774	do_delayed_call(call: &last->done);
775	clear_delayed_call(call: &last->done);
776	}
777	}
778
779	static void leave_rcu(struct nameidata *nd)
780	{
781	nd->flags &= ~LOOKUP_RCU;
782	nd->seq = nd->next_seq = `0`;
783	rcu_read_unlock();
784	}
785
786	static void terminate_walk(struct nameidata *nd)
787	{
788	if (unlikely(nd->depth))
789	drop_links(nd);
790	if (!(nd->flags & LOOKUP_RCU)) {
791	int i;
792	path_put(&nd->path);
793	for (i = `0`; i < nd->depth; i++)
794	path_put(&nd->stack[i].link);
795	if (nd->state & ND_ROOT_GRABBED) {
796	path_put(&nd->root);
797	nd->state &= ~ND_ROOT_GRABBED;
798	}
799	} else {
800	leave_rcu(nd);
801	}
802	nd->depth = `0`;
803	nd->path.mnt = NULL;
804	nd->path.dentry = NULL;
805	}
806
807	/ path_put is needed afterwards regardless of success or failure /
808	static bool __legitimize_path(struct path path, unsigned* seq, unsigned mseq)
809	{
810	int res = __legitimize_mnt(path->mnt, mseq);
811	if (unlikely(res)) {
812	if (res > `0`)
813	path->mnt = NULL;
814	path->dentry = NULL;
815	return false;
816	}
817	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
818	path->dentry = NULL;
819	return false;
820	}
821	return !read_seqcount_retry(&path->dentry->d_seq, seq);
822	}
823
824	static inline bool legitimize_path(struct nameidata *nd,
825	struct path path, unsigned* seq)
826	{
827	return __legitimize_path(path, seq, mseq: nd->m_seq);
828	}
829
830	static bool legitimize_links(struct nameidata *nd)
831	{
832	int i;
833
834	VFS_BUG_ON(nd->flags & LOOKUP_CACHED);
835
836	for (i = `0`; i < nd->depth; i++) {
837	struct saved *last = nd->stack + i;
838	if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
839	drop_links(nd);
840	nd->depth = i + `1`;
841	return false;
842	}
843	}
844	return true;
845	}
846
847	static bool legitimize_root(struct nameidata *nd)
848	{
849	/ Nothing to do if nd->root is zero or is managed by the VFS user. /
850	if (!nd->root.mnt \|\| (nd->state & ND_ROOT_PRESET))
851	return true;
852	nd->state \|= ND_ROOT_GRABBED;
853	return legitimize_path(nd, path: &nd->root, seq: nd->root_seq);
854	}
855
856	/*
857	* Path walking has 2 modes, rcu-walk and ref-walk (see
858	* Documentation/filesystems/path-lookup.txt). In situations when we can't
859	* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
860	* normal reference counts on dentries and vfsmounts to transition to ref-walk
861	* mode. Refcounts are grabbed at the last known good point before rcu-walk
862	* got stuck, so ref-walk may continue from there. If this is not successful
863	* (eg. a seqcount has changed), then failure is returned and it's up to caller
864	* to restart the path walk from the beginning in ref-walk mode.
865	*/
866
867	/**
868	* try_to_unlazy - try to switch to ref-walk mode.
869	* @nd: nameidata pathwalk data
870	* Returns: true on success, false on failure
871	*
872	* try_to_unlazy attempts to legitimize the current nd->path and nd->root
873	* for ref-walk mode.
874	* Must be called from rcu-walk context.
875	* Nothing should touch nameidata between try_to_unlazy() failure and
876	* terminate_walk().
877	*/
878	static bool try_to_unlazy(struct nameidata *nd)
879	{
880	struct dentry *parent = nd->path.dentry;
881
882	BUG_ON(!(nd->flags & LOOKUP_RCU));
883
884	if (unlikely(nd->flags & LOOKUP_CACHED)) {
885	drop_links(nd);
886	nd->depth = `0`;
887	goto out1;
888	}
889	if (unlikely(nd->depth && !legitimize_links(nd)))
890	goto out1;
891	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
892	goto out;
893	if (unlikely(!legitimize_root(nd)))
894	goto out;
895	leave_rcu(nd);
896	BUG_ON(nd->inode != parent->d_inode);
897	return true;
898
899	out1:
900	nd->path.mnt = NULL;
901	nd->path.dentry = NULL;
902	out:
903	leave_rcu(nd);
904	return false;
905	}
906
907	/**
908	* try_to_unlazy_next - try to switch to ref-walk mode.
909	* @nd: nameidata pathwalk data
910	* @dentry: next dentry to step into
911	* Returns: true on success, false on failure
912	*
913	* Similar to try_to_unlazy(), but here we have the next dentry already
914	* picked by rcu-walk and want to legitimize that in addition to the current
915	* nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
916	* Nothing should touch nameidata between try_to_unlazy_next() failure and
917	* terminate_walk().
918	*/
919	static bool try_to_unlazy_next(struct nameidata nd, struct* dentry *dentry)
920	{
921	int res;
922	BUG_ON(!(nd->flags & LOOKUP_RCU));
923
924	if (unlikely(nd->flags & LOOKUP_CACHED)) {
925	drop_links(nd);
926	nd->depth = `0`;
927	goto out2;
928	}
929	if (unlikely(nd->depth && !legitimize_links(nd)))
930	goto out2;
931	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
932	if (unlikely(res)) {
933	if (res > `0`)
934	goto out2;
935	goto out1;
936	}
937	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
938	goto out1;
939
940	/*
941	* We need to move both the parent and the dentry from the RCU domain
942	* to be properly refcounted. And the sequence number in the dentry
943	* validates both dentry counters, since we checked the sequence
944	* number of the parent after we got the child sequence number. So we
945	* know the parent must still be valid if the child sequence number is
946	*/
947	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
948	goto out;
949	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
950	goto out_dput;
951	/*
952	* Sequence counts matched. Now make sure that the root is
953	* still valid and get it if required.
954	*/
955	if (unlikely(!legitimize_root(nd)))
956	goto out_dput;
957	leave_rcu(nd);
958	return true;
959
960	out2:
961	nd->path.mnt = NULL;
962	out1:
963	nd->path.dentry = NULL;
964	out:
965	leave_rcu(nd);
966	return false;
967	out_dput:
968	leave_rcu(nd);
969	dput(dentry);
970	return false;
971	}
972
973	static inline int d_revalidate(struct inode dir, const* struct qstr *name,
974	struct dentry dentry, unsigned* int flags)
975	{
976	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
977	return dentry->d_op->d_revalidate(dir, name, dentry, flags);
978	else
979	return `1`;
980	}
981
982	/**
983	* complete_walk - successful completion of path walk
984	* @nd: pointer nameidata
985	*
986	* If we had been in RCU mode, drop out of it and legitimize nd->path.
987	* Revalidate the final result, unless we'd already done that during
988	* the path walk or the filesystem doesn't ask for it. Return 0 on
989	* success, -error on failure. In case of failure caller does not
990	* need to drop nd->path.
991	*/
992	static int complete_walk(struct nameidata *nd)
993	{
994	struct dentry *dentry = nd->path.dentry;
995	int status;
996
997	if (nd->flags & LOOKUP_RCU) {
998	/*
999	* We don't want to zero nd->root for scoped-lookups or
1000	* externally-managed nd->root.
1001	*/
1002	if (likely(!(nd->state & ND_ROOT_PRESET)))
1003	if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
1004	nd->root.mnt = NULL;
1005	nd->flags &= ~LOOKUP_CACHED;
1006	if (!try_to_unlazy(nd))
1007	return -ECHILD;
1008	}
1009
1010	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
1011	/*
1012	* While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
1013	* ever step outside the root during lookup" and should already
1014	* be guaranteed by the rest of namei, we want to avoid a namei
1015	* BUG resulting in userspace being given a path that was not
1016	* scoped within the root at some point during the lookup.
1017	*
1018	* So, do a final sanity-check to make sure that in the
1019	* worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
1020	* we won't silently return an fd completely outside of the
1021	* requested root to userspace.
1022	*
1023	* Userspace could move the path outside the root after this
1024	* check, but as discussed elsewhere this is not a concern (the
1025	* resolved file was inside the root at some point).
1026	*/
1027	if (!path_is_under(&nd->path, &nd->root))
1028	return -EXDEV;
1029	}
1030
1031	if (likely(!(nd->state & ND_JUMPED)))
1032	return `0`;
1033
1034	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
1035	return `0`;
1036
1037	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
1038	if (status > `0`)
1039	return `0`;
1040
1041	if (!status)
1042	status = -ESTALE;
1043
1044	return status;
1045	}
1046
1047	static int set_root(struct nameidata *nd)
1048	{
1049	struct fs_struct *fs = current->fs;
1050
1051	/*
1052	* Jumping to the real root in a scoped-lookup is a BUG in namei, but we
1053	* still have to ensure it doesn't happen because it will cause a breakout
1054	* from the dirfd.
1055	*/
1056	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
1057	return -ENOTRECOVERABLE;
1058
1059	if (nd->flags & LOOKUP_RCU) {
1060	unsigned seq;
1061
1062	do {
1063	seq = read_seqbegin(sl: &fs->seq);
1064	nd->root = fs->root;
1065	nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
1066	} while (read_seqretry(sl: &fs->seq, start: seq));
1067	} else {
1068	get_fs_root(fs, root: &nd->root);
1069	nd->state \|= ND_ROOT_GRABBED;
1070	}
1071	return `0`;
1072	}
1073
1074	static int nd_jump_root(struct nameidata *nd)
1075	{
1076	if (unlikely(nd->flags & LOOKUP_BENEATH))
1077	return -EXDEV;
1078	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
1079	/ Absolute path arguments to path_init() are allowed. /
1080	if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
1081	return -EXDEV;
1082	}
1083	if (!nd->root.mnt) {
1084	int error = set_root(nd);
1085	if (unlikely(error))
1086	return error;
1087	}
1088	if (nd->flags & LOOKUP_RCU) {
1089	struct dentry *d;
1090	nd->path = nd->root;
1091	d = nd->path.dentry;
1092	nd->inode = d->d_inode;
1093	nd->seq = nd->root_seq;
1094	if (read_seqcount_retry(&d->d_seq, nd->seq))
1095	return -ECHILD;
1096	} else {
1097	path_put(&nd->path);
1098	nd->path = nd->root;
1099	path_get(&nd->path);
1100	nd->inode = nd->path.dentry->d_inode;
1101	}
1102	nd->state \|= ND_JUMPED;
1103	return `0`;
1104	}
1105
1106	/*
1107	* Helper to directly jump to a known parsed path from ->get_link,
1108	* caller must have taken a reference to path beforehand.
1109	*/
1110	int nd_jump_link(const struct path *path)
1111	{
1112	int error = -ELOOP;
1113	struct nameidata *nd = current->nameidata;
1114
1115	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
1116	goto err;
1117
1118	error = -EXDEV;
1119	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
1120	if (nd->path.mnt != path->mnt)
1121	goto err;
1122	}
1123	/ Not currently safe for scoped-lookups. /
1124	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
1125	goto err;
1126
1127	path_put(&nd->path);
1128	nd->path = *path;
1129	nd->inode = nd->path.dentry->d_inode;
1130	nd->state \|= ND_JUMPED;
1131	return `0`;
1132
1133	err:
1134	path_put(path);
1135	return error;
1136	}
1137
1138	static inline void put_link(struct nameidata *nd)
1139	{
1140	struct saved *last = nd->stack + --nd->depth;
1141	do_delayed_call(call: &last->done);
1142	if (!(nd->flags & LOOKUP_RCU))
1143	path_put(&last->link);
1144	}
1145
1146	static int sysctl_protected_symlinks __read_mostly;
1147	static int sysctl_protected_hardlinks __read_mostly;
1148	static int sysctl_protected_fifos __read_mostly;
1149	static int sysctl_protected_regular __read_mostly;
1150
1151	#ifdef CONFIG_SYSCTL
1152	static const struct ctl_table namei_sysctls[] = {
1153	{
1154	.procname = "protected_symlinks",
1155	.data = &sysctl_protected_symlinks,
1156	.maxlen = sizeof(int),
1157	.mode = `0644`,
1158	.proc_handler = proc_dointvec_minmax,
1159	.extra1 = SYSCTL_ZERO,
1160	.extra2 = SYSCTL_ONE,
1161	},
1162	{
1163	.procname = "protected_hardlinks",
1164	.data = &sysctl_protected_hardlinks,
1165	.maxlen = sizeof(int),
1166	.mode = `0644`,
1167	.proc_handler = proc_dointvec_minmax,
1168	.extra1 = SYSCTL_ZERO,
1169	.extra2 = SYSCTL_ONE,
1170	},
1171	{
1172	.procname = "protected_fifos",
1173	.data = &sysctl_protected_fifos,
1174	.maxlen = sizeof(int),
1175	.mode = `0644`,
1176	.proc_handler = proc_dointvec_minmax,
1177	.extra1 = SYSCTL_ZERO,
1178	.extra2 = SYSCTL_TWO,
1179	},
1180	{
1181	.procname = "protected_regular",
1182	.data = &sysctl_protected_regular,
1183	.maxlen = sizeof(int),
1184	.mode = `0644`,
1185	.proc_handler = proc_dointvec_minmax,
1186	.extra1 = SYSCTL_ZERO,
1187	.extra2 = SYSCTL_TWO,
1188	},
1189	};
1190
1191	static int __init init_fs_namei_sysctls(void)
1192	{
1193	register_sysctl_init("fs", namei_sysctls);
1194	return `0`;
1195	}
1196	fs_initcall(init_fs_namei_sysctls);
1197
1198	#endif /* CONFIG_SYSCTL */
1199
1200	/**
1201	* may_follow_link - Check symlink following for unsafe situations
1202	* @nd: nameidata pathwalk data
1203	* @inode: Used for idmapping.
1204	*
1205	* In the case of the sysctl_protected_symlinks sysctl being enabled,
1206	* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1207	* in a sticky world-writable directory. This is to protect privileged
1208	* processes from failing races against path names that may change out
1209	* from under them by way of other users creating malicious symlinks.
1210	* It will permit symlinks to be followed only when outside a sticky
1211	* world-writable directory, or when the uid of the symlink and follower
1212	* match, or when the directory owner matches the symlink's owner.
1213	*
1214	* Returns 0 if following the symlink is allowed, -ve on error.
1215	*/
1216	static inline int may_follow_link(struct nameidata nd, const* struct inode *inode)
1217	{
1218	struct mnt_idmap *idmap;
1219	vfsuid_t vfsuid;
1220
1221	if (!sysctl_protected_symlinks)
1222	return `0`;
1223
1224	idmap = mnt_idmap(mnt: nd->path.mnt);
1225	vfsuid = i_uid_into_vfsuid(idmap, inode);
1226	/ Allowed if owner and follower match. /
1227	if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
1228	return `0`;
1229
1230	/ Allowed if parent directory not sticky and world-writable. /
1231	if ((nd->dir_mode & (S_ISVTX\|S_IWOTH)) != (S_ISVTX\|S_IWOTH))
1232	return `0`;
1233
1234	/ Allowed if parent directory and link owner match. /
1235	if (vfsuid_valid(uid: nd->dir_vfsuid) && vfsuid_eq(left: nd->dir_vfsuid, right: vfsuid))
1236	return `0`;
1237
1238	if (nd->flags & LOOKUP_RCU)
1239	return -ECHILD;
1240
1241	audit_inode(name: nd->name, dentry: nd->stack[`0`].link.dentry, aflags: `0`);
1242	audit_log_path_denied(AUDIT_ANOM_LINK, operation: "follow_link");
1243	return -EACCES;
1244	}
1245
1246	/**
1247	* safe_hardlink_source - Check for safe hardlink conditions
1248	* @idmap: idmap of the mount the inode was found from
1249	* @inode: the source inode to hardlink from
1250	*
1251	* Return false if at least one of the following conditions:
1252	* - inode is not a regular file
1253	* - inode is setuid
1254	* - inode is setgid and group-exec
1255	* - access failure for read and write
1256	*
1257	* Otherwise returns true.
1258	*/
1259	static bool safe_hardlink_source(struct mnt_idmap *idmap,
1260	struct inode *inode)
1261	{
1262	umode_t mode = inode->i_mode;
1263
1264	/ Special files should not get pinned to the filesystem. /
1265	if (!S_ISREG(mode))
1266	return false;
1267
1268	/ Setuid files should not get pinned to the filesystem. /
1269	if (mode & S_ISUID)
1270	return false;
1271
1272	/ Executable setgid files should not get pinned to the filesystem. /
1273	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP))
1274	return false;
1275
1276	/ Hardlinking to unreadable or unwritable sources is dangerous. /
1277	if (inode_permission(idmap, inode, MAY_READ \| MAY_WRITE))
1278	return false;
1279
1280	return true;
1281	}
1282
1283	/**
1284	* may_linkat - Check permissions for creating a hardlink
1285	* @idmap: idmap of the mount the inode was found from
1286	* @link: the source to hardlink from
1287	*
1288	* Block hardlink when all of:
1289	* - sysctl_protected_hardlinks enabled
1290	* - fsuid does not match inode
1291	* - hardlink source is unsafe (see safe_hardlink_source() above)
1292	* - not CAP_FOWNER in a namespace with the inode owner uid mapped
1293	*
1294	* If the inode has been found through an idmapped mount the idmap of
1295	* the vfsmount must be passed through @idmap. This function will then take
1296	* care to map the inode according to @idmap before checking permissions.
1297	* On non-idmapped mounts or if permission checking is to be performed on the
1298	* raw inode simply pass @nop_mnt_idmap.
1299	*
1300	* Returns 0 if successful, -ve on error.
1301	*/
1302	int may_linkat(struct mnt_idmap idmap, const* struct path *link)
1303	{
1304	struct inode *inode = link->dentry->d_inode;
1305
1306	/ Inode writeback is not safe when the uid or gid are invalid. /
1307	if (!vfsuid_valid(uid: i_uid_into_vfsuid(idmap, inode)) \|\|
1308	!vfsgid_valid(gid: i_gid_into_vfsgid(idmap, inode)))
1309	return -EOVERFLOW;
1310
1311	if (!sysctl_protected_hardlinks)
1312	return `0`;
1313
1314	/ Source inode owner (or CAP_FOWNER) can hardlink all they like,*
1315	* otherwise, it must be a safe source.
1316	*/
1317	if (safe_hardlink_source(idmap, inode) \|\|
1318	inode_owner_or_capable(idmap, inode))
1319	return `0`;
1320
1321	audit_log_path_denied(AUDIT_ANOM_LINK, operation: "linkat");
1322	return -EPERM;
1323	}
1324
1325	/**
1326	* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1327	* should be allowed, or not, on files that already
1328	* exist.
1329	* @idmap: idmap of the mount the inode was found from
1330	* @nd: nameidata pathwalk data
1331	* @inode: the inode of the file to open
1332	*
1333	* Block an O_CREAT open of a FIFO (or a regular file) when:
1334	* - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1335	* - the file already exists
1336	* - we are in a sticky directory
1337	* - we don't own the file
1338	* - the owner of the directory doesn't own the file
1339	* - the directory is world writable
1340	* If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1341	* the directory doesn't have to be world writable: being group writable will
1342	* be enough.
1343	*
1344	* If the inode has been found through an idmapped mount the idmap of
1345	* the vfsmount must be passed through @idmap. This function will then take
1346	* care to map the inode according to @idmap before checking permissions.
1347	* On non-idmapped mounts or if permission checking is to be performed on the
1348	* raw inode simply pass @nop_mnt_idmap.
1349	*
1350	* Returns 0 if the open is allowed, -ve on error.
1351	*/
1352	static int may_create_in_sticky(struct mnt_idmap idmap, struct* nameidata *nd,
1353	struct inode *const inode)
1354	{
1355	umode_t dir_mode = nd->dir_mode;
1356	vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid;
1357
1358	if (likely(!(dir_mode & S_ISVTX)))
1359	return `0`;
1360
1361	if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
1362	return `0`;
1363
1364	if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
1365	return `0`;
1366
1367	i_vfsuid = i_uid_into_vfsuid(idmap, inode);
1368
1369	if (vfsuid_eq(left: i_vfsuid, right: dir_vfsuid))
1370	return `0`;
1371
1372	if (vfsuid_eq_kuid(vfsuid: i_vfsuid, current_fsuid()))
1373	return `0`;
1374
1375	if (likely(dir_mode & `0002`)) {
1376	audit_log_path_denied(AUDIT_ANOM_CREAT, operation: "sticky_create");
1377	return -EACCES;
1378	}
1379
1380	if (dir_mode & `0020`) {
1381	if (sysctl_protected_fifos >= `2` && S_ISFIFO(inode->i_mode)) {
1382	audit_log_path_denied(AUDIT_ANOM_CREAT,
1383	operation: "sticky_create_fifo");
1384	return -EACCES;
1385	}
1386
1387	if (sysctl_protected_regular >= `2` && S_ISREG(inode->i_mode)) {
1388	audit_log_path_denied(AUDIT_ANOM_CREAT,
1389	operation: "sticky_create_regular");
1390	return -EACCES;
1391	}
1392	}
1393
1394	return `0`;
1395	}
1396
1397	/*
1398	* follow_up - Find the mountpoint of path's vfsmount
1399	*
1400	* Given a path, find the mountpoint of its source file system.
1401	* Replace @path with the path of the mountpoint in the parent mount.
1402	* Up is towards /.
1403	*
1404	* Return 1 if we went up a level and 0 if we were already at the
1405	* root.
1406	*/
1407	int follow_up(struct path *path)
1408	{
1409	struct mount *mnt = real_mount(mnt: path->mnt);
1410	struct mount *parent;
1411	struct dentry *mountpoint;
1412
1413	read_seqlock_excl(sl: &mount_lock);
1414	parent = mnt->mnt_parent;
1415	if (parent == mnt) {
1416	read_sequnlock_excl(sl: &mount_lock);
1417	return `0`;
1418	}
1419	mntget(mnt: &parent->mnt);
1420	mountpoint = dget(dentry: mnt->mnt_mountpoint);
1421	read_sequnlock_excl(sl: &mount_lock);
1422	dput(path->dentry);
1423	path->dentry = mountpoint;
1424	mntput(mnt: path->mnt);
1425	path->mnt = &parent->mnt;
1426	return `1`;
1427	}
1428	EXPORT_SYMBOL(follow_up);
1429
1430	static bool choose_mountpoint_rcu(struct mount m, const* struct path *root,
1431	struct path path, unsigned* *seqp)
1432	{
1433	while (mnt_has_parent(mnt: m)) {
1434	struct dentry *mountpoint = m->mnt_mountpoint;
1435
1436	m = m->mnt_parent;
1437	if (unlikely(root->dentry == mountpoint &&
1438	root->mnt == &m->mnt))
1439	break;
1440	if (mountpoint != m->mnt.mnt_root) {
1441	path->mnt = &m->mnt;
1442	path->dentry = mountpoint;
1443	*seqp = read_seqcount_begin(&mountpoint->d_seq);
1444	return true;
1445	}
1446	}
1447	return false;
1448	}
1449
1450	static bool choose_mountpoint(struct mount m, const* struct path *root,
1451	struct path *path)
1452	{
1453	bool found;
1454
1455	rcu_read_lock();
1456	while (`1`) {
1457	unsigned seq, mseq = read_seqbegin(sl: &mount_lock);
1458
1459	found = choose_mountpoint_rcu(m, root, path, seqp: &seq);
1460	if (unlikely(!found)) {
1461	if (!read_seqretry(sl: &mount_lock, start: mseq))
1462	break;
1463	} else {
1464	if (likely(__legitimize_path(path, seq, mseq)))
1465	break;
1466	rcu_read_unlock();
1467	path_put(path);
1468	rcu_read_lock();
1469	}
1470	}
1471	rcu_read_unlock();
1472	return found;
1473	}
1474
1475	/*
1476	* Perform an automount
1477	* - return -EISDIR to tell follow_managed() to stop and return the path we
1478	* were called with.
1479	*/
1480	static int follow_automount(struct path path, int* count, unsigned* lookup_flags)
1481	{
1482	struct dentry *dentry = path->dentry;
1483
1484	/ We don't want to mount if someone's just doing a stat -*
1485	* unless they're stat'ing a directory and appended a '/' to
1486	* the name.
1487	*
1488	* We do, however, want to mount if someone wants to open or
1489	* create a file of any type under the mountpoint, wants to
1490	* traverse through the mountpoint or wants to open the
1491	* mounted directory. Also, autofs may mark negative dentries
1492	* as being automount points. These will need the attentions
1493	* of the daemon to instantiate them before they can be used.
1494	*/
1495	if (!(lookup_flags & (LOOKUP_PARENT \| LOOKUP_DIRECTORY \|
1496	LOOKUP_OPEN \| LOOKUP_CREATE \| LOOKUP_AUTOMOUNT)) &&
1497	dentry->d_inode)
1498	return -EISDIR;
1499
1500	/ No need to trigger automounts if mountpoint crossing is disabled. /
1501	if (lookup_flags & LOOKUP_NO_XDEV)
1502	return -EXDEV;
1503
1504	if (count && (*count)++ >= MAXSYMLINKS)
1505	return -ELOOP;
1506
1507	return finish_automount(dentry->d_op->d_automount(path), path);
1508	}
1509
1510	/*
1511	* mount traversal - out-of-line part. One note on ->d_flags accesses -
1512	* dentries are pinned but not locked here, so negative dentry can go
1513	* positive right under us. Use of smp_load_acquire() provides a barrier
1514	* sufficient for ->d_inode and ->d_flags consistency.
1515	*/
1516	static int __traverse_mounts(struct path path, unsigned* flags, bool *jumped,
1517	int count, unsigned* lookup_flags)
1518	{
1519	struct vfsmount *mnt = path->mnt;
1520	bool need_mntput = false;
1521	int ret = `0`;
1522
1523	while (flags & DCACHE_MANAGED_DENTRY) {
1524	/ Allow the filesystem to manage the transit without i_rwsem*
1525	* being held. */
1526	if (flags & DCACHE_MANAGE_TRANSIT) {
1527	if (lookup_flags & LOOKUP_NO_XDEV) {
1528	ret = -EXDEV;
1529	break;
1530	}
1531	ret = path->dentry->d_op->d_manage(path, false);
1532	flags = smp_load_acquire(&path->dentry->d_flags);
1533	if (ret < `0`)
1534	break;
1535	}
1536
1537	if (flags & DCACHE_MOUNTED) { // something's mounted on it..
1538	struct vfsmount *mounted = lookup_mnt(path);
1539	if (mounted) { // ... in our namespace
1540	dput(path->dentry);
1541	if (need_mntput)
1542	mntput(mnt: path->mnt);
1543	path->mnt = mounted;
1544	path->dentry = dget(dentry: mounted->mnt_root);
1545	// here we know it's positive
1546	flags = path->dentry->d_flags;
1547	need_mntput = true;
1548	if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) {
1549	ret = -EXDEV;
1550	break;
1551	}
1552	continue;
1553	}
1554	}
1555
1556	if (!(flags & DCACHE_NEED_AUTOMOUNT))
1557	break;
1558
1559	// uncovered automount point
1560	ret = follow_automount(path, count, lookup_flags);
1561	flags = smp_load_acquire(&path->dentry->d_flags);
1562	if (ret < `0`)
1563	break;
1564	}
1565
1566	if (ret == -EISDIR)
1567	ret = `0`;
1568	// possible if you race with several mount --move
1569	if (need_mntput && path->mnt == mnt)
1570	mntput(mnt: path->mnt);
1571	if (!ret && unlikely(d_flags_negative(flags)))
1572	ret = -ENOENT;
1573	*jumped = need_mntput;
1574	return ret;
1575	}
1576
1577	static inline int traverse_mounts(struct path path, bool jumped,
1578	int count, unsigned* lookup_flags)
1579	{
1580	unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1581
1582	/ fastpath /
1583	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1584	*jumped = false;
1585	if (unlikely(d_flags_negative(flags)))
1586	return -ENOENT;
1587	return `0`;
1588	}
1589	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1590	}
1591
1592	int follow_down_one(struct path *path)
1593	{
1594	struct vfsmount *mounted;
1595
1596	mounted = lookup_mnt(path);
1597	if (mounted) {
1598	dput(path->dentry);
1599	mntput(mnt: path->mnt);
1600	path->mnt = mounted;
1601	path->dentry = dget(dentry: mounted->mnt_root);
1602	return `1`;
1603	}
1604	return `0`;
1605	}
1606	EXPORT_SYMBOL(follow_down_one);
1607
1608	/*
1609	* Follow down to the covering mount currently visible to userspace. At each
1610	* point, the filesystem owning that dentry may be queried as to whether the
1611	* caller is permitted to proceed or not.
1612	*/
1613	int follow_down(struct path path, unsigned* int flags)
1614	{
1615	struct vfsmount *mnt = path->mnt;
1616	bool jumped;
1617	int ret = traverse_mounts(path, jumped: &jumped, NULL, lookup_flags: flags);
1618
1619	if (path->mnt != mnt)
1620	mntput(mnt);
1621	return ret;
1622	}
1623	EXPORT_SYMBOL(follow_down);
1624
1625	/*
1626	* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
1627	* we meet a managed dentry that would need blocking.
1628	*/
1629	static bool __follow_mount_rcu(struct nameidata nd, struct* path *path)
1630	{
1631	struct dentry *dentry = path->dentry;
1632	unsigned int flags = dentry->d_flags;
1633
1634	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1635	return true;
1636
1637	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1638	return false;
1639
1640	for (;;) {
1641	/*
1642	* Don't forget we might have a non-mountpoint managed dentry
1643	* that wants to block transit.
1644	*/
1645	if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1646	int res = dentry->d_op->d_manage(path, true);
1647	if (res)
1648	return res == -EISDIR;
1649	flags = dentry->d_flags;
1650	}
1651
1652	if (flags & DCACHE_MOUNTED) {
1653	struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1654	if (mounted) {
1655	path->mnt = &mounted->mnt;
1656	dentry = path->dentry = mounted->mnt.mnt_root;
1657	nd->state \|= ND_JUMPED;
1658	nd->next_seq = read_seqcount_begin(&dentry->d_seq);
1659	flags = dentry->d_flags;
1660	// makes sure that non-RCU pathwalk could reach
1661	// this state.
1662	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1663	return false;
1664	continue;
1665	}
1666	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1667	return false;
1668	}
1669	return !(flags & DCACHE_NEED_AUTOMOUNT);
1670	}
1671	}
1672
1673	static inline int handle_mounts(struct nameidata nd, struct* dentry *dentry,
1674	struct path *path)
1675	{
1676	bool jumped;
1677	int ret;
1678
1679	path->mnt = nd->path.mnt;
1680	path->dentry = dentry;
1681	if (nd->flags & LOOKUP_RCU) {
1682	unsigned int seq = nd->next_seq;
1683	if (likely(!d_managed(dentry)))
1684	return `0`;
1685	if (likely(__follow_mount_rcu(nd, path)))
1686	return `0`;
1687	// path and nd->next_seq might've been clobbered*
1688	path->mnt = nd->path.mnt;
1689	path->dentry = dentry;
1690	nd->next_seq = seq;
1691	if (unlikely(!try_to_unlazy_next(nd, dentry)))
1692	return -ECHILD;
1693	}
1694	ret = traverse_mounts(path, jumped: &jumped, count: &nd->total_link_count, lookup_flags: nd->flags);
1695	if (jumped)
1696	nd->state \|= ND_JUMPED;
1697	if (unlikely(ret)) {
1698	dput(path->dentry);
1699	if (path->mnt != nd->path.mnt)
1700	mntput(mnt: path->mnt);
1701	}
1702	return ret;
1703	}
1704
1705	/*
1706	* This looks up the name in dcache and possibly revalidates the found dentry.
1707	* NULL is returned if the dentry does not exist in the cache.
1708	*/
1709	static struct dentry lookup_dcache(const* struct qstr *name,
1710	struct dentry *dir,
1711	unsigned int flags)
1712	{
1713	struct dentry *dentry = d_lookup(dir, name);
1714	if (dentry) {
1715	int error = d_revalidate(dir: dir->d_inode, name, dentry, flags);
1716	if (unlikely(error <= `0`)) {
1717	if (!error)
1718	d_invalidate(dentry);
1719	dput(dentry);
1720	return ERR_PTR(error);
1721	}
1722	}
1723	return dentry;
1724	}
1725
1726	/*
1727	* Parent directory has inode locked exclusive. This is one
1728	* and only case when ->lookup() gets called on non in-lookup
1729	* dentries - as the matter of fact, this only gets called
1730	* when directory is guaranteed to have no in-lookup children
1731	* at all.
1732	* Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
1733	* Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
1734	*/
1735	struct dentry lookup_one_qstr_excl(const* struct qstr *name,
1736	struct dentry base, unsigned* int flags)
1737	{
1738	struct dentry *dentry;
1739	struct dentry *old;
1740	struct inode *dir;
1741
1742	dentry = lookup_dcache(name, dir: base, flags);
1743	if (dentry)
1744	goto found;
1745
1746	/ Don't create child dentry for a dead directory. /
1747	dir = base->d_inode;
1748	if (unlikely(IS_DEADDIR(dir)))
1749	return ERR_PTR(error: -ENOENT);
1750
1751	dentry = d_alloc(base, name);
1752	if (unlikely(!dentry))
1753	return ERR_PTR(error: -ENOMEM);
1754
1755	old = dir->i_op->lookup(dir, dentry, flags);
1756	if (unlikely(old)) {
1757	dput(dentry);
1758	dentry = old;
1759	}
1760	found:
1761	if (IS_ERR(ptr: dentry))
1762	return dentry;
1763	if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
1764	dput(dentry);
1765	return ERR_PTR(error: -ENOENT);
1766	}
1767	if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
1768	dput(dentry);
1769	return ERR_PTR(error: -EEXIST);
1770	}
1771	return dentry;
1772	}
1773	EXPORT_SYMBOL(lookup_one_qstr_excl);
1774
1775	/**
1776	* lookup_fast - do fast lockless (but racy) lookup of a dentry
1777	* @nd: current nameidata
1778	*
1779	* Do a fast, but racy lookup in the dcache for the given dentry, and
1780	* revalidate it. Returns a valid dentry pointer or NULL if one wasn't
1781	* found. On error, an ERR_PTR will be returned.
1782	*
1783	* If this function returns a valid dentry and the walk is no longer
1784	* lazy, the dentry will carry a reference that must later be put. If
1785	* RCU mode is still in force, then this is not the case and the dentry
1786	* must be legitimized before use. If this returns NULL, then the walk
1787	* will no longer be in RCU mode.
1788	*/
1789	static struct dentry lookup_fast(struct* nameidata *nd)
1790	{
1791	struct dentry dentry, parent = nd->path.dentry;
1792	int status = `1`;
1793
1794	/*
1795	* Rename seqlock is not required here because in the off chance
1796	* of a false negative due to a concurrent rename, the caller is
1797	* going to fall back to non-racy lookup.
1798	*/
1799	if (nd->flags & LOOKUP_RCU) {
1800	dentry = __d_lookup_rcu(parent, name: &nd->last, seq: &nd->next_seq);
1801	if (unlikely(!dentry)) {
1802	if (!try_to_unlazy(nd))
1803	return ERR_PTR(error: -ECHILD);
1804	return NULL;
1805	}
1806
1807	/*
1808	* This sequence count validates that the parent had no
1809	* changes while we did the lookup of the dentry above.
1810	*/
1811	if (read_seqcount_retry(&parent->d_seq, nd->seq))
1812	return ERR_PTR(error: -ECHILD);
1813
1814	status = d_revalidate(dir: nd->inode, name: &nd->last, dentry, flags: nd->flags);
1815	if (likely(status > `0`))
1816	return dentry;
1817	if (!try_to_unlazy_next(nd, dentry))
1818	return ERR_PTR(error: -ECHILD);
1819	if (status == -ECHILD)
1820	/ we'd been told to redo it in non-rcu mode /
1821	status = d_revalidate(dir: nd->inode, name: &nd->last,
1822	dentry, flags: nd->flags);
1823	} else {
1824	dentry = __d_lookup(parent, &nd->last);
1825	if (unlikely(!dentry))
1826	return NULL;
1827	status = d_revalidate(dir: nd->inode, name: &nd->last, dentry, flags: nd->flags);
1828	}
1829	if (unlikely(status <= `0`)) {
1830	if (!status)
1831	d_invalidate(dentry);
1832	dput(dentry);
1833	return ERR_PTR(error: status);
1834	}
1835	return dentry;
1836	}
1837
1838	/ Fast lookup failed, do it the slow way /
1839	static struct dentry __lookup_slow(const* struct qstr *name,
1840	struct dentry *dir,
1841	unsigned int flags)
1842	{
1843	struct dentry dentry, old;
1844	struct inode *inode = dir->d_inode;
1845	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1846
1847	/ Don't go there if it's already dead /
1848	if (unlikely(IS_DEADDIR(inode)))
1849	return ERR_PTR(error: -ENOENT);
1850	again:
1851	dentry = d_alloc_parallel(dir, name, &wq);
1852	if (IS_ERR(ptr: dentry))
1853	return dentry;
1854	if (unlikely(!d_in_lookup(dentry))) {
1855	int error = d_revalidate(dir: inode, name, dentry, flags);
1856	if (unlikely(error <= `0`)) {
1857	if (!error) {
1858	d_invalidate(dentry);
1859	dput(dentry);
1860	goto again;
1861	}
1862	dput(dentry);
1863	dentry = ERR_PTR(error);
1864	}
1865	} else {
1866	old = inode->i_op->lookup(inode, dentry, flags);
1867	d_lookup_done(dentry);
1868	if (unlikely(old)) {
1869	dput(dentry);
1870	dentry = old;
1871	}
1872	}
1873	return dentry;
1874	}
1875
1876	static noinline struct dentry lookup_slow(const* struct qstr *name,
1877	struct dentry *dir,
1878	unsigned int flags)
1879	{
1880	struct inode *inode = dir->d_inode;
1881	struct dentry *res;
1882	inode_lock_shared(inode);
1883	res = __lookup_slow(name, dir, flags);
1884	inode_unlock_shared(inode);
1885	return res;
1886	}
1887
1888	static struct dentry lookup_slow_killable(const* struct qstr *name,
1889	struct dentry *dir,
1890	unsigned int flags)
1891	{
1892	struct inode *inode = dir->d_inode;
1893	struct dentry *res;
1894
1895	if (inode_lock_shared_killable(inode))
1896	return ERR_PTR(error: -EINTR);
1897	res = __lookup_slow(name, dir, flags);
1898	inode_unlock_shared(inode);
1899	return res;
1900	}
1901
1902	static inline int may_lookup(struct mnt_idmap *idmap,
1903	struct nameidata *restrict nd)
1904	{
1905	int err, mask;
1906
1907	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : `0`;
1908	err = lookup_inode_permission_may_exec(idmap, inode: nd->inode, mask);
1909	if (likely(!err))
1910	return `0`;
1911
1912	// If we failed, and we weren't in LOOKUP_RCU, it's final
1913	if (!(nd->flags & LOOKUP_RCU))
1914	return err;
1915
1916	// Drop out of RCU mode to make sure it wasn't transient
1917	if (!try_to_unlazy(nd))
1918	return -ECHILD; // redo it all non-lazy
1919
1920	if (err != -ECHILD) // hard error
1921	return err;
1922
1923	return lookup_inode_permission_may_exec(idmap, inode: nd->inode, mask: `0`);
1924	}
1925
1926	static int reserve_stack(struct nameidata nd, struct* path *link)
1927	{
1928	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
1929	return -ELOOP;
1930
1931	if (likely(nd->depth != EMBEDDED_LEVELS))
1932	return `0`;
1933	if (likely(nd->stack != nd->internal))
1934	return `0`;
1935	if (likely(nd_alloc_stack(nd)))
1936	return `0`;
1937
1938	if (nd->flags & LOOKUP_RCU) {
1939	// we need to grab link before we do unlazy. And we can't skip
1940	// unlazy even if we fail to grab the link - cleanup needs it
1941	bool grabbed_link = legitimize_path(nd, path: link, seq: nd->next_seq);
1942
1943	if (!try_to_unlazy(nd) \|\| !grabbed_link)
1944	return -ECHILD;
1945
1946	if (nd_alloc_stack(nd))
1947	return `0`;
1948	}
1949	return -ENOMEM;
1950	}
1951
/* Bit flags passed as the "flags" argument of step_into() and pick_link(). */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1953
1954	static noinline const char pick_link(struct* nameidata nd, struct* path *link,
1955	struct inode inode, int* flags)
1956	{
1957	struct saved *last;
1958	const char *res;
1959	int error;
1960
1961	if (nd->flags & LOOKUP_RCU) {
1962	/ make sure that d_is_symlink from step_into_slowpath() matches the inode /
1963	if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
1964	return ERR_PTR(error: -ECHILD);
1965	} else {
1966	if (link->mnt == nd->path.mnt)
1967	mntget(mnt: link->mnt);
1968	}
1969
1970	error = reserve_stack(nd, link);
1971	if (unlikely(error)) {
1972	if (!(nd->flags & LOOKUP_RCU))
1973	path_put(link);
1974	return ERR_PTR(error);
1975	}
1976	last = nd->stack + nd->depth++;
1977	last->link = *link;
1978	clear_delayed_call(call: &last->done);
1979	last->seq = nd->next_seq;
1980
1981	if (flags & WALK_TRAILING) {
1982	error = may_follow_link(nd, inode);
1983	if (unlikely(error))
1984	return ERR_PTR(error);
1985	}
1986
1987	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) \|\|
1988	unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1989	return ERR_PTR(error: -ELOOP);
1990
1991	if (unlikely(atime_needs_update(&last->link, inode))) {
1992	if (nd->flags & LOOKUP_RCU) {
1993	if (!try_to_unlazy(nd))
1994	return ERR_PTR(error: -ECHILD);
1995	}
1996	touch_atime(&last->link);
1997	cond_resched();
1998	}
1999
2000	error = security_inode_follow_link(dentry: link->dentry, inode,
2001	rcu: nd->flags & LOOKUP_RCU);
2002	if (unlikely(error))
2003	return ERR_PTR(error);
2004
2005	res = READ_ONCE(inode->i_link);
2006	if (!res) {
2007	const char * (get)(struct* dentry , struct* inode *,
2008	struct delayed_call *);
2009	get = inode->i_op->get_link;
2010	if (nd->flags & LOOKUP_RCU) {
2011	res = get(NULL, inode, &last->done);
2012	if (res == ERR_PTR(error: -ECHILD) && try_to_unlazy(nd))
2013	res = get(link->dentry, inode, &last->done);
2014	} else {
2015	res = get(link->dentry, inode, &last->done);
2016	}
2017	if (!res)
2018	goto all_done;
2019	if (IS_ERR(ptr: res))
2020	return res;
2021	}
2022	if (*res == `'/'`) {
2023	error = nd_jump_root(nd);
2024	if (unlikely(error))
2025	return ERR_PTR(error);
2026	while (unlikely(*++res == `'/'`))
2027	;
2028	}
2029	if (*res)
2030	return res;
2031	all_done: // pure jump
2032	put_link(nd);
2033	return NULL;
2034	}
2035
2036	/*
2037	* Do we need to follow links? We _really_ want to be able
2038	* to do this check without having to look at inode->i_op,
2039	* so we keep a cache of "no, this doesn't need follow_link"
2040	* for the common case.
2041	*
2042	* NOTE: dentry must be what nd->next_seq had been sampled from.
2043	*/
2044	static noinline const char step_into_slowpath(struct* nameidata nd, int* flags,
2045	struct dentry *dentry)
2046	{
2047	struct path path;
2048	struct inode *inode;
2049	int err;
2050
2051	err = handle_mounts(nd, dentry, path: &path);
2052	if (unlikely(err < `0`))
2053	return ERR_PTR(error: err);
2054	inode = path.dentry->d_inode;
2055	if (likely(!d_is_symlink(path.dentry)) \|\|
2056	((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) \|\|
2057	(flags & WALK_NOFOLLOW)) {
2058	/ not a symlink or should not follow /
2059	if (nd->flags & LOOKUP_RCU) {
2060	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
2061	return ERR_PTR(error: -ECHILD);
2062	if (unlikely(!inode))
2063	return ERR_PTR(error: -ENOENT);
2064	} else {
2065	dput(nd->path.dentry);
2066	if (nd->path.mnt != path.mnt)
2067	mntput(mnt: nd->path.mnt);
2068	}
2069	nd->path = path;
2070	nd->inode = inode;
2071	nd->seq = nd->next_seq;
2072	return NULL;
2073	}
2074	return pick_link(nd, link: &path, inode, flags);
2075	}
2076
2077	static __always_inline const char step_into(struct* nameidata nd, int* flags,
2078	struct dentry *dentry)
2079	{
2080	/*
2081	* In the common case we are in rcu-walk and traversing over a non-mounted on
2082	* directory (as opposed to e.g., a symlink).
2083	*
2084	* We can handle that and negative entries with the checks below.
2085	*/
2086	if (likely((nd->flags & LOOKUP_RCU) &&
2087	!d_managed(dentry) && !d_is_symlink(dentry))) {
2088	struct inode *inode = dentry->d_inode;
2089	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
2090	return ERR_PTR(error: -ECHILD);
2091	if (unlikely(!inode))
2092	return ERR_PTR(error: -ENOENT);
2093	nd->path.dentry = dentry;
2094	/ nd->path.mnt is retained on purpose /
2095	nd->inode = inode;
2096	nd->seq = nd->next_seq;
2097	return NULL;
2098	}
2099	return step_into_slowpath(nd, flags, dentry);
2100	}
2101
2102	static struct dentry follow_dotdot_rcu(struct* nameidata *nd)
2103	{
2104	struct dentry parent, old;
2105
2106	if (path_equal(path1: &nd->path, path2: &nd->root))
2107	goto in_root;
2108	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
2109	struct path path;
2110	unsigned seq;
2111	if (!choose_mountpoint_rcu(m: real_mount(mnt: nd->path.mnt),
2112	root: &nd->root, path: &path, seqp: &seq))
2113	goto in_root;
2114	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
2115	return ERR_PTR(error: -ECHILD);
2116	nd->path = path;
2117	nd->inode = path.dentry->d_inode;
2118	nd->seq = seq;
2119	// makes sure that non-RCU pathwalk could reach this state
2120	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
2121	return ERR_PTR(error: -ECHILD);
2122	/ we know that mountpoint was pinned /
2123	}
2124	old = nd->path.dentry;
2125	parent = old->d_parent;
2126	nd->next_seq = read_seqcount_begin(&parent->d_seq);
2127	// makes sure that non-RCU pathwalk could reach this state
2128	if (read_seqcount_retry(&old->d_seq, nd->seq))
2129	return ERR_PTR(error: -ECHILD);
2130	if (unlikely(!path_connected(nd->path.mnt, parent)))
2131	return ERR_PTR(error: -ECHILD);
2132	return parent;
2133	in_root:
2134	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
2135	return ERR_PTR(error: -ECHILD);
2136	if (unlikely(nd->flags & LOOKUP_BENEATH))
2137	return ERR_PTR(error: -ECHILD);
2138	nd->next_seq = nd->seq;
2139	return nd->path.dentry;
2140	}
2141
2142	static struct dentry follow_dotdot(struct* nameidata *nd)
2143	{
2144	struct dentry *parent;
2145
2146	if (path_equal(path1: &nd->path, path2: &nd->root))
2147	goto in_root;
2148	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
2149	struct path path;
2150
2151	if (!choose_mountpoint(m: real_mount(mnt: nd->path.mnt),
2152	root: &nd->root, path: &path))
2153	goto in_root;
2154	path_put(&nd->path);
2155	nd->path = path;
2156	nd->inode = path.dentry->d_inode;
2157	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
2158	return ERR_PTR(error: -EXDEV);
2159	}
2160	/ rare case of legitimate dget_parent()... /
2161	parent = dget_parent(dentry: nd->path.dentry);
2162	if (unlikely(!path_connected(nd->path.mnt, parent))) {
2163	dput(parent);
2164	return ERR_PTR(error: -ENOENT);
2165	}
2166	return parent;
2167
2168	in_root:
2169	if (unlikely(nd->flags & LOOKUP_BENEATH))
2170	return ERR_PTR(error: -EXDEV);
2171	return dget(dentry: nd->path.dentry);
2172	}
2173
2174	static const char handle_dots(struct* nameidata nd, int* type)
2175	{
2176	if (type == LAST_DOTDOT) {
2177	const char *error = NULL;
2178	struct dentry *parent;
2179
2180	if (!nd->root.mnt) {
2181	error = ERR_PTR(error: set_root(nd));
2182	if (unlikely(error))
2183	return error;
2184	}
2185	if (nd->flags & LOOKUP_RCU)
2186	parent = follow_dotdot_rcu(nd);
2187	else
2188	parent = follow_dotdot(nd);
2189	if (IS_ERR(ptr: parent))
2190	return ERR_CAST(ptr: parent);
2191	error = step_into(nd, flags: WALK_NOFOLLOW, dentry: parent);
2192	if (unlikely(error))
2193	return error;
2194
2195	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
2196	/*
2197	* If there was a racing rename or mount along our
2198	* path, then we can't be sure that ".." hasn't jumped
2199	* above nd->root (and so userspace should retry or use
2200	* some fallback).
2201	*/
2202	smp_rmb();
2203	if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
2204	return ERR_PTR(error: -EAGAIN);
2205	if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
2206	return ERR_PTR(error: -EAGAIN);
2207	}
2208	}
2209	return NULL;
2210	}
2211
2212	static __always_inline const char walk_component(struct* nameidata nd, int* flags)
2213	{
2214	struct dentry *dentry;
2215	/*
2216	* "." and ".." are special - ".." especially so because it has
2217	* to be able to know about the current root directory and
2218	* parent relationships.
2219	*/
2220	if (unlikely(nd->last_type != LAST_NORM)) {
2221	if (unlikely(nd->depth) && !(flags & WALK_MORE))
2222	put_link(nd);
2223	return handle_dots(nd, type: nd->last_type);
2224	}
2225	dentry = lookup_fast(nd);
2226	if (IS_ERR(ptr: dentry))
2227	return ERR_CAST(ptr: dentry);
2228	if (unlikely(!dentry)) {
2229	dentry = lookup_slow(name: &nd->last, dir: nd->path.dentry, flags: nd->flags);
2230	if (IS_ERR(ptr: dentry))
2231	return ERR_CAST(ptr: dentry);
2232	}
2233	if (unlikely(nd->depth) && !(flags & WALK_MORE))
2234	put_link(nd);
2235	return step_into(nd, flags, dentry);
2236	}
2237
2238	/*
2239	* We can do the critical dentry name comparison and hashing
2240	* operations one word at a time, but we are limited to:
2241	*
2242	* - Architectures with fast unaligned word accesses. We could
2243	* do a "get_unaligned()" if this helps and is sufficiently
2244	* fast.
2245	*
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
2254	*/
2255	#ifdef CONFIG_DCACHE_WORD_ACCESS
2256
2257	#include <asm/word-at-a-time.h>
2258
2259	#ifdef HASH_MIX
2260
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
2262
2263	#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * HASH_MIX(x, y, a) folds the word @a into the two state words @x and @y
 * in place; @x and @y must be modifiable lvalues.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
2298
2299	/*
2300	* Fold two longs into one 32-bit hash value. This must be fast, but
2301	* latency isn't quite as critical, as there is a fair bit of additional
2302	* work done before the hash value is used.
2303	*/
2304	static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2305	{
2306	y ^= x * GOLDEN_RATIO_64;
2307	y *= GOLDEN_RATIO_64;
2308	return y >> `32`;
2309	}
2310
2311	#else /* 32-bit case */
2312
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * 32-bit flavour of HASH_MIX(): folds the word @a into the two state
 * words @x and @y in place; @x and @y must be modifiable lvalues.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
2328
/* 32-bit case: fold the two state words into one hash via __hash_32(). */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}
2334
2335	#endif
2336
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 *
 * @salt seeds the second state word, @name need not be NUL-terminated,
 * and @len is the number of bytes to hash.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* partial final word: keep only the remaining @len bytes */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2363
2364	/ Return the "hash_len" (hash and length) of a null-terminated string /
2365	u64 hashlen_string(const void salt, const* char *name)
2366	{
2367	unsigned long a = `0`, x = `0`, y = (unsigned long)salt;
2368	unsigned long adata, mask, len;
2369	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2370
2371	len = `0`;
2372	goto inside;
2373
2374	do {
2375	HASH_MIX(x, y, a);
2376	len += sizeof(unsigned long);
2377	inside:
2378	a = load_unaligned_zeropad(addr: name+len);
2379	} while (!has_zero(a, bits: &adata, c: &constants));
2380
2381	adata = prep_zero_mask(a, bits: adata, c: &constants);
2382	mask = create_zero_mask(adata);
2383	x ^= a & zero_bytemask(bits: mask);
2384
2385	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2386	}
2387	EXPORT_SYMBOL(hashlen_string);
2388
2389	/*
2390	* Calculate the length and hash of the path component, and
2391	* return the length as the result.
2392	*/
2393	static inline const char hash_name(struct* nameidata *nd,
2394	const char *name,
2395	unsigned long *lastword)
2396	{
2397	unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
2398	unsigned long adata, bdata, mask, len;
2399	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2400
2401	/*
2402	* The first iteration is special, because it can result in
2403	* '.' and '..' and has no mixing other than the final fold.
2404	*/
2405	a = load_unaligned_zeropad(addr: name);
2406	b = a ^ REPEAT_BYTE(`'/'`);
2407	if (has_zero(a, bits: &adata, c: &constants) \| has_zero(a: b, bits: &bdata, c: &constants)) {
2408	adata = prep_zero_mask(a, bits: adata, c: &constants);
2409	bdata = prep_zero_mask(a: b, bits: bdata, c: &constants);
2410	mask = create_zero_mask(adata \| bdata);
2411	a &= zero_bytemask(bits: mask);
2412	*lastword = a;
2413	len = find_zero(mask);
2414	nd->last.hash = fold_hash(x: a, y);
2415	nd->last.len = len;
2416	return name + len;
2417	}
2418
2419	len = `0`;
2420	x = `0`;
2421	do {
2422	HASH_MIX(x, y, a);
2423	len += sizeof(unsigned long);
2424	a = load_unaligned_zeropad(addr: name+len);
2425	b = a ^ REPEAT_BYTE(`'/'`);
2426	} while (!(has_zero(a, bits: &adata, c: &constants) \| has_zero(a: b, bits: &bdata, c: &constants)));
2427
2428	adata = prep_zero_mask(a, bits: adata, c: &constants);
2429	bdata = prep_zero_mask(a: b, bits: bdata, c: &constants);
2430	mask = create_zero_mask(adata \| bdata);
2431	a &= zero_bytemask(bits: mask);
2432	x ^= a;
2433	len += find_zero(mask);
2434	lastword = `0`; // Multi-word components cannot be DOT or DOTDOT*
2435
2436	nd->last.hash = fold_hash(x, y);
2437	nd->last.len = len;
2438	return name + len;
2439	}
2440
/*
 * Note that the 'last' word is always zero-masked, but
 * was loaded as a possibly big-endian word.
 *
 * For big-endian the "." (0x2e) and ".." (0x2e2e) patterns are therefore
 * shifted up into the most significant bytes of the word.
 */
#ifdef __BIG_ENDIAN
  #define LAST_WORD_IS_DOT	(0x2eul << (BITS_PER_LONG-8))
  #define LAST_WORD_IS_DOTDOT	(0x2e2eul << (BITS_PER_LONG-16))
#endif
2449
2450	#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2451
/*
 * Return the hash of a string of known length.
 * Byte-at-a-time version: every one of the @len bytes of @name is fed
 * through partial_name_hash(), starting from init_name_hash(@salt).
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	while (len--)
		hash = partial_name_hash((unsigned char)*name++, hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2461
2462	/ Return the "hash_len" (hash and length) of a null-terminated string /
2463	u64 hashlen_string(const void salt, const* char *name)
2464	{
2465	unsigned long hash = init_name_hash(salt);
2466	unsigned long len = `0`, c;
2467
2468	c = (unsigned char)*name;
2469	while (c) {
2470	len++;
2471	hash = partial_name_hash(c, hash);
2472	c = (unsigned char)name[len];
2473	}
2474	return hashlen_create(end_name_hash(hash), len);
2475	}
2476	EXPORT_SYMBOL(hashlen_string);
2477
2478	/*
2479	* We know there's a real path component here of at least
2480	* one character.
2481	*/
2482	static inline const char hash_name(struct* nameidata nd, const* char name, unsigned* long *lastword)
2483	{
2484	unsigned long hash = init_name_hash(nd->path.dentry);
2485	unsigned long len = `0`, c, last = `0`;
2486
2487	c = (unsigned char)*name;
2488	do {
2489	last = (last << `8`) + c;
2490	len++;
2491	hash = partial_name_hash(c, hash);
2492	c = (unsigned char)name[len];
2493	} while (c && c != `'/'`);
2494
2495	// This is reliable for DOT or DOTDOT, since the component
2496	// cannot contain NUL characters - top bits being zero means
2497	// we cannot have had any other pathnames.
2498	*lastword = last;
2499	nd->last.hash = end_name_hash(hash);
2500	nd->last.len = len;
2501	return name + len;
2502	}
2503
2504	#endif
2505
/*
 * Fallback encodings of "." (0x2e) and ".." (0x2e2e), used whenever the
 * hashing code above did not already define LAST_WORD_IS_DOT.
 */
#ifndef LAST_WORD_IS_DOT
  #define LAST_WORD_IS_DOT	0x2e
  #define LAST_WORD_IS_DOTDOT	0x2e2e
#endif
2510
2511	/*
2512	* Name resolution.
2513	* This is the basic name resolution function, turning a pathname into
2514	* the final dentry. We expect 'base' to be positive and a directory.
2515	*
2516	* Returns 0 and nd will have valid dentry and mnt on success.
2517	* Returns error and drops reference to input namei data on failure.
2518	*/
2519	static int link_path_walk(const char name, struct* nameidata *nd)
2520	{
2521	int depth = `0`; // depth <= nd->depth
2522	int err;
2523
2524	nd->last_type = LAST_ROOT;
2525	nd->flags \|= LOOKUP_PARENT;
2526	if (IS_ERR(ptr: name))
2527	return PTR_ERR(ptr: name);
2528	if (*name == `'/'`) {
2529	do {
2530	name++;
2531	} while (unlikely(*name == `'/'`));
2532	}
2533	if (unlikely(!*name)) {
2534	nd->dir_mode = `0`; // short-circuit the 'hardening' idiocy
2535	return `0`;
2536	}
2537
2538	/ At this point we know we have a real path component. /
2539	for(;;) {
2540	struct mnt_idmap *idmap;
2541	const char *link;
2542	unsigned long lastword;
2543
2544	idmap = mnt_idmap(mnt: nd->path.mnt);
2545	err = may_lookup(idmap, nd);
2546	if (unlikely(err))
2547	return err;
2548
2549	nd->last.name = name;
2550	name = hash_name(nd, name, lastword: &lastword);
2551
2552	switch(lastword) {
2553	case LAST_WORD_IS_DOTDOT:
2554	nd->last_type = LAST_DOTDOT;
2555	nd->state \|= ND_JUMPED;
2556	break;
2557
2558	case LAST_WORD_IS_DOT:
2559	nd->last_type = LAST_DOT;
2560	break;
2561
2562	default:
2563	nd->last_type = LAST_NORM;
2564	nd->state &= ~ND_JUMPED;
2565
2566	struct dentry *parent = nd->path.dentry;
2567	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2568	err = parent->d_op->d_hash(parent, &nd->last);
2569	if (err < `0`)
2570	return err;
2571	}
2572	}
2573
2574	if (!*name)
2575	goto OK;
2576	/*
2577	* If it wasn't NUL, we know it was '/'. Skip that
2578	* slash, and continue until no more slashes.
2579	*/
2580	do {
2581	name++;
2582	} while (unlikely(*name == `'/'`));
2583	if (unlikely(!*name)) {
2584	OK:
2585	/ pathname or trailing symlink, done /
2586	if (likely(!depth)) {
2587	nd->dir_vfsuid = i_uid_into_vfsuid(idmap, inode: nd->inode);
2588	nd->dir_mode = nd->inode->i_mode;
2589	nd->flags &= ~LOOKUP_PARENT;
2590	return `0`;
2591	}
2592	/ last component of nested symlink /
2593	name = nd->stack[--depth].name;
2594	link = walk_component(nd, flags: `0`);
2595	} else {
2596	/ not the last component /
2597	link = walk_component(nd, flags: WALK_MORE);
2598	}
2599	if (unlikely(link)) {
2600	if (IS_ERR(ptr: link))
2601	return PTR_ERR(ptr: link);
2602	/ a symlink to follow /
2603	nd->stack[depth++].name = name;
2604	name = link;
2605	continue;
2606	}
2607	if (unlikely(!d_can_lookup(nd->path.dentry))) {
2608	if (nd->flags & LOOKUP_RCU) {
2609	if (!try_to_unlazy(nd))
2610	return -ECHILD;
2611	}
2612	return -ENOTDIR;
2613	}
2614	}
2615	}
2616
2617	/ must be paired with terminate_walk() /
2618	static const char path_init(struct* nameidata nd, unsigned* flags)
2619	{
2620	int error;
2621	const char *s = nd->pathname;
2622
2623	/ LOOKUP_CACHED requires RCU, ask caller to retry /
2624	if (unlikely((flags & (LOOKUP_RCU \| LOOKUP_CACHED)) == LOOKUP_CACHED))
2625	return ERR_PTR(error: -EAGAIN);
2626
2627	if (unlikely(!*s))
2628	flags &= ~LOOKUP_RCU;
2629	if (flags & LOOKUP_RCU)
2630	rcu_read_lock();
2631	else
2632	nd->seq = nd->next_seq = `0`;
2633
2634	nd->flags = flags;
2635	nd->state \|= ND_JUMPED;
2636
2637	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
2638	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
2639	smp_rmb();
2640
2641	if (unlikely(nd->state & ND_ROOT_PRESET)) {
2642	struct dentry *root = nd->root.dentry;
2643	struct inode *inode = root->d_inode;
2644	if (*s && unlikely(!d_can_lookup(root)))
2645	return ERR_PTR(error: -ENOTDIR);
2646	nd->path = nd->root;
2647	nd->inode = inode;
2648	if (flags & LOOKUP_RCU) {
2649	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2650	nd->root_seq = nd->seq;
2651	} else {
2652	path_get(&nd->path);
2653	}
2654	return s;
2655	}
2656
2657	nd->root.mnt = NULL;
2658
2659	/ Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). /
2660	if (*s == `'/'` && likely(!(flags & LOOKUP_IN_ROOT))) {
2661	error = nd_jump_root(nd);
2662	if (unlikely(error))
2663	return ERR_PTR(error);
2664	return s;
2665	}
2666
2667	/ Relative pathname -- get the starting-point it is relative to. /
2668	if (nd->dfd == AT_FDCWD) {
2669	if (flags & LOOKUP_RCU) {
2670	struct fs_struct *fs = current->fs;
2671	unsigned seq;
2672
2673	do {
2674	seq = read_seqbegin(sl: &fs->seq);
2675	nd->path = fs->pwd;
2676	nd->inode = nd->path.dentry->d_inode;
2677	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2678	} while (read_seqretry(sl: &fs->seq, start: seq));
2679	} else {
2680	get_fs_pwd(current->fs, pwd: &nd->path);
2681	nd->inode = nd->path.dentry->d_inode;
2682	}
2683	} else {
2684	/ Caller must check execute permissions on the starting path component /
2685	CLASS(fd_raw, f)(fd: nd->dfd);
2686	struct dentry *dentry;
2687
2688	if (fd_empty(f))
2689	return ERR_PTR(error: -EBADF);
2690
2691	if (flags & LOOKUP_LINKAT_EMPTY) {
2692	if (fd_file(f)->f_cred != current_cred() &&
2693	!ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
2694	return ERR_PTR(error: -ENOENT);
2695	}
2696
2697	dentry = fd_file(f)->f_path.dentry;
2698
2699	if (*s && unlikely(!d_can_lookup(dentry)))
2700	return ERR_PTR(error: -ENOTDIR);
2701
2702	nd->path = fd_file(f)->f_path;
2703	if (flags & LOOKUP_RCU) {
2704	nd->inode = nd->path.dentry->d_inode;
2705	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2706	} else {
2707	path_get(&nd->path);
2708	nd->inode = nd->path.dentry->d_inode;
2709	}
2710	}
2711
2712	/ For scoped-lookups we need to set the root to the dirfd as well. /
2713	if (unlikely(flags & LOOKUP_IS_SCOPED)) {
2714	nd->root = nd->path;
2715	if (flags & LOOKUP_RCU) {
2716	nd->root_seq = nd->seq;
2717	} else {
2718	path_get(&nd->root);
2719	nd->state \|= ND_ROOT_GRABBED;
2720	}
2721	}
2722	return s;
2723	}
2724
2725	static inline const char lookup_last(struct* nameidata *nd)
2726	{
2727	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2728	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
2729
2730	return walk_component(nd, flags: WALK_TRAILING);
2731	}
2732
2733	static int handle_lookup_down(struct nameidata *nd)
2734	{
2735	if (!(nd->flags & LOOKUP_RCU))
2736	dget(dentry: nd->path.dentry);
2737	nd->next_seq = nd->seq;
2738	return PTR_ERR(ptr: step_into(nd, flags: WALK_NOFOLLOW, dentry: nd->path.dentry));
2739	}
2740
2741	/ Returns 0 and nd will be valid on success; Returns error, otherwise. /
2742	static int path_lookupat(struct nameidata nd, unsigned* flags, struct path *path)
2743	{
2744	const char *s = path_init(nd, flags);
2745	int err;
2746
2747	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(ptr: s)) {
2748	err = handle_lookup_down(nd);
2749	if (unlikely(err < `0`))
2750	s = ERR_PTR(error: err);
2751	}
2752
2753	while (!(err = link_path_walk(name: s, nd)) &&
2754	(s = lookup_last(nd)) != NULL)
2755	;
2756	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
2757	err = handle_lookup_down(nd);
2758	nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
2759	}
2760	if (!err)
2761	err = complete_walk(nd);
2762
2763	if (!err && nd->flags & LOOKUP_DIRECTORY)
2764	if (!d_can_lookup(dentry: nd->path.dentry))
2765	err = -ENOTDIR;
2766	if (!err) {
2767	*path = nd->path;
2768	nd->path.mnt = NULL;
2769	nd->path.dentry = NULL;
2770	}
2771	terminate_walk(nd);
2772	return err;
2773	}
2774
2775	int filename_lookup(int dfd, struct filename name, unsigned* flags,
2776	struct path path, const* struct path *root)
2777	{
2778	int retval;
2779	struct nameidata nd;
2780	if (IS_ERR(ptr: name))
2781	return PTR_ERR(ptr: name);
2782	set_nameidata(p: &nd, dfd, name, root);
2783	retval = path_lookupat(nd: &nd, flags: flags \| LOOKUP_RCU, path);
2784	if (unlikely(retval == -ECHILD))
2785	retval = path_lookupat(nd: &nd, flags, path);
2786	if (unlikely(retval == -ESTALE))
2787	retval = path_lookupat(nd: &nd, flags: flags \| LOOKUP_REVAL, path);
2788
2789	if (likely(!retval))
2790	audit_inode(name, dentry: path->dentry,
2791	aflags: flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : `0`);
2792	restore_nameidata();
2793	return retval;
2794	}
2795
2796	/ Returns 0 and nd will be valid on success; Returns error, otherwise. /
2797	static int path_parentat(struct nameidata nd, unsigned* flags,
2798	struct path *parent)
2799	{
2800	const char *s = path_init(nd, flags);
2801	int err = link_path_walk(name: s, nd);
2802	if (!err)
2803	err = complete_walk(nd);
2804	if (!err) {
2805	*parent = nd->path;
2806	nd->path.mnt = NULL;
2807	nd->path.dentry = NULL;
2808	}
2809	terminate_walk(nd);
2810	return err;
2811	}
2812
2813	/ Note: this does not consume "name" /
2814	static int __filename_parentat(int dfd, struct filename *name,
2815	unsigned int flags, struct path *parent,
2816	struct qstr last, int* *type,
2817	const struct path *root)
2818	{
2819	int retval;
2820	struct nameidata nd;
2821
2822	if (IS_ERR(ptr: name))
2823	return PTR_ERR(ptr: name);
2824	set_nameidata(p: &nd, dfd, name, root);
2825	retval = path_parentat(nd: &nd, flags: flags \| LOOKUP_RCU, parent);
2826	if (unlikely(retval == -ECHILD))
2827	retval = path_parentat(nd: &nd, flags, parent);
2828	if (unlikely(retval == -ESTALE))
2829	retval = path_parentat(nd: &nd, flags: flags \| LOOKUP_REVAL, parent);
2830	if (likely(!retval)) {
2831	*last = nd.last;
2832	*type = nd.last_type;
2833	audit_inode(name, dentry: parent->dentry, AUDIT_INODE_PARENT);
2834	}
2835	restore_nameidata();
2836	return retval;
2837	}
2838
2839	static int filename_parentat(int dfd, struct filename *name,
2840	unsigned int flags, struct path *parent,
2841	struct qstr last, int* *type)
2842	{
2843	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
2844	}
2845
2846	/**
2847	* __start_dirop - begin a create or remove dirop, performing locking and lookup
2848	* @parent: the dentry of the parent in which the operation will occur
2849	* @name: a qstr holding the name within that parent
2850	* @lookup_flags: intent and other lookup flags.
2851	* @state: task state bitmask
2852	*
2853	* The lookup is performed and necessary locks are taken so that, on success,
2854	* the returned dentry can be operated on safely.
2855	* The qstr must already have the hash value calculated.
2856	*
2857	* Returns: a locked dentry, or an error.
2858	*
2859	*/
2860	static struct dentry __start_dirop(struct* dentry parent, struct* qstr *name,
2861	unsigned int lookup_flags,
2862	unsigned int state)
2863	{
2864	struct dentry *dentry;
2865	struct inode *dir = d_inode(dentry: parent);
2866
2867	if (state == TASK_KILLABLE) {
2868	int ret = down_write_killable_nested(sem: &dir->i_rwsem,
2869	subclass: I_MUTEX_PARENT);
2870	if (ret)
2871	return ERR_PTR(error: ret);
2872	} else {
2873	inode_lock_nested(inode: dir, subclass: I_MUTEX_PARENT);
2874	}
2875	dentry = lookup_one_qstr_excl(name, parent, lookup_flags);
2876	if (IS_ERR(ptr: dentry))
2877	inode_unlock(inode: dir);
2878	return dentry;
2879	}
2880
2881	struct dentry start_dirop(struct* dentry parent, struct* qstr *name,
2882	unsigned int lookup_flags)
2883	{
2884	return __start_dirop(parent, name, lookup_flags, TASK_NORMAL);
2885	}
2886
2887	/**
2888	* end_dirop - signal completion of a dirop
2889	* @de: the dentry which was returned by start_dirop or similar.
2890	*
2891	* If the de is an error, nothing happens. Otherwise any lock taken to
2892	* protect the dentry is dropped and the dentry itself is release (dput()).
2893	*/
2894	void end_dirop(struct dentry *de)
2895	{
2896	if (!IS_ERR(ptr: de)) {
2897	inode_unlock(inode: de->d_parent->d_inode);
2898	dput(de);
2899	}
2900	}
2901	EXPORT_SYMBOL(end_dirop);
2902
2903	/ does lookup, returns the object with parent locked /
2904	static struct dentry __start_removing_path(int* dfd, struct filename *name,
2905	struct path *path)
2906	{
2907	struct path parent_path __free(path_put) = {};
2908	struct dentry *d;
2909	struct qstr last;
2910	int type, error;
2911
2912	error = filename_parentat(dfd, name, flags: `0`, parent: &parent_path, last: &last, type: &type);
2913	if (error)
2914	return ERR_PTR(error);
2915	if (unlikely(type != LAST_NORM))
2916	return ERR_PTR(error: -EINVAL);
2917	/ don't fail immediately if it's r/o, at least try to report other errors /
2918	error = mnt_want_write(mnt: parent_path.mnt);
2919	d = start_dirop(parent: parent_path.dentry, name: &last, lookup_flags: `0`);
2920	if (IS_ERR(ptr: d))
2921	goto drop;
2922	if (error)
2923	goto fail;
2924	path->dentry = no_free_ptr(parent_path.dentry);
2925	path->mnt = no_free_ptr(parent_path.mnt);
2926	return d;
2927
2928	fail:
2929	end_dirop(d);
2930	d = ERR_PTR(error);
2931	drop:
2932	if (!error)
2933	mnt_drop_write(mnt: parent_path.mnt);
2934	return d;
2935	}
2936
2937	/**
2938	* kern_path_parent: lookup path returning parent and target
2939	* @name: path name
2940	* @path: path to store parent in
2941	*
2942	* The path @name should end with a normal component, not "." or ".." or "/".
2943	* A lookup is performed and if successful the parent information
2944	* is store in @parent and the dentry is returned.
2945	*
2946	* The dentry maybe negative, the parent will be positive.
2947	*
2948	* Returns: dentry or error.
2949	*/
2950	struct dentry kern_path_parent(const* char name, struct* path *path)
2951	{
2952	struct path parent_path __free(path_put) = {};
2953	struct filename *filename __free(putname) = getname_kernel(name);
2954	struct dentry *d;
2955	struct qstr last;
2956	int type, error;
2957
2958	error = filename_parentat(AT_FDCWD, name: filename, flags: `0`, parent: &parent_path, last: &last, type: &type);
2959	if (error)
2960	return ERR_PTR(error);
2961	if (unlikely(type != LAST_NORM))
2962	return ERR_PTR(error: -EINVAL);
2963
2964	d = lookup_noperm_unlocked(&last, parent_path.dentry);
2965	if (IS_ERR(ptr: d))
2966	return d;
2967	path->dentry = no_free_ptr(parent_path.dentry);
2968	path->mnt = no_free_ptr(parent_path.mnt);
2969	return d;
2970	}
2971
2972	struct dentry start_removing_path(const* char name, struct* path *path)
2973	{
2974	struct filename *filename = getname_kernel(name);
2975	struct dentry *res = __start_removing_path(AT_FDCWD, name: filename, path);
2976
2977	putname(filename);
2978	return res;
2979	}
2980
2981	struct dentry start_removing_user_path_at(int* dfd,
2982	const char __user *name,
2983	struct path *path)
2984	{
2985	struct filename *filename = getname(name);
2986	struct dentry *res = __start_removing_path(dfd, name: filename, path);
2987
2988	putname(filename);
2989	return res;
2990	}
2991	EXPORT_SYMBOL(start_removing_user_path_at);
2992
2993	int kern_path(const char name, unsigned* int flags, struct path *path)
2994	{
2995	struct filename *filename = getname_kernel(name);
2996	int ret = filename_lookup(AT_FDCWD, name: filename, flags, path, NULL);
2997
2998	putname(filename);
2999	return ret;
3000
3001	}
3002	EXPORT_SYMBOL(kern_path);
3003
3004	/**
3005	* vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
3006	* @filename: filename structure
3007	* @flags: lookup flags
3008	* @parent: pointer to struct path to fill
3009	* @last: last component
3010	* @type: type of the last component
3011	* @root: pointer to struct path of the base directory
3012	*/
3013	int vfs_path_parent_lookup(struct filename filename, unsigned* int flags,
3014	struct path parent, struct* qstr last, int* *type,
3015	const struct path *root)
3016	{
3017	return __filename_parentat(AT_FDCWD, name: filename, flags, parent, last,
3018	type, root);
3019	}
3020	EXPORT_SYMBOL(vfs_path_parent_lookup);
3021
3022	/**
3023	* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
3024	* @dentry: pointer to dentry of the base directory
3025	* @mnt: pointer to vfs mount of the base directory
3026	* @name: pointer to file name
3027	* @flags: lookup flags
3028	* @path: pointer to struct path to fill
3029	*/
3030	int vfs_path_lookup(struct dentry dentry, struct* vfsmount *mnt,
3031	const char name, unsigned* int flags,
3032	struct path *path)
3033	{
3034	struct filename *filename;
3035	struct path root = {.mnt = mnt, .dentry = dentry};
3036	int ret;
3037
3038	filename = getname_kernel(name);
3039	/ the first argument of filename_lookup() is ignored with root /
3040	ret = filename_lookup(AT_FDCWD, name: filename, flags, path, root: &root);
3041	putname(filename);
3042	return ret;
3043	}
3044	EXPORT_SYMBOL(vfs_path_lookup);
3045
3046	int lookup_noperm_common(struct qstr qname, struct* dentry *base)
3047	{
3048	const char *name = qname->name;
3049	u32 len = qname->len;
3050
3051	qname->hash = full_name_hash(base, name, len);
3052	if (!len)
3053	return -EACCES;
3054
3055	if (is_dot_dotdot(name, len))
3056	return -EACCES;
3057
3058	while (len--) {
3059	unsigned int c = (const* unsigned char *)name++;
3060	if (c == `'/'` \|\| c == `'\0'`)
3061	return -EACCES;
3062	}
3063	/*
3064	* See if the low-level filesystem might want
3065	* to use its own hash..
3066	*/
3067	if (base->d_flags & DCACHE_OP_HASH) {
3068	int err = base->d_op->d_hash(base, qname);
3069	if (err < `0`)
3070	return err;
3071	}
3072	return `0`;
3073	}
3074
3075	static int lookup_one_common(struct mnt_idmap *idmap,
3076	struct qstr qname, struct* dentry *base)
3077	{
3078	int err;
3079	err = lookup_noperm_common(qname, base);
3080	if (err < `0`)
3081	return err;
3082	return inode_permission(idmap, base->d_inode, MAY_EXEC);
3083	}
3084
/**
 * try_lookup_noperm - filesystem helper to lookup single pathname component
 * @name:	qstr storing pathname component to lookup
 * @base:	base directory to lookup from
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry and if one
 * is found it doesn't try to revalidate it.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.  It does no permission checking.
 *
 * No locks need be held - only a counted reference to @base is needed.
 *
 * Returns: the result of d_lookup(), or an ERR_PTR() if @name fails
 * validation in lookup_noperm_common().
 */
struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base)
{
	int err;

	err = lookup_noperm_common(name, base);
	if (err)
		return ERR_PTR(err);

	return d_lookup(base, name);
}
EXPORT_SYMBOL(try_lookup_noperm);
3111
3112	/**
3113	* lookup_noperm - filesystem helper to lookup single pathname component
3114	* @name: qstr storing pathname component to lookup
3115	* @base: base directory to lookup from
3116	*
3117	* Note that this routine is purely a helper for filesystem usage and should
3118	* not be called by generic code. It does no permission checking.
3119	*
3120	* The caller must hold base->i_rwsem.
3121	*/
3122	struct dentry lookup_noperm(struct* qstr name, struct* dentry *base)
3123	{
3124	struct dentry *dentry;
3125	int err;
3126
3127	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
3128
3129	err = lookup_noperm_common(qname: name, base);
3130	if (err)
3131	return ERR_PTR(error: err);
3132
3133	dentry = lookup_dcache(name, dir: base, flags: `0`);
3134	return dentry ? dentry : __lookup_slow(name, dir: base, flags: `0`);
3135	}
3136	EXPORT_SYMBOL(lookup_noperm);
3137
3138	/**
3139	* lookup_one - lookup single pathname component
3140	* @idmap: idmap of the mount the lookup is performed from
3141	* @name: qstr holding pathname component to lookup
3142	* @base: base directory to lookup from
3143	*
3144	* This can be used for in-kernel filesystem clients such as file servers.
3145	*
3146	* The caller must hold base->i_rwsem.
3147	*/
3148	struct dentry lookup_one(struct* mnt_idmap idmap, struct* qstr *name,
3149	struct dentry *base)
3150	{
3151	struct dentry *dentry;
3152	int err;
3153
3154	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
3155
3156	err = lookup_one_common(idmap, qname: name, base);
3157	if (err)
3158	return ERR_PTR(error: err);
3159
3160	dentry = lookup_dcache(name, dir: base, flags: `0`);
3161	return dentry ? dentry : __lookup_slow(name, dir: base, flags: `0`);
3162	}
3163	EXPORT_SYMBOL(lookup_one);
3164
/**
 * lookup_one_unlocked - lookup single pathname component
 * @idmap:	idmap of the mount the lookup is performed from
 * @name:	qstr holding pathname component to lookup
 * @base:	base directory to lookup from
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * Unlike lookup_one, it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 *
 * Returns: the dentry found in the dcache, otherwise the result of
 * lookup_slow(); an ERR_PTR() if lookup_one_common() fails.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
				   struct dentry *base)
{
	int err;
	struct dentry *ret;

	err = lookup_one_common(idmap, name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow(name, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_one_unlocked);
3192
3193	/**
3194	* lookup_one_positive_killable - lookup single pathname component
3195	* @idmap: idmap of the mount the lookup is performed from
3196	* @name: qstr olding pathname component to lookup
3197	* @base: base directory to lookup from
3198	*
3199	* This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
3200	* known positive or ERR_PTR(). This is what most of the users want.
3201	*
3202	* Note that pinned negative with unlocked parent _can_ become positive at any
3203	* time, so callers of lookup_one_unlocked() need to be very careful; pinned
3204	* positives have >d_inode stable, so this one avoids such problems.
3205	*
3206	* This can be used for in-kernel filesystem clients such as file servers.
3207	*
3208	* It should be called without the parent i_rwsem held, and will take
3209	* the i_rwsem itself if necessary. If a fatal signal is pending or
3210	* delivered, it will return %-EINTR if the lock is needed.
3211	*/
3212	struct dentry lookup_one_positive_killable(struct* mnt_idmap *idmap,
3213	struct qstr *name,
3214	struct dentry *base)
3215	{
3216	int err;
3217	struct dentry *ret;
3218
3219	err = lookup_one_common(idmap, qname: name, base);
3220	if (err)
3221	return ERR_PTR(error: err);
3222
3223	ret = lookup_dcache(name, dir: base, flags: `0`);
3224	if (!ret)
3225	ret = lookup_slow_killable(name, dir: base, flags: `0`);
3226	if (!IS_ERR(ptr: ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
3227	dput(ret);
3228	ret = ERR_PTR(error: -ENOENT);
3229	}
3230	return ret;
3231	}
3232	EXPORT_SYMBOL(lookup_one_positive_killable);
3233
3234	/**
3235	* lookup_one_positive_unlocked - lookup single pathname component
3236	* @idmap: idmap of the mount the lookup is performed from
3237	* @name: qstr holding pathname component to lookup
3238	* @base: base directory to lookup from
3239	*
3240	* This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
3241	* known positive or ERR_PTR(). This is what most of the users want.
3242	*
3243	* Note that pinned negative with unlocked parent _can_ become positive at any
3244	* time, so callers of lookup_one_unlocked() need to be very careful; pinned
3245	* positives have >d_inode stable, so this one avoids such problems.
3246	*
3247	* This can be used for in-kernel filesystem clients such as file servers.
3248	*
3249	* The helper should be called without i_rwsem held.
3250	*/
3251	struct dentry lookup_one_positive_unlocked(struct* mnt_idmap *idmap,
3252	struct qstr *name,
3253	struct dentry *base)
3254	{
3255	struct dentry *ret = lookup_one_unlocked(idmap, name, base);
3256
3257	if (!IS_ERR(ptr: ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
3258	dput(ret);
3259	ret = ERR_PTR(error: -ENOENT);
3260	}
3261	return ret;
3262	}
3263	EXPORT_SYMBOL(lookup_one_positive_unlocked);
3264
/**
 * lookup_noperm_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code. It does no permission checking.
 *
 * Unlike lookup_noperm(), it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 *
 * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already
 * existed.
 *
 * Returns: the dentry found in the dcache, otherwise the result of
 * lookup_slow(); an ERR_PTR() if @name fails validation.
 */
struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base)
{
	struct dentry *ret;
	int err;

	err = lookup_noperm_common(name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow(name, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_noperm_unlocked);
3294
3295	/*
3296	* Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT)
3297	* on negatives. Returns known positive or ERR_PTR(); that's what
3298	* most of the users want. Note that pinned negative with unlocked parent
3299	* _can_ become positive at any time, so callers of lookup_noperm_unlocked()
3300	* need to be very careful; pinned positives have ->d_inode stable, so
3301	* this one avoids such problems.
3302	*/
3303	struct dentry lookup_noperm_positive_unlocked(struct* qstr *name,
3304	struct dentry *base)
3305	{
3306	struct dentry *ret;
3307
3308	ret = lookup_noperm_unlocked(name, base);
3309	if (!IS_ERR(ptr: ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
3310	dput(ret);
3311	ret = ERR_PTR(error: -ENOENT);
3312	}
3313	return ret;
3314	}
3315	EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
3316
3317	/**
3318	* start_creating - prepare to create a given name with permission checking
3319	* @idmap: idmap of the mount
3320	* @parent: directory in which to prepare to create the name
3321	* @name: the name to be created
3322	*
3323	* Locks are taken and a lookup is performed prior to creating
3324	* an object in a directory. Permission checking (MAY_EXEC) is performed
3325	* against @idmap.
3326	*
3327	* If the name already exists, a positive dentry is returned, so
3328	* behaviour is similar to O_CREAT without O_EXCL, which doesn't fail
3329	* with -EEXIST.
3330	*
3331	* Returns: a negative or positive dentry, or an error.
3332	*/
3333	struct dentry start_creating(struct* mnt_idmap idmap, struct* dentry *parent,
3334	struct qstr *name)
3335	{
3336	int err = lookup_one_common(idmap, qname: name, base: parent);
3337
3338	if (err)
3339	return ERR_PTR(error: err);
3340	return start_dirop(parent, name, LOOKUP_CREATE);
3341	}
3342	EXPORT_SYMBOL(start_creating);
3343
/**
 * start_removing - prepare to remove a given name with permission checking
 * @idmap:  idmap of the mount
 * @parent: directory in which to find the name
 * @name:   the name to be removed
 *
 * Locks are taken and a lookup is performed prior to removing
 * an object from a directory.  Permission checking (MAY_EXEC) is performed
 * against @idmap.
 *
 * If the name doesn't exist, an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * Returns: a positive dentry, or an error.
 */
struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
			      struct qstr *name)
{
	/* validate/hash the name and check MAY_EXEC before any lock is taken */
	int err = lookup_one_common(idmap, name, parent);

	if (err)
		return ERR_PTR(err);
	return start_dirop(parent, name, 0);
}
EXPORT_SYMBOL(start_removing);
3370
3371	/**
3372	* start_creating_killable - prepare to create a given name with permission checking
3373	* @idmap: idmap of the mount
3374	* @parent: directory in which to prepare to create the name
3375	* @name: the name to be created
3376	*
3377	* Locks are taken and a lookup in performed prior to creating
3378	* an object in a directory. Permission checking (MAY_EXEC) is performed
3379	* against @idmap.
3380	*
3381	* If the name already exists, a positive dentry is returned.
3382	*
3383	* If a signal is received or was already pending, the function aborts
3384	* with -EINTR;
3385	*
3386	* Returns: a negative or positive dentry, or an error.
3387	*/
3388	struct dentry start_creating_killable(struct* mnt_idmap *idmap,
3389	struct dentry *parent,
3390	struct qstr *name)
3391	{
3392	int err = lookup_one_common(idmap, qname: name, base: parent);
3393
3394	if (err)
3395	return ERR_PTR(error: err);
3396	return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE);
3397	}
3398	EXPORT_SYMBOL(start_creating_killable);
3399
3400	/**
3401	* start_removing_killable - prepare to remove a given name with permission checking
3402	* @idmap: idmap of the mount
3403	* @parent: directory in which to find the name
3404	* @name: the name to be removed
3405	*
3406	* Locks are taken and a lookup in performed prior to removing
3407	* an object from a directory. Permission checking (MAY_EXEC) is performed
3408	* against @idmap.
3409	*
3410	* If the name doesn't exist, an error is returned.
3411	*
3412	* end_removing() should be called when removal is complete, or aborted.
3413	*
3414	* If a signal is received or was already pending, the function aborts
3415	* with -EINTR;
3416	*
3417	* Returns: a positive dentry, or an error.
3418	*/
3419	struct dentry start_removing_killable(struct* mnt_idmap *idmap,
3420	struct dentry *parent,
3421	struct qstr *name)
3422	{
3423	int err = lookup_one_common(idmap, qname: name, base: parent);
3424
3425	if (err)
3426	return ERR_PTR(error: err);
3427	return __start_dirop(parent, name, lookup_flags: `0`, TASK_KILLABLE);
3428	}
3429	EXPORT_SYMBOL(start_removing_killable);
3430
3431	/**
3432	* start_creating_noperm - prepare to create a given name without permission checking
3433	* @parent: directory in which to prepare to create the name
3434	* @name: the name to be created
3435	*
3436	* Locks are taken and a lookup in performed prior to creating
3437	* an object in a directory.
3438	*
3439	* If the name already exists, a positive dentry is returned.
3440	*
3441	* Returns: a negative or positive dentry, or an error.
3442	*/
3443	struct dentry start_creating_noperm(struct* dentry *parent,
3444	struct qstr *name)
3445	{
3446	int err = lookup_noperm_common(qname: name, base: parent);
3447
3448	if (err)
3449	return ERR_PTR(error: err);
3450	return start_dirop(parent, name, LOOKUP_CREATE);
3451	}
3452	EXPORT_SYMBOL(start_creating_noperm);
3453
/**
 * start_removing_noperm - prepare to remove a given name without permission checking
 * @parent: directory in which to find the name
 * @name:   the name to be removed
 *
 * Locks are taken and a lookup is performed prior to removing
 * an object from a directory.
 *
 * If the name doesn't exist, an error is returned.
 *
 * end_removing() should be called when removal is complete, or aborted.
 *
 * Returns: a positive dentry, or an error.
 */
struct dentry *start_removing_noperm(struct dentry *parent,
				     struct qstr *name)
{
	/* name validation and hashing only; no inode_permission() call */
	int err = lookup_noperm_common(name, parent);

	if (err)
		return ERR_PTR(err);
	return start_dirop(parent, name, 0);
}
EXPORT_SYMBOL(start_removing_noperm);
3478
3479	/**
3480	* start_creating_dentry - prepare to create a given dentry
3481	* @parent: directory from which dentry should be removed
3482	* @child: the dentry to be removed
3483	*
3484	* A lock is taken to protect the dentry again other dirops and
3485	* the validity of the dentry is checked: correct parent and still hashed.
3486	*
3487	* If the dentry is valid and negative a reference is taken and
3488	* returned. If not an error is returned.
3489	*
3490	* end_creating() should be called when creation is complete, or aborted.
3491	*
3492	* Returns: the valid dentry, or an error.
3493	*/
3494	struct dentry start_creating_dentry(struct* dentry *parent,
3495	struct dentry *child)
3496	{
3497	inode_lock_nested(inode: parent->d_inode, subclass: I_MUTEX_PARENT);
3498	if (unlikely(IS_DEADDIR(parent->d_inode) \|\|
3499	child->d_parent != parent \|\|
3500	d_unhashed(child))) {
3501	inode_unlock(inode: parent->d_inode);
3502	return ERR_PTR(error: -EINVAL);
3503	}
3504	if (d_is_positive(dentry: child)) {
3505	inode_unlock(inode: parent->d_inode);
3506	return ERR_PTR(error: -EEXIST);
3507	}
3508	return dget(dentry: child);
3509	}
3510	EXPORT_SYMBOL(start_creating_dentry);
3511
3512	/**
3513	* start_removing_dentry - prepare to remove a given dentry
3514	* @parent: directory from which dentry should be removed
3515	* @child: the dentry to be removed
3516	*
3517	* A lock is taken to protect the dentry again other dirops and
3518	* the validity of the dentry is checked: correct parent and still hashed.
3519	*
3520	* If the dentry is valid and positive, a reference is taken and
3521	* returned. If not an error is returned.
3522	*
3523	* end_removing() should be called when removal is complete, or aborted.
3524	*
3525	* Returns: the valid dentry, or an error.
3526	*/
3527	struct dentry start_removing_dentry(struct* dentry *parent,
3528	struct dentry *child)
3529	{
3530	inode_lock_nested(inode: parent->d_inode, subclass: I_MUTEX_PARENT);
3531	if (unlikely(IS_DEADDIR(parent->d_inode) \|\|
3532	child->d_parent != parent \|\|
3533	d_unhashed(child))) {
3534	inode_unlock(inode: parent->d_inode);
3535	return ERR_PTR(error: -EINVAL);
3536	}
3537	if (d_is_negative(dentry: child)) {
3538	inode_unlock(inode: parent->d_inode);
3539	return ERR_PTR(error: -ENOENT);
3540	}
3541	return dget(dentry: child);
3542	}
3543	EXPORT_SYMBOL(start_removing_dentry);
3544
#ifdef CONFIG_UNIX98_PTYS
/*
 * Redirect @path to whatever is mounted on "pts" next to it.
 *
 * Returns 0 on success, with @path updated in place, or -ENOENT if the
 * parent is not connected to @path->mnt or no "pts" dentry is found.
 * On the lookup-failure path @path->dentry has already been replaced by
 * the parent dentry.
 */
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	/* hand the parent reference over to @path */
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	/* @path now holds the child; the parent reference is dropped */
	path->dentry = child;
	dput(parent);
	follow_down(path, 0);
	return 0;
}
#endif
3571
3572	int user_path_at(int dfd, const char __user name, unsigned* flags,
3573	struct path *path)
3574	{
3575	struct filename *filename = getname_flags(filename: name, flags);
3576	int ret = filename_lookup(dfd, name: filename, flags, path, NULL);
3577
3578	putname(filename);
3579	return ret;
3580	}
3581	EXPORT_SYMBOL(user_path_at);
3582
3583	int __check_sticky(struct mnt_idmap idmap, struct* inode *dir,
3584	struct inode *inode)
3585	{
3586	kuid_t fsuid = current_fsuid();
3587
3588	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode), kuid: fsuid))
3589	return `0`;
3590	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode: dir), kuid: fsuid))
3591	return `0`;
3592	return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
3593	}
3594	EXPORT_SYMBOL(__check_sticky);
3595
3596	/*
3597	* Check whether we can remove a link victim from directory dir, check
3598	* whether the type of victim is right.
3599	* 1. We can't do it if dir is read-only (done in permission())
3600	* 2. We should have write and exec permissions on dir
3601	* 3. We can't remove anything from append-only dir
3602	* 4. We can't do anything with immutable dir (done in permission())
3603	* 5. If the sticky bit on dir is set we should either
3604	* a. be owner of dir, or
3605	* b. be owner of victim, or
3606	* c. have CAP_FOWNER capability
3607	* 6. If the victim is append-only or immutable we can't do antyhing with
3608	* links pointing to it.
3609	* 7. If the victim has an unknown uid or gid we can't change the inode.
3610	* 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
3611	* 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
3612	* 10. We can't remove a root or mountpoint.
3613	* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
3614	* nfs_async_unlink().
3615	*/
3616	static int may_delete(struct mnt_idmap idmap, struct* inode *dir,
3617	struct dentry *victim, bool isdir)
3618	{
3619	struct inode *inode = d_backing_inode(upper: victim);
3620	int error;
3621
3622	if (d_is_negative(dentry: victim))
3623	return -ENOENT;
3624	BUG_ON(!inode);
3625
3626	BUG_ON(victim->d_parent->d_inode != dir);
3627
3628	/ Inode writeback is not safe when the uid or gid are invalid. /
3629	if (!vfsuid_valid(uid: i_uid_into_vfsuid(idmap, inode)) \|\|
3630	!vfsgid_valid(gid: i_gid_into_vfsgid(idmap, inode)))
3631	return -EOVERFLOW;
3632
3633	audit_inode_child(parent: dir, dentry: victim, AUDIT_TYPE_CHILD_DELETE);
3634
3635	error = inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3636	if (error)
3637	return error;
3638	if (IS_APPEND(dir))
3639	return -EPERM;
3640
3641	if (check_sticky(idmap, dir, inode) \|\| IS_APPEND(inode) \|\|
3642	IS_IMMUTABLE(inode) \|\| IS_SWAPFILE(inode) \|\|
3643	HAS_UNMAPPED_ID(idmap, inode))
3644	return -EPERM;
3645	if (isdir) {
3646	if (!d_is_dir(dentry: victim))
3647	return -ENOTDIR;
3648	if (IS_ROOT(victim))
3649	return -EBUSY;
3650	} else if (d_is_dir(dentry: victim))
3651	return -EISDIR;
3652	if (IS_DEADDIR(dir))
3653	return -ENOENT;
3654	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
3655	return -EBUSY;
3656	return `0`;
3657	}
3658
3659	/ Check whether we can create an object with dentry child in directory*
3660	* dir.
3661	* 1. We can't do it if child already exists (open has special treatment for
3662	* this case, but since we are inlined it's OK)
3663	* 2. We can't do it if dir is read-only (done in permission())
3664	* 3. We can't do it if the fs can't represent the fsuid or fsgid.
3665	* 4. We should have write and exec permissions on dir
3666	* 5. We can't do it if dir is immutable (done in permission())
3667	*/
3668	static inline int may_create(struct mnt_idmap *idmap,
3669	struct inode dir, struct* dentry *child)
3670	{
3671	audit_inode_child(parent: dir, dentry: child, AUDIT_TYPE_CHILD_CREATE);
3672	if (child->d_inode)
3673	return -EEXIST;
3674	if (IS_DEADDIR(dir))
3675	return -ENOENT;
3676	if (!fsuidgid_has_mapping(sb: dir->i_sb, idmap))
3677	return -EOVERFLOW;
3678
3679	return inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3680	}
3681
3682	// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
3683	static struct dentry lock_two_directories(struct* dentry p1, struct* dentry *p2)
3684	{
3685	struct dentry p = p1, q = p2, *r;
3686
3687	while ((r = p->d_parent) != p2 && r != p)
3688	p = r;
3689	if (r == p2) {
3690	// p is a child of p2 and an ancestor of p1 or p1 itself
3691	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3692	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT2);
3693	return p;
3694	}
3695	// p is the root of connected component that contains p1
3696	// p2 does not occur on the path from p to p1
3697	while ((r = q->d_parent) != p1 && r != p && r != q)
3698	q = r;
3699	if (r == p1) {
3700	// q is a child of p1 and an ancestor of p2 or p2 itself
3701	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3702	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT2);
3703	return q;
3704	} else if (likely(r == p)) {
3705	// both p2 and p1 are descendents of p
3706	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3707	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT2);
3708	return NULL;
3709	} else { // no common ancestor at the time we'd been called
3710	mutex_unlock(lock: &p1->d_sb->s_vfs_rename_mutex);
3711	return ERR_PTR(error: -EXDEV);
3712	}
3713	}
3714
3715	/*
3716	* p1 and p2 should be directories on the same fs.
3717	*/
3718	struct dentry lock_rename(struct* dentry p1, struct* dentry *p2)
3719	{
3720	if (p1 == p2) {
3721	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3722	return NULL;
3723	}
3724
3725	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
3726	return lock_two_directories(p1, p2);
3727	}
3728	EXPORT_SYMBOL(lock_rename);
3729
3730	/*
3731	* c1 and p2 should be on the same fs.
3732	*/
3733	struct dentry lock_rename_child(struct* dentry c1, struct* dentry *p2)
3734	{
3735	if (READ_ONCE(c1->d_parent) == p2) {
3736	/*
3737	* hopefully won't need to touch ->s_vfs_rename_mutex at all.
3738	*/
3739	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3740	/*
3741	* now that p2 is locked, nobody can move in or out of it,
3742	* so the test below is safe.
3743	*/
3744	if (likely(c1->d_parent == p2))
3745	return NULL;
3746
3747	/*
3748	* c1 got moved out of p2 while we'd been taking locks;
3749	* unlock and fall back to slow case.
3750	*/
3751	inode_unlock(inode: p2->d_inode);
3752	}
3753
3754	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
3755	/*
3756	* nobody can move out of any directories on this fs.
3757	*/
3758	if (likely(c1->d_parent != p2))
3759	return lock_two_directories(p1: c1->d_parent, p2);
3760
3761	/*
3762	* c1 got moved into p2 while we were taking locks;
3763	* we need p2 locked and ->s_vfs_rename_mutex unlocked,
3764	* for consistency with lock_rename().
3765	*/
3766	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3767	mutex_unlock(lock: &c1->d_sb->s_vfs_rename_mutex);
3768	return NULL;
3769	}
3770	EXPORT_SYMBOL(lock_rename_child);
3771
3772	void unlock_rename(struct dentry p1, struct* dentry *p2)
3773	{
3774	inode_unlock(inode: p1->d_inode);
3775	if (p1 != p2) {
3776	inode_unlock(inode: p2->d_inode);
3777	mutex_unlock(lock: &p1->d_sb->s_vfs_rename_mutex);
3778	}
3779	}
3780	EXPORT_SYMBOL(unlock_rename);
3781
3782	/**
3783	* __start_renaming - lookup and lock names for rename
3784	* @rd: rename data containing parents and flags, and
3785	* for receiving found dentries
3786	* @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
3787	* LOOKUP_NO_SYMLINKS etc).
3788	* @old_last: name of object in @rd.old_parent
3789	* @new_last: name of object in @rd.new_parent
3790	*
3791	* Look up two names and ensure locks are in place for
3792	* rename.
3793	*
3794	* On success the found dentries are stored in @rd.old_dentry,
3795	* @rd.new_dentry and an extra ref is taken on @rd.old_parent.
3796	* These references and the lock are dropped by end_renaming().
3797	*
3798	* The passed in qstrs must have the hash calculated, and no permission
3799	* checking is performed.
3800	*
3801	* Returns: zero or an error.
3802	*/
3803	static int
3804	__start_renaming(struct renamedata rd, int* lookup_flags,
3805	struct qstr old_last, struct* qstr *new_last)
3806	{
3807	struct dentry *trap;
3808	struct dentry d1, d2;
3809	int target_flags = LOOKUP_RENAME_TARGET \| LOOKUP_CREATE;
3810	int err;
3811
3812	if (rd->flags & RENAME_EXCHANGE)
3813	target_flags = `0`;
3814	if (rd->flags & RENAME_NOREPLACE)
3815	target_flags \|= LOOKUP_EXCL;
3816
3817	trap = lock_rename(rd->old_parent, rd->new_parent);
3818	if (IS_ERR(ptr: trap))
3819	return PTR_ERR(ptr: trap);
3820
3821	d1 = lookup_one_qstr_excl(old_last, rd->old_parent,
3822	lookup_flags);
3823	err = PTR_ERR(ptr: d1);
3824	if (IS_ERR(ptr: d1))
3825	goto out_unlock;
3826
3827	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
3828	lookup_flags \| target_flags);
3829	err = PTR_ERR(ptr: d2);
3830	if (IS_ERR(ptr: d2))
3831	goto out_dput_d1;
3832
3833	if (d1 == trap) {
3834	/ source is an ancestor of target /
3835	err = -EINVAL;
3836	goto out_dput_d2;
3837	}
3838
3839	if (d2 == trap) {
3840	/ target is an ancestor of source /
3841	if (rd->flags & RENAME_EXCHANGE)
3842	err = -EINVAL;
3843	else
3844	err = -ENOTEMPTY;
3845	goto out_dput_d2;
3846	}
3847
3848	rd->old_dentry = d1;
3849	rd->new_dentry = d2;
3850	dget(dentry: rd->old_parent);
3851	return `0`;
3852
3853	out_dput_d2:
3854	dput(d2);
3855	out_dput_d1:
3856	dput(d1);
3857	out_unlock:
3858	unlock_rename(rd->old_parent, rd->new_parent);
3859	return err;
3860	}
3861
3862	/**
3863	* start_renaming - lookup and lock names for rename with permission checking
3864	* @rd: rename data containing parents and flags, and
3865	* for receiving found dentries
3866	* @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
3867	* LOOKUP_NO_SYMLINKS etc).
3868	* @old_last: name of object in @rd.old_parent
3869	* @new_last: name of object in @rd.new_parent
3870	*
3871	* Look up two names and ensure locks are in place for
3872	* rename.
3873	*
3874	* On success the found dentries are stored in @rd.old_dentry,
3875	* @rd.new_dentry. Also the refcount on @rd->old_parent is increased.
3876	* These references and the lock are dropped by end_renaming().
3877	*
3878	* The passed in qstrs need not have the hash calculated, and basic
3879	* eXecute permission checking is performed against @rd.mnt_idmap.
3880	*
3881	* Returns: zero or an error.
3882	*/
3883	int start_renaming(struct renamedata rd, int* lookup_flags,
3884	struct qstr old_last, struct* qstr *new_last)
3885	{
3886	int err;
3887
3888	err = lookup_one_common(idmap: rd->mnt_idmap, qname: old_last, base: rd->old_parent);
3889	if (err)
3890	return err;
3891	err = lookup_one_common(idmap: rd->mnt_idmap, qname: new_last, base: rd->new_parent);
3892	if (err)
3893	return err;
3894	return __start_renaming(rd, lookup_flags, old_last, new_last);
3895	}
3896	EXPORT_SYMBOL(start_renaming);
3897
3898	static int
3899	__start_renaming_dentry(struct renamedata rd, int* lookup_flags,
3900	struct dentry old_dentry, struct* qstr *new_last)
3901	{
3902	struct dentry *trap;
3903	struct dentry *d2;
3904	int target_flags = LOOKUP_RENAME_TARGET \| LOOKUP_CREATE;
3905	int err;
3906
3907	if (rd->flags & RENAME_EXCHANGE)
3908	target_flags = `0`;
3909	if (rd->flags & RENAME_NOREPLACE)
3910	target_flags \|= LOOKUP_EXCL;
3911
3912	/ Already have the dentry - need to be sure to lock the correct parent /
3913	trap = lock_rename_child(old_dentry, rd->new_parent);
3914	if (IS_ERR(ptr: trap))
3915	return PTR_ERR(ptr: trap);
3916	if (d_unhashed(dentry: old_dentry) \|\|
3917	(rd->old_parent && rd->old_parent != old_dentry->d_parent)) {
3918	/ dentry was removed, or moved and explicit parent requested /
3919	err = -EINVAL;
3920	goto out_unlock;
3921	}
3922
3923	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
3924	lookup_flags \| target_flags);
3925	err = PTR_ERR(ptr: d2);
3926	if (IS_ERR(ptr: d2))
3927	goto out_unlock;
3928
3929	if (old_dentry == trap) {
3930	/ source is an ancestor of target /
3931	err = -EINVAL;
3932	goto out_dput_d2;
3933	}
3934
3935	if (d2 == trap) {
3936	/ target is an ancestor of source /
3937	if (rd->flags & RENAME_EXCHANGE)
3938	err = -EINVAL;
3939	else
3940	err = -ENOTEMPTY;
3941	goto out_dput_d2;
3942	}
3943
3944	rd->old_dentry = dget(dentry: old_dentry);
3945	rd->new_dentry = d2;
3946	rd->old_parent = dget(dentry: old_dentry->d_parent);
3947	return `0`;
3948
3949	out_dput_d2:
3950	dput(d2);
3951	out_unlock:
3952	unlock_rename(old_dentry->d_parent, rd->new_parent);
3953	return err;
3954	}
3955
3956	/**
3957	* start_renaming_dentry - lookup and lock name for rename with permission checking
3958	* @rd: rename data containing parents and flags, and
3959	* for receiving found dentries
3960	* @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
3961	* LOOKUP_NO_SYMLINKS etc).
3962	* @old_dentry: dentry of name to move
3963	* @new_last: name of target in @rd.new_parent
3964	*
3965	* Look up target name and ensure locks are in place for
3966	* rename.
3967	*
3968	* On success the found dentry is stored in @rd.new_dentry and
3969	* @rd.old_parent is confirmed to be the parent of @old_dentry. If it
3970	* was originally %NULL, it is set. In either case a reference is taken
3971	* so that end_renaming() can have a stable reference to unlock.
3972	*
3973	* References and the lock can be dropped with end_renaming()
3974	*
3975	* The passed in qstr need not have the hash calculated, and basic
3976	* eXecute permission checking is performed against @rd.mnt_idmap.
3977	*
3978	* Returns: zero or an error.
3979	*/
3980	int start_renaming_dentry(struct renamedata rd, int* lookup_flags,
3981	struct dentry old_dentry, struct* qstr *new_last)
3982	{
3983	int err;
3984
3985	err = lookup_one_common(idmap: rd->mnt_idmap, qname: new_last, base: rd->new_parent);
3986	if (err)
3987	return err;
3988	return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last);
3989	}
3990	EXPORT_SYMBOL(start_renaming_dentry);
3991
3992	/**
3993	* start_renaming_two_dentries - Lock to dentries in given parents for rename
3994	* @rd: rename data containing parent
3995	* @old_dentry: dentry of name to move
3996	* @new_dentry: dentry to move to
3997	*
3998	* Ensure locks are in place for rename and check parentage is still correct.
3999	*
4000	* On success the two dentries are stored in @rd.old_dentry and
4001	* @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to
4002	* be the parents of the dentries.
4003	*
4004	* References and the lock can be dropped with end_renaming()
4005	*
4006	* Returns: zero or an error.
4007	*/
4008	int
4009	start_renaming_two_dentries(struct renamedata *rd,
4010	struct dentry old_dentry, struct* dentry *new_dentry)
4011	{
4012	struct dentry *trap;
4013	int err;
4014
4015	/ Already have the dentry - need to be sure to lock the correct parent /
4016	trap = lock_rename_child(old_dentry, rd->new_parent);
4017	if (IS_ERR(ptr: trap))
4018	return PTR_ERR(ptr: trap);
4019	err = -EINVAL;
4020	if (d_unhashed(dentry: old_dentry) \|\|
4021	(rd->old_parent && rd->old_parent != old_dentry->d_parent))
4022	/ old_dentry was removed, or moved and explicit parent requested /
4023	goto out_unlock;
4024	if (d_unhashed(dentry: new_dentry) \|\|
4025	rd->new_parent != new_dentry->d_parent)
4026	/ new_dentry was removed or moved /
4027	goto out_unlock;
4028
4029	if (old_dentry == trap)
4030	/ source is an ancestor of target /
4031	goto out_unlock;
4032
4033	if (new_dentry == trap) {
4034	/ target is an ancestor of source /
4035	if (rd->flags & RENAME_EXCHANGE)
4036	err = -EINVAL;
4037	else
4038	err = -ENOTEMPTY;
4039	goto out_unlock;
4040	}
4041
4042	err = -EEXIST;
4043	if (d_is_positive(dentry: new_dentry) && (rd->flags & RENAME_NOREPLACE))
4044	goto out_unlock;
4045
4046	rd->old_dentry = dget(dentry: old_dentry);
4047	rd->new_dentry = dget(dentry: new_dentry);
4048	rd->old_parent = dget(dentry: old_dentry->d_parent);
4049	return `0`;
4050
4051	out_unlock:
4052	unlock_rename(old_dentry->d_parent, rd->new_parent);
4053	return err;
4054	}
4055	EXPORT_SYMBOL(start_renaming_two_dentries);
4056
4057	void end_renaming(struct renamedata *rd)
4058	{
4059	unlock_rename(rd->old_parent, rd->new_parent);
4060	dput(rd->old_dentry);
4061	dput(rd->new_dentry);
4062	dput(rd->old_parent);
4063	}
4064	EXPORT_SYMBOL(end_renaming);
4065
4066	/**
4067	* vfs_prepare_mode - prepare the mode to be used for a new inode
4068	* @idmap: idmap of the mount the inode was found from
4069	* @dir: parent directory of the new inode
4070	* @mode: mode of the new inode
4071	* @mask_perms: allowed permission by the vfs
4072	* @type: type of file to be created
4073	*
4074	* This helper consolidates and enforces vfs restrictions on the @mode of a new
4075	* object to be created.
4076	*
4077	* Umask stripping depends on whether the filesystem supports POSIX ACLs (see
4078	* the kernel documentation for mode_strip_umask()). Moving umask stripping
4079	* after setgid stripping allows the same ordering for both non-POSIX ACL and
4080	* POSIX ACL supporting filesystems.
4081	*
4082	* Note that it's currently valid for @type to be 0 if a directory is created.
4083	* Filesystems raise that flag individually and we need to check whether each
4084	* filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
4085	* non-zero type.
4086	*
4087	* Returns: mode to be passed to the filesystem
4088	*/
4089	static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
4090	const struct inode *dir, umode_t mode,
4091	umode_t mask_perms, umode_t type)
4092	{
4093	mode = mode_strip_sgid(idmap, dir, mode);
4094	mode = mode_strip_umask(dir, mode);
4095
4096	/*
4097	* Apply the vfs mandated allowed permission mask and set the type of
4098	* file to be created before we call into the filesystem.
4099	*/
4100	mode &= (mask_perms & ~S_IFMT);
4101	mode \|= (type & S_IFMT);
4102
4103	return mode;
4104	}
4105
4106	/**
4107	* vfs_create - create new file
4108	* @idmap: idmap of the mount the inode was found from
4109	* @dentry: dentry of the child file
4110	* @mode: mode of the child file
4111	* @di: returns parent inode, if the inode is delegated.
4112	*
4113	* Create a new file.
4114	*
4115	* If the inode has been found through an idmapped mount the idmap of
4116	* the vfsmount must be passed through @idmap. This function will then take
4117	* care to map the inode according to @idmap before checking permissions.
4118	* On non-idmapped mounts or if permission checking is to be performed on the
4119	* raw inode simply pass @nop_mnt_idmap.
4120	*/
4121	int vfs_create(struct mnt_idmap idmap, struct* dentry *dentry, umode_t mode,
4122	struct delegated_inode *di)
4123	{
4124	struct inode *dir = d_inode(dentry: dentry->d_parent);
4125	int error;
4126
4127	error = may_create(idmap, dir, child: dentry);
4128	if (error)
4129	return error;
4130
4131	if (!dir->i_op->create)
4132	return -EACCES; / shouldn't it be ENOSYS? /
4133
4134	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
4135	error = security_inode_create(dir, dentry, mode);
4136	if (error)
4137	return error;
4138	error = try_break_deleg(inode: dir, di);
4139	if (error)
4140	return error;
4141	error = dir->i_op->create(idmap, dir, dentry, mode, true);
4142	if (!error)
4143	fsnotify_create(dir, dentry);
4144	return error;
4145	}
4146	EXPORT_SYMBOL(vfs_create);
4147
4148	int vfs_mkobj(struct dentry *dentry, umode_t mode,
4149	int (f)(struct* dentry , umode_t, void* *),
4150	void *arg)
4151	{
4152	struct inode *dir = dentry->d_parent->d_inode;
4153	int error = may_create(idmap: &nop_mnt_idmap, dir, child: dentry);
4154	if (error)
4155	return error;
4156
4157	mode &= S_IALLUGO;
4158	mode \|= S_IFREG;
4159	error = security_inode_create(dir, dentry, mode);
4160	if (error)
4161	return error;
4162	error = f(dentry, mode, arg);
4163	if (!error)
4164	fsnotify_create(dir, dentry);
4165	return error;
4166	}
4167	EXPORT_SYMBOL(vfs_mkobj);
4168
4169	bool may_open_dev(const struct path *path)
4170	{
4171	return !(path->mnt->mnt_flags & MNT_NODEV) &&
4172	!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
4173	}
4174
4175	static int may_open(struct mnt_idmap idmap, const* struct path *path,
4176	int acc_mode, int flag)
4177	{
4178	struct dentry *dentry = path->dentry;
4179	struct inode *inode = dentry->d_inode;
4180	int error;
4181
4182	if (!inode)
4183	return -ENOENT;
4184
4185	switch (inode->i_mode & S_IFMT) {
4186	case S_IFLNK:
4187	return -ELOOP;
4188	case S_IFDIR:
4189	if (acc_mode & MAY_WRITE)
4190	return -EISDIR;
4191	if (acc_mode & MAY_EXEC)
4192	return -EACCES;
4193	break;
4194	case S_IFBLK:
4195	case S_IFCHR:
4196	if (!may_open_dev(path))
4197	return -EACCES;
4198	fallthrough;
4199	case S_IFIFO:
4200	case S_IFSOCK:
4201	if (acc_mode & MAY_EXEC)
4202	return -EACCES;
4203	flag &= ~O_TRUNC;
4204	break;
4205	case S_IFREG:
4206	if ((acc_mode & MAY_EXEC) && path_noexec(path))
4207	return -EACCES;
4208	break;
4209	default:
4210	VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode);
4211	}
4212
4213	error = inode_permission(idmap, inode, MAY_OPEN \| acc_mode);
4214	if (error)
4215	return error;
4216
4217	/*
4218	* An append-only file must be opened in append mode for writing.
4219	*/
4220	if (IS_APPEND(inode)) {
4221	if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
4222	return -EPERM;
4223	if (flag & O_TRUNC)
4224	return -EPERM;
4225	}
4226
4227	/ O_NOATIME can only be set by the owner or superuser /
4228	if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
4229	return -EPERM;
4230
4231	return `0`;
4232	}
4233
4234	static int handle_truncate(struct mnt_idmap idmap, struct* file *filp)
4235	{
4236	const struct path *path = &filp->f_path;
4237	struct inode *inode = path->dentry->d_inode;
4238	int error = get_write_access(inode);
4239	if (error)
4240	return error;
4241
4242	error = security_file_truncate(file: filp);
4243	if (!error) {
4244	error = do_truncate(idmap, path->dentry, start: `0`,
4245	ATTR_MTIME\|ATTR_CTIME\|ATTR_OPEN,
4246	filp);
4247	}
4248	put_write_access(inode);
4249	return error;
4250	}
4251
/*
 * open_to_namei_flags - adjust open flags before they are handed on
 *
 * When both access-mode bits are set (O_ACCMODE field equals 3) the value
 * is decremented by one, leaving an access mode of 2; any other value is
 * returned untouched.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
4258
4259	static int may_o_create(struct mnt_idmap *idmap,
4260	const struct path dir, struct* dentry *dentry,
4261	umode_t mode)
4262	{
4263	int error = security_path_mknod(dir, dentry, mode, dev: `0`);
4264	if (error)
4265	return error;
4266
4267	if (!fsuidgid_has_mapping(sb: dir->dentry->d_sb, idmap))
4268	return -EOVERFLOW;
4269
4270	error = inode_permission(idmap, dir->dentry->d_inode,
4271	MAY_WRITE \| MAY_EXEC);
4272	if (error)
4273	return error;
4274
4275	return security_inode_create(dir: dir->dentry->d_inode, dentry, mode);
4276	}
4277
4278	/*
4279	* Attempt to atomically look up, create and open a file from a negative
4280	* dentry.
4281	*
4282	* Returns 0 if successful. The file will have been created and attached to
4283	* @file by the filesystem calling finish_open().
4284	*
4285	* If the file was looked up only or didn't need creating, FMODE_OPENED won't
4286	* be set. The caller will need to perform the open themselves. @path will
4287	* have been updated to point to the new dentry. This may be negative.
4288	*
4289	* Returns an error code otherwise.
4290	*/
4291	static struct dentry atomic_open(struct* nameidata nd, struct* dentry *dentry,
4292	struct file *file,
4293	int open_flag, umode_t mode)
4294	{
4295	struct dentry *const DENTRY_NOT_SET = (void *) -`1UL`;
4296	struct inode *dir = nd->path.dentry->d_inode;
4297	int error;
4298
4299	if (nd->flags & LOOKUP_DIRECTORY)
4300	open_flag \|= O_DIRECTORY;
4301
4302	file->__f_path.dentry = DENTRY_NOT_SET;
4303	file->__f_path.mnt = nd->path.mnt;
4304	error = dir->i_op->atomic_open(dir, dentry, file,
4305	open_to_namei_flags(flag: open_flag), mode);
4306	d_lookup_done(dentry);
4307	if (!error) {
4308	if (file->f_mode & FMODE_OPENED) {
4309	if (unlikely(dentry != file->f_path.dentry)) {
4310	dput(dentry);
4311	dentry = dget(dentry: file->f_path.dentry);
4312	}
4313	} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
4314	error = -EIO;
4315	} else {
4316	if (file->f_path.dentry) {
4317	dput(dentry);
4318	dentry = file->f_path.dentry;
4319	}
4320	if (unlikely(d_is_negative(dentry)))
4321	error = -ENOENT;
4322	}
4323	}
4324	if (error) {
4325	dput(dentry);
4326	dentry = ERR_PTR(error);
4327	}
4328	return dentry;
4329	}
4330
4331	/*
4332	* Look up and maybe create and open the last component.
4333	*
4334	* Must be called with parent locked (exclusive in O_CREAT case).
4335	*
4336	* Returns 0 on success, that is, if
4337	* the file was successfully atomically created (if necessary) and opened, or
4338	* the file was not completely opened at this time, though lookups and
4339	* creations were performed.
4340	* These case are distinguished by presence of FMODE_OPENED on file->f_mode.
4341	* In the latter case dentry returned in @path might be negative if O_CREAT
4342	* hadn't been specified.
4343	*
4344	* An error code is returned on failure.
4345	*/
4346	static struct dentry lookup_open(struct* nameidata nd, struct* file *file,
4347	const struct open_flags *op,
4348	bool got_write, struct delegated_inode *delegated_inode)
4349	{
4350	struct mnt_idmap *idmap;
4351	struct dentry *dir = nd->path.dentry;
4352	struct inode *dir_inode = dir->d_inode;
4353	int open_flag = op->open_flag;
4354	struct dentry *dentry;
4355	int error, create_error = `0`;
4356	umode_t mode = op->mode;
4357	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
4358
4359	if (unlikely(IS_DEADDIR(dir_inode)))
4360	return ERR_PTR(error: -ENOENT);
4361
4362	file->f_mode &= ~FMODE_CREATED;
4363	dentry = d_lookup(dir, &nd->last);
4364	for (;;) {
4365	if (!dentry) {
4366	dentry = d_alloc_parallel(dir, &nd->last, &wq);
4367	if (IS_ERR(ptr: dentry))
4368	return dentry;
4369	}
4370	if (d_in_lookup(dentry))
4371	break;
4372
4373	error = d_revalidate(dir: dir_inode, name: &nd->last, dentry, flags: nd->flags);
4374	if (likely(error > `0`))
4375	break;
4376	if (error)
4377	goto out_dput;
4378	d_invalidate(dentry);
4379	dput(dentry);
4380	dentry = NULL;
4381	}
4382	if (dentry->d_inode) {
4383	/ Cached positive dentry: will open in f_op->open /
4384	return dentry;
4385	}
4386
4387	if (open_flag & O_CREAT)
4388	audit_inode(name: nd->name, dentry: dir, AUDIT_INODE_PARENT);
4389
4390	/*
4391	* Checking write permission is tricky, bacuse we don't know if we are
4392	* going to actually need it: O_CREAT opens should work as long as the
4393	* file exists. But checking existence breaks atomicity. The trick is
4394	* to check access and if not granted clear O_CREAT from the flags.
4395	*
4396	* Another problem is returing the "right" error value (e.g. for an
4397	* O_EXCL open we want to return EEXIST not EROFS).
4398	*/
4399	if (unlikely(!got_write))
4400	open_flag &= ~O_TRUNC;
4401	idmap = mnt_idmap(mnt: nd->path.mnt);
4402	if (open_flag & O_CREAT) {
4403	if (open_flag & O_EXCL)
4404	open_flag &= ~O_TRUNC;
4405	mode = vfs_prepare_mode(idmap, dir: dir->d_inode, mode, mask_perms: mode, type: mode);
4406	if (likely(got_write))
4407	create_error = may_o_create(idmap, dir: &nd->path,
4408	dentry, mode);
4409	else
4410	create_error = -EROFS;
4411	}
4412	if (create_error)
4413	open_flag &= ~O_CREAT;
4414	if (dir_inode->i_op->atomic_open) {
4415	dentry = atomic_open(nd, dentry, file, open_flag, mode);
4416	if (unlikely(create_error) && dentry == ERR_PTR(error: -ENOENT))
4417	dentry = ERR_PTR(error: create_error);
4418	return dentry;
4419	}
4420
4421	if (d_in_lookup(dentry)) {
4422	struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
4423	nd->flags);
4424	d_lookup_done(dentry);
4425	if (unlikely(res)) {
4426	if (IS_ERR(ptr: res)) {
4427	error = PTR_ERR(ptr: res);
4428	goto out_dput;
4429	}
4430	dput(dentry);
4431	dentry = res;
4432	}
4433	}
4434
4435	/ Negative dentry, just create the file /
4436	if (!dentry->d_inode && (open_flag & O_CREAT)) {
4437	/ but break the directory lease first! /
4438	error = try_break_deleg(inode: dir_inode, di: delegated_inode);
4439	if (error)
4440	goto out_dput;
4441
4442	file->f_mode \|= FMODE_CREATED;
4443	audit_inode_child(parent: dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
4444	if (!dir_inode->i_op->create) {
4445	error = -EACCES;
4446	goto out_dput;
4447	}
4448
4449	error = dir_inode->i_op->create(idmap, dir_inode, dentry,
4450	mode, open_flag & O_EXCL);
4451	if (error)
4452	goto out_dput;
4453	}
4454	if (unlikely(create_error) && !dentry->d_inode) {
4455	error = create_error;
4456	goto out_dput;
4457	}
4458	return dentry;
4459
4460	out_dput:
4461	dput(dentry);
4462	return ERR_PTR(error);
4463	}
4464
4465	static inline bool trailing_slashes(struct nameidata *nd)
4466	{
4467	return (bool)nd->last.name[nd->last.len];
4468	}
4469
4470	static struct dentry lookup_fast_for_open(struct* nameidata nd, int* open_flag)
4471	{
4472	struct dentry *dentry;
4473
4474	if (open_flag & O_CREAT) {
4475	if (trailing_slashes(nd))
4476	return ERR_PTR(error: -EISDIR);
4477
4478	/ Don't bother on an O_EXCL create /
4479	if (open_flag & O_EXCL)
4480	return NULL;
4481	}
4482
4483	if (trailing_slashes(nd))
4484	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
4485
4486	dentry = lookup_fast(nd);
4487	if (IS_ERR_OR_NULL(ptr: dentry))
4488	return dentry;
4489
4490	if (open_flag & O_CREAT) {
4491	/ Discard negative dentries. Need inode_lock to do the create /
4492	if (!dentry->d_inode) {
4493	if (!(nd->flags & LOOKUP_RCU))
4494	dput(dentry);
4495	dentry = NULL;
4496	}
4497	}
4498	return dentry;
4499	}
4500
4501	static const char open_last_lookups(struct* nameidata *nd,
4502	struct file file, const* struct open_flags *op)
4503	{
4504	struct delegated_inode delegated_inode = { };
4505	struct dentry *dir = nd->path.dentry;
4506	int open_flag = op->open_flag;
4507	bool got_write = false;
4508	struct dentry *dentry;
4509	const char *res;
4510
4511	nd->flags \|= op->intent;
4512
4513	if (nd->last_type != LAST_NORM) {
4514	if (nd->depth)
4515	put_link(nd);
4516	return handle_dots(nd, type: nd->last_type);
4517	}
4518
4519	/ We _can_ be in RCU mode here /
4520	dentry = lookup_fast_for_open(nd, open_flag);
4521	if (IS_ERR(ptr: dentry))
4522	return ERR_CAST(ptr: dentry);
4523
4524	if (likely(dentry))
4525	goto finish_lookup;
4526
4527	if (!(open_flag & O_CREAT)) {
4528	if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
4529	return ERR_PTR(error: -ECHILD);
4530	} else {
4531	if (nd->flags & LOOKUP_RCU) {
4532	if (!try_to_unlazy(nd))
4533	return ERR_PTR(error: -ECHILD);
4534	}
4535	}
4536	retry:
4537	if (open_flag & (O_CREAT \| O_TRUNC \| O_WRONLY \| O_RDWR)) {
4538	got_write = !mnt_want_write(mnt: nd->path.mnt);
4539	/*
4540	* do _not_ fail yet - we might not need that or fail with
4541	* a different error; let lookup_open() decide; we'll be
4542	* dropping this one anyway.
4543	*/
4544	}
4545	if (open_flag & O_CREAT)
4546	inode_lock(inode: dir->d_inode);
4547	else
4548	inode_lock_shared(inode: dir->d_inode);
4549	dentry = lookup_open(nd, file, op, got_write, delegated_inode: &delegated_inode);
4550	if (!IS_ERR(ptr: dentry)) {
4551	if (file->f_mode & FMODE_CREATED)
4552	fsnotify_create(dir: dir->d_inode, dentry);
4553	if (file->f_mode & FMODE_OPENED)
4554	fsnotify_open(file);
4555	}
4556	if (open_flag & O_CREAT)
4557	inode_unlock(inode: dir->d_inode);
4558	else
4559	inode_unlock_shared(inode: dir->d_inode);
4560
4561	if (got_write)
4562	mnt_drop_write(mnt: nd->path.mnt);
4563
4564	if (IS_ERR(ptr: dentry)) {
4565	if (is_delegated(di: &delegated_inode)) {
4566	int error = break_deleg_wait(di: &delegated_inode);
4567
4568	if (!error)
4569	goto retry;
4570	return ERR_PTR(error);
4571	}
4572	return ERR_CAST(ptr: dentry);
4573	}
4574
4575	if (file->f_mode & (FMODE_OPENED \| FMODE_CREATED)) {
4576	dput(nd->path.dentry);
4577	nd->path.dentry = dentry;
4578	return NULL;
4579	}
4580
4581	finish_lookup:
4582	if (nd->depth)
4583	put_link(nd);
4584	res = step_into(nd, flags: WALK_TRAILING, dentry);
4585	if (unlikely(res))
4586	nd->flags &= ~(LOOKUP_OPEN\|LOOKUP_CREATE\|LOOKUP_EXCL);
4587	return res;
4588	}
4589
4590	/*
4591	* Handle the last step of open()
4592	*/
4593	static int do_open(struct nameidata *nd,
4594	struct file file, const* struct open_flags *op)
4595	{
4596	struct mnt_idmap *idmap;
4597	int open_flag = op->open_flag;
4598	bool do_truncate;
4599	int acc_mode;
4600	int error;
4601
4602	if (!(file->f_mode & (FMODE_OPENED \| FMODE_CREATED))) {
4603	error = complete_walk(nd);
4604	if (error)
4605	return error;
4606	}
4607	if (!(file->f_mode & FMODE_CREATED))
4608	audit_inode(name: nd->name, dentry: nd->path.dentry, aflags: `0`);
4609	idmap = mnt_idmap(mnt: nd->path.mnt);
4610	if (open_flag & O_CREAT) {
4611	if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
4612	return -EEXIST;
4613	if (d_is_dir(dentry: nd->path.dentry))
4614	return -EISDIR;
4615	error = may_create_in_sticky(idmap, nd,
4616	inode: d_backing_inode(upper: nd->path.dentry));
4617	if (unlikely(error))
4618	return error;
4619	}
4620	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(dentry: nd->path.dentry))
4621	return -ENOTDIR;
4622
4623	do_truncate = false;
4624	acc_mode = op->acc_mode;
4625	if (file->f_mode & FMODE_CREATED) {
4626	/ Don't check for write permission, don't truncate /
4627	open_flag &= ~O_TRUNC;
4628	acc_mode = `0`;
4629	} else if (d_is_reg(dentry: nd->path.dentry) && open_flag & O_TRUNC) {
4630	error = mnt_want_write(mnt: nd->path.mnt);
4631	if (error)
4632	return error;
4633	do_truncate = true;
4634	}
4635	error = may_open(idmap, path: &nd->path, acc_mode, flag: open_flag);
4636	if (!error && !(file->f_mode & FMODE_OPENED))
4637	error = vfs_open(&nd->path, file);
4638	if (!error)
4639	error = security_file_post_open(file, mask: op->acc_mode);
4640	if (!error && do_truncate)
4641	error = handle_truncate(idmap, filp: file);
4642	if (unlikely(error > `0`)) {
4643	WARN_ON(`1`);
4644	error = -EINVAL;
4645	}
4646	if (do_truncate)
4647	mnt_drop_write(mnt: nd->path.mnt);
4648	return error;
4649	}
4650
4651	/**
4652	* vfs_tmpfile - create tmpfile
4653	* @idmap: idmap of the mount the inode was found from
4654	* @parentpath: pointer to the path of the base directory
4655	* @file: file descriptor of the new tmpfile
4656	* @mode: mode of the new tmpfile
4657	*
4658	* Create a temporary file.
4659	*
4660	* If the inode has been found through an idmapped mount the idmap of
4661	* the vfsmount must be passed through @idmap. This function will then take
4662	* care to map the inode according to @idmap before checking permissions.
4663	* On non-idmapped mounts or if permission checking is to be performed on the
4664	* raw inode simply pass @nop_mnt_idmap.
4665	*/
4666	int vfs_tmpfile(struct mnt_idmap *idmap,
4667	const struct path *parentpath,
4668	struct file *file, umode_t mode)
4669	{
4670	struct dentry *child;
4671	struct inode *dir = d_inode(dentry: parentpath->dentry);
4672	struct inode *inode;
4673	int error;
4674	int open_flag = file->f_flags;
4675
4676	/ we want directory to be writable /
4677	error = inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
4678	if (error)
4679	return error;
4680	if (!dir->i_op->tmpfile)
4681	return -EOPNOTSUPP;
4682	child = d_alloc(parentpath->dentry, &slash_name);
4683	if (unlikely(!child))
4684	return -ENOMEM;
4685	file->__f_path.mnt = parentpath->mnt;
4686	file->__f_path.dentry = child;
4687	mode = vfs_prepare_mode(idmap, dir, mode, mask_perms: mode, type: mode);
4688	error = dir->i_op->tmpfile(idmap, dir, file, mode);
4689	dput(child);
4690	if (file->f_mode & FMODE_OPENED)
4691	fsnotify_open(file);
4692	if (error)
4693	return error;
4694	/ Don't check for other permissions, the inode was just created /
4695	error = may_open(idmap, path: &file->f_path, acc_mode: `0`, flag: file->f_flags);
4696	if (error)
4697	return error;
4698	inode = file_inode(f: file);
4699	if (!(open_flag & O_EXCL)) {
4700	spin_lock(lock: &inode->i_lock);
4701	inode_state_set(inode, flags: I_LINKABLE);
4702	spin_unlock(lock: &inode->i_lock);
4703	}
4704	security_inode_post_create_tmpfile(idmap, inode);
4705	return `0`;
4706	}
4707
4708	/**
4709	* kernel_tmpfile_open - open a tmpfile for kernel internal use
4710	* @idmap: idmap of the mount the inode was found from
4711	* @parentpath: path of the base directory
4712	* @mode: mode of the new tmpfile
4713	* @open_flag: flags
4714	* @cred: credentials for open
4715	*
4716	* Create and open a temporary file. The file is not accounted in nr_files,
4717	* hence this is only for kernel internal use, and must not be installed into
4718	* file tables or such.
4719	*/
4720	struct file kernel_tmpfile_open(struct* mnt_idmap *idmap,
4721	const struct path *parentpath,
4722	umode_t mode, int open_flag,
4723	const struct cred *cred)
4724	{
4725	struct file *file;
4726	int error;
4727
4728	file = alloc_empty_file_noaccount(flags: open_flag, cred);
4729	if (IS_ERR(ptr: file))
4730	return file;
4731
4732	error = vfs_tmpfile(idmap, parentpath, file, mode);
4733	if (error) {
4734	fput(file);
4735	file = ERR_PTR(error);
4736	}
4737	return file;
4738	}
4739	EXPORT_SYMBOL(kernel_tmpfile_open);
4740
4741	static int do_tmpfile(struct nameidata nd, unsigned* flags,
4742	const struct open_flags *op,
4743	struct file *file)
4744	{
4745	struct path path;
4746	int error = path_lookupat(nd, flags: flags \| LOOKUP_DIRECTORY, path: &path);
4747
4748	if (unlikely(error))
4749	return error;
4750	error = mnt_want_write(mnt: path.mnt);
4751	if (unlikely(error))
4752	goto out;
4753	error = vfs_tmpfile(idmap: mnt_idmap(mnt: path.mnt), parentpath: &path, file, mode: op->mode);
4754	if (error)
4755	goto out2;
4756	audit_inode(name: nd->name, dentry: file->f_path.dentry, aflags: `0`);
4757	out2:
4758	mnt_drop_write(mnt: path.mnt);
4759	out:
4760	path_put(&path);
4761	return error;
4762	}
4763
4764	static int do_o_path(struct nameidata nd, unsigned* flags, struct file *file)
4765	{
4766	struct path path;
4767	int error = path_lookupat(nd, flags, path: &path);
4768	if (!error) {
4769	audit_inode(name: nd->name, dentry: path.dentry, aflags: `0`);
4770	error = vfs_open(&path, file);
4771	path_put(&path);
4772	}
4773	return error;
4774	}
4775
4776	static struct file path_openat(struct* nameidata *nd,
4777	const struct open_flags op, unsigned* flags)
4778	{
4779	struct file *file;
4780	int error;
4781
4782	file = alloc_empty_file(flags: op->open_flag, current_cred());
4783	if (IS_ERR(ptr: file))
4784	return file;
4785
4786	if (unlikely(file->f_flags & __O_TMPFILE)) {
4787	error = do_tmpfile(nd, flags, op, file);
4788	} else if (unlikely(file->f_flags & O_PATH)) {
4789	error = do_o_path(nd, flags, file);
4790	} else {
4791	const char *s = path_init(nd, flags);
4792	while (!(error = link_path_walk(name: s, nd)) &&
4793	(s = open_last_lookups(nd, file, op)) != NULL)
4794	;
4795	if (!error)
4796	error = do_open(nd, file, op);
4797	terminate_walk(nd);
4798	}
4799	if (likely(!error)) {
4800	if (likely(file->f_mode & FMODE_OPENED))
4801	return file;
4802	WARN_ON(`1`);
4803	error = -EINVAL;
4804	}
4805	fput_close(file);
4806	if (error == -EOPENSTALE) {
4807	if (flags & LOOKUP_RCU)
4808	error = -ECHILD;
4809	else
4810	error = -ESTALE;
4811	}
4812	return ERR_PTR(error);
4813	}
4814
4815	struct file do_filp_open(int* dfd, struct filename *pathname,
4816	const struct open_flags *op)
4817	{
4818	struct nameidata nd;
4819	int flags = op->lookup_flags;
4820	struct file *filp;
4821
4822	set_nameidata(p: &nd, dfd, name: pathname, NULL);
4823	filp = path_openat(nd: &nd, op, flags: flags \| LOOKUP_RCU);
4824	if (unlikely(filp == ERR_PTR(-ECHILD)))
4825	filp = path_openat(nd: &nd, op, flags);
4826	if (unlikely(filp == ERR_PTR(-ESTALE)))
4827	filp = path_openat(nd: &nd, op, flags: flags \| LOOKUP_REVAL);
4828	restore_nameidata();
4829	return filp;
4830	}
4831
4832	struct file do_file_open_root(const* struct path *root,
4833	const char name, const* struct open_flags *op)
4834	{
4835	struct nameidata nd;
4836	struct file *file;
4837	struct filename *filename;
4838	int flags = op->lookup_flags;
4839
4840	if (d_is_symlink(dentry: root->dentry) && op->intent & LOOKUP_OPEN)
4841	return ERR_PTR(error: -ELOOP);
4842
4843	filename = getname_kernel(name);
4844	if (IS_ERR(ptr: filename))
4845	return ERR_CAST(ptr: filename);
4846
4847	set_nameidata(p: &nd, dfd: -`1`, name: filename, root);
4848	file = path_openat(nd: &nd, op, flags: flags \| LOOKUP_RCU);
4849	if (unlikely(file == ERR_PTR(-ECHILD)))
4850	file = path_openat(nd: &nd, op, flags);
4851	if (unlikely(file == ERR_PTR(-ESTALE)))
4852	file = path_openat(nd: &nd, op, flags: flags \| LOOKUP_REVAL);
4853	restore_nameidata();
4854	putname(filename);
4855	return file;
4856	}
4857
4858	static struct dentry filename_create(int* dfd, struct filename *name,
4859	struct path path, unsigned* int lookup_flags)
4860	{
4861	struct dentry *dentry = ERR_PTR(error: -EEXIST);
4862	struct qstr last;
4863	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
4864	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
4865	unsigned int create_flags = LOOKUP_CREATE \| LOOKUP_EXCL;
4866	int type;
4867	int error;
4868
4869	error = filename_parentat(dfd, name, flags: reval_flag, parent: path, last: &last, type: &type);
4870	if (error)
4871	return ERR_PTR(error);
4872
4873	/*
4874	* Yucky last component or no last component at all?
4875	* (foo/., foo/.., /////)
4876	*/
4877	if (unlikely(type != LAST_NORM))
4878	goto out;
4879
4880	/ don't fail immediately if it's r/o, at least try to report other errors /
4881	error = mnt_want_write(mnt: path->mnt);
4882	/*
4883	* Do the final lookup. Suppress 'create' if there is a trailing
4884	* '/', and a directory wasn't requested.
4885	*/
4886	if (last.name[last.len] && !want_dir)
4887	create_flags &= ~LOOKUP_CREATE;
4888	dentry = start_dirop(parent: path->dentry, name: &last, lookup_flags: reval_flag \| create_flags);
4889	if (IS_ERR(ptr: dentry))
4890	goto out_drop_write;
4891
4892	if (unlikely(error))
4893	goto fail;
4894
4895	return dentry;
4896	fail:
4897	end_dirop(dentry);
4898	dentry = ERR_PTR(error);
4899	out_drop_write:
4900	if (!error)
4901	mnt_drop_write(mnt: path->mnt);
4902	out:
4903	path_put(path);
4904	return dentry;
4905	}
4906
/*
 * start_creating_path - filename_create() for a kernel-space pathname
 * @dfd:          directory file descriptor the lookup starts from
 * @pathname:     kernel-space name of the object to create
 * @path:         receives the parent path
 * @lookup_flags: lookup flags passed on to filename_create()
 *
 * Returns: whatever filename_create() returns - a dentry or an ERR_PTR().
 * Pair a successful call with end_creating_path().
 */
struct dentry *start_creating_path(int dfd, const char *pathname,
				   struct path *path, unsigned int lookup_flags)
{
	struct filename *filename = getname_kernel(pathname);
	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);

	putname(filename);
	return res;
}
EXPORT_SYMBOL(start_creating_path);
4917
4918	/**
4919	* end_creating_path - finish a code section started by start_creating_path()
4920	* @path: the path instantiated by start_creating_path()
4921	* @dentry: the dentry returned by start_creating_path()
4922	*
4923	* end_creating_path() will unlock and locks taken by start_creating_path()
4924	* and drop an references that were taken. It should only be called
4925	* if start_creating_path() returned a non-error.
4926	* If vfs_mkdir() was called and it returned an error, that error should
4927	* be passed to end_creating_path() together with the path.
4928	*/
4929	void end_creating_path(const struct path path, struct* dentry *dentry)
4930	{
4931	end_creating(child: dentry);
4932	mnt_drop_write(mnt: path->mnt);
4933	path_put(path);
4934	}
4935	EXPORT_SYMBOL(end_creating_path);
4936
4937	inline struct dentry *start_creating_user_path(
4938	int dfd, const char __user *pathname,
4939	struct path path, unsigned* int lookup_flags)
4940	{
4941	struct filename *filename = getname(name: pathname);
4942	struct dentry *res = filename_create(dfd, name: filename, path, lookup_flags);
4943
4944	putname(filename);
4945	return res;
4946	}
4947	EXPORT_SYMBOL(start_creating_user_path);
4948
4949
4950	/**
4951	* vfs_mknod - create device node or file
4952	* @idmap: idmap of the mount the inode was found from
4953	* @dir: inode of the parent directory
4954	* @dentry: dentry of the child device node
4955	* @mode: mode of the child device node
4956	* @dev: device number of device to create
4957	* @delegated_inode: returns parent inode, if the inode is delegated.
4958	*
4959	* Create a device node or file.
4960	*
4961	* If the inode has been found through an idmapped mount the idmap of
4962	* the vfsmount must be passed through @idmap. This function will then take
4963	* care to map the inode according to @idmap before checking permissions.
4964	* On non-idmapped mounts or if permission checking is to be performed on the
4965	* raw inode simply pass @nop_mnt_idmap.
4966	*/
4967	int vfs_mknod(struct mnt_idmap idmap, struct* inode *dir,
4968	struct dentry *dentry, umode_t mode, dev_t dev,
4969	struct delegated_inode *delegated_inode)
4970	{
4971	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
4972	int error = may_create(idmap, dir, child: dentry);
4973
4974	if (error)
4975	return error;
4976
4977	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) && !is_whiteout &&
4978	!capable(CAP_MKNOD))
4979	return -EPERM;
4980
4981	if (!dir->i_op->mknod)
4982	return -EPERM;
4983
4984	mode = vfs_prepare_mode(idmap, dir, mode, mask_perms: mode, type: mode);
4985	error = devcgroup_inode_mknod(mode, dev);
4986	if (error)
4987	return error;
4988
4989	error = security_inode_mknod(dir, dentry, mode, dev);
4990	if (error)
4991	return error;
4992
4993	error = try_break_deleg(inode: dir, di: delegated_inode);
4994	if (error)
4995	return error;
4996
4997	error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
4998	if (!error)
4999	fsnotify_create(dir, dentry);
5000	return error;
5001	}
5002	EXPORT_SYMBOL(vfs_mknod);
5003
5004	static int may_mknod(umode_t mode)
5005	{
5006	switch (mode & S_IFMT) {
5007	case S_IFREG:
5008	case S_IFCHR:
5009	case S_IFBLK:
5010	case S_IFIFO:
5011	case S_IFSOCK:
5012	case `0`: / zero mode translates to S_IFREG /
5013	return `0`;
5014	case S_IFDIR:
5015	return -EPERM;
5016	default:
5017	return -EINVAL;
5018	}
5019	}
5020
5021	static int do_mknodat(int dfd, struct filename *name, umode_t mode,
5022	unsigned int dev)
5023	{
5024	struct delegated_inode di = { };
5025	struct mnt_idmap *idmap;
5026	struct dentry *dentry;
5027	struct path path;
5028	int error;
5029	unsigned int lookup_flags = `0`;
5030
5031	error = may_mknod(mode);
5032	if (error)
5033	goto out1;
5034	retry:
5035	dentry = filename_create(dfd, name, path: &path, lookup_flags);
5036	error = PTR_ERR(ptr: dentry);
5037	if (IS_ERR(ptr: dentry))
5038	goto out1;
5039
5040	error = security_path_mknod(dir: &path, dentry,
5041	mode: mode_strip_umask(dir: path.dentry->d_inode, mode), dev);
5042	if (error)
5043	goto out2;
5044
5045	idmap = mnt_idmap(mnt: path.mnt);
5046	switch (mode & S_IFMT) {
5047	case `0`: case S_IFREG:
5048	error = vfs_create(idmap, dentry, mode, &di);
5049	if (!error)
5050	security_path_post_mknod(idmap, dentry);
5051	break;
5052	case S_IFCHR: case S_IFBLK:
5053	error = vfs_mknod(idmap, path.dentry->d_inode,
5054	dentry, mode, new_decode_dev(dev), &di);
5055	break;
5056	case S_IFIFO: case S_IFSOCK:
5057	error = vfs_mknod(idmap, path.dentry->d_inode,
5058	dentry, mode, `0`, &di);
5059	break;
5060	}
5061	out2:
5062	end_creating_path(&path, dentry);
5063	if (is_delegated(di: &di)) {
5064	error = break_deleg_wait(di: &di);
5065	if (!error)
5066	goto retry;
5067	}
5068	if (retry_estale(error, flags: lookup_flags)) {
5069	lookup_flags \|= LOOKUP_REVAL;
5070	goto retry;
5071	}
5072	out1:
5073	putname(name);
5074	return error;
5075	}
5076
5077	SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
5078	unsigned int, dev)
5079	{
5080	return do_mknodat(dfd, name: getname(name: filename), mode, dev);
5081	}
5082
5083	SYSCALL_DEFINE3(mknod, const char __user , filename, umode_t, mode, unsigned*, dev)
5084	{
5085	return do_mknodat(AT_FDCWD, name: getname(name: filename), mode, dev);
5086	}
5087
5088	/**
5089	* vfs_mkdir - create directory returning correct dentry if possible
5090	* @idmap: idmap of the mount the inode was found from
5091	* @dir: inode of the parent directory
5092	* @dentry: dentry of the child directory
5093	* @mode: mode of the child directory
5094	* @delegated_inode: returns parent inode, if the inode is delegated.
5095	*
5096	* Create a directory.
5097	*
5098	* If the inode has been found through an idmapped mount the idmap of
5099	* the vfsmount must be passed through @idmap. This function will then take
5100	* care to map the inode according to @idmap before checking permissions.
5101	* On non-idmapped mounts or if permission checking is to be performed on the
5102	* raw inode simply pass @nop_mnt_idmap.
5103	*
5104	* In the event that the filesystem does not use the *@dentry but leaves it
5105	* negative or unhashes it and possibly splices a different one returning it,
5106	* the original dentry is dput() and the alternate is returned.
5107	*
5108	* In case of an error the dentry is dput() and an ERR_PTR() is returned.
5109	*/
5110	struct dentry vfs_mkdir(struct* mnt_idmap idmap, struct* inode *dir,
5111	struct dentry *dentry, umode_t mode,
5112	struct delegated_inode *delegated_inode)
5113	{
5114	int error;
5115	unsigned max_links = dir->i_sb->s_max_links;
5116	struct dentry *de;
5117
5118	error = may_create(idmap, dir, child: dentry);
5119	if (error)
5120	goto err;
5121
5122	error = -EPERM;
5123	if (!dir->i_op->mkdir)
5124	goto err;
5125
5126	mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO \| S_ISVTX, type: `0`);
5127	error = security_inode_mkdir(dir, dentry, mode);
5128	if (error)
5129	goto err;
5130
5131	error = -EMLINK;
5132	if (max_links && dir->i_nlink >= max_links)
5133	goto err;
5134
5135	error = try_break_deleg(inode: dir, di: delegated_inode);
5136	if (error)
5137	goto err;
5138
5139	de = dir->i_op->mkdir(idmap, dir, dentry, mode);
5140	error = PTR_ERR(ptr: de);
5141	if (IS_ERR(ptr: de))
5142	goto err;
5143	if (de) {
5144	dput(dentry);
5145	dentry = de;
5146	}
5147	fsnotify_mkdir(dir, dentry);
5148	return dentry;
5149
5150	err:
5151	end_creating(child: dentry);
5152	return ERR_PTR(error);
5153	}
5154	EXPORT_SYMBOL(vfs_mkdir);
5155
5156	int do_mkdirat(int dfd, struct filename *name, umode_t mode)
5157	{
5158	struct dentry *dentry;
5159	struct path path;
5160	int error;
5161	unsigned int lookup_flags = LOOKUP_DIRECTORY;
5162	struct delegated_inode delegated_inode = { };
5163
5164	retry:
5165	dentry = filename_create(dfd, name, path: &path, lookup_flags);
5166	error = PTR_ERR(ptr: dentry);
5167	if (IS_ERR(ptr: dentry))
5168	goto out_putname;
5169
5170	error = security_path_mkdir(dir: &path, dentry,
5171	mode: mode_strip_umask(dir: path.dentry->d_inode, mode));
5172	if (!error) {
5173	dentry = vfs_mkdir(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
5174	dentry, mode, &delegated_inode);
5175	if (IS_ERR(ptr: dentry))
5176	error = PTR_ERR(ptr: dentry);
5177	}
5178	end_creating_path(&path, dentry);
5179	if (is_delegated(di: &delegated_inode)) {
5180	error = break_deleg_wait(di: &delegated_inode);
5181	if (!error)
5182	goto retry;
5183	}
5184	if (retry_estale(error, flags: lookup_flags)) {
5185	lookup_flags \|= LOOKUP_REVAL;
5186	goto retry;
5187	}
5188	out_putname:
5189	putname(name);
5190	return error;
5191	}
5192
5193	SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
5194	{
5195	return do_mkdirat(dfd, name: getname(name: pathname), mode);
5196	}
5197
5198	SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
5199	{
5200	return do_mkdirat(AT_FDCWD, name: getname(name: pathname), mode);
5201	}
5202
5203	/**
5204	* vfs_rmdir - remove directory
5205	* @idmap: idmap of the mount the inode was found from
5206	* @dir: inode of the parent directory
5207	* @dentry: dentry of the child directory
5208	* @delegated_inode: returns parent inode, if it's delegated.
5209	*
5210	* Remove a directory.
5211	*
5212	* If the inode has been found through an idmapped mount the idmap of
5213	* the vfsmount must be passed through @idmap. This function will then take
5214	* care to map the inode according to @idmap before checking permissions.
5215	* On non-idmapped mounts or if permission checking is to be performed on the
5216	* raw inode simply pass @nop_mnt_idmap.
5217	*/
5218	int vfs_rmdir(struct mnt_idmap idmap, struct* inode *dir,
5219	struct dentry dentry, struct* delegated_inode *delegated_inode)
5220	{
5221	int error = may_delete(idmap, dir, victim: dentry, isdir: `1`);
5222
5223	if (error)
5224	return error;
5225
5226	if (!dir->i_op->rmdir)
5227	return -EPERM;
5228
5229	dget(dentry);
5230	inode_lock(inode: dentry->d_inode);
5231
5232	error = -EBUSY;
5233	if (is_local_mountpoint(dentry) \|\|
5234	(dentry->d_inode->i_flags & S_KERNEL_FILE))
5235	goto out;
5236
5237	error = security_inode_rmdir(dir, dentry);
5238	if (error)
5239	goto out;
5240
5241	error = try_break_deleg(inode: dir, di: delegated_inode);
5242	if (error)
5243	goto out;
5244
5245	error = dir->i_op->rmdir(dir, dentry);
5246	if (error)
5247	goto out;
5248
5249	shrink_dcache_parent(dentry);
5250	dentry->d_inode->i_flags \|= S_DEAD;
5251	dont_mount(dentry);
5252	detach_mounts(dentry);
5253
5254	out:
5255	inode_unlock(inode: dentry->d_inode);
5256	dput(dentry);
5257	if (!error)
5258	d_delete_notify(dir, dentry);
5259	return error;
5260	}
5261	EXPORT_SYMBOL(vfs_rmdir);
5262
5263	int do_rmdir(int dfd, struct filename *name)
5264	{
5265	int error;
5266	struct dentry *dentry;
5267	struct path path;
5268	struct qstr last;
5269	int type;
5270	unsigned int lookup_flags = `0`;
5271	struct delegated_inode delegated_inode = { };
5272	retry:
5273	error = filename_parentat(dfd, name, flags: lookup_flags, parent: &path, last: &last, type: &type);
5274	if (error)
5275	goto exit1;
5276
5277	switch (type) {
5278	case LAST_DOTDOT:
5279	error = -ENOTEMPTY;
5280	goto exit2;
5281	case LAST_DOT:
5282	error = -EINVAL;
5283	goto exit2;
5284	case LAST_ROOT:
5285	error = -EBUSY;
5286	goto exit2;
5287	}
5288
5289	error = mnt_want_write(mnt: path.mnt);
5290	if (error)
5291	goto exit2;
5292
5293	dentry = start_dirop(parent: path.dentry, name: &last, lookup_flags);
5294	error = PTR_ERR(ptr: dentry);
5295	if (IS_ERR(ptr: dentry))
5296	goto exit3;
5297	error = security_path_rmdir(dir: &path, dentry);
5298	if (error)
5299	goto exit4;
5300	error = vfs_rmdir(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
5301	dentry, &delegated_inode);
5302	exit4:
5303	end_dirop(dentry);
5304	exit3:
5305	mnt_drop_write(mnt: path.mnt);
5306	exit2:
5307	path_put(&path);
5308	if (is_delegated(di: &delegated_inode)) {
5309	error = break_deleg_wait(di: &delegated_inode);
5310	if (!error)
5311	goto retry;
5312	}
5313	if (retry_estale(error, flags: lookup_flags)) {
5314	lookup_flags \|= LOOKUP_REVAL;
5315	goto retry;
5316	}
5317	exit1:
5318	putname(name);
5319	return error;
5320	}
5321
5322	SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
5323	{
5324	return do_rmdir(AT_FDCWD, name: getname(name: pathname));
5325	}
5326
5327	/**
5328	* vfs_unlink - unlink a filesystem object
5329	* @idmap: idmap of the mount the inode was found from
5330	* @dir: parent directory
5331	* @dentry: victim
5332	* @delegated_inode: returns victim inode, if the inode is delegated.
5333	*
5334	* The caller must hold dir->i_rwsem exclusively.
5335	*
5336	* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
5337	* return a reference to the inode in delegated_inode. The caller
5338	* should then break the delegation on that inode and retry. Because
5339	* breaking a delegation may take a long time, the caller should drop
5340	* dir->i_rwsem before doing so.
5341	*
5342	* Alternatively, a caller may pass NULL for delegated_inode. This may
5343	* be appropriate for callers that expect the underlying filesystem not
5344	* to be NFS exported.
5345	*
5346	* If the inode has been found through an idmapped mount the idmap of
5347	* the vfsmount must be passed through @idmap. This function will then take
5348	* care to map the inode according to @idmap before checking permissions.
5349	* On non-idmapped mounts or if permission checking is to be performed on the
5350	* raw inode simply pass @nop_mnt_idmap.
5351	*/
5352	int vfs_unlink(struct mnt_idmap idmap, struct* inode *dir,
5353	struct dentry dentry, struct* delegated_inode *delegated_inode)
5354	{
5355	struct inode *target = dentry->d_inode;
5356	int error = may_delete(idmap, dir, victim: dentry, isdir: `0`);
5357
5358	if (error)
5359	return error;
5360
5361	if (!dir->i_op->unlink)
5362	return -EPERM;
5363
5364	inode_lock(inode: target);
5365	if (IS_SWAPFILE(target))
5366	error = -EPERM;
5367	else if (is_local_mountpoint(dentry))
5368	error = -EBUSY;
5369	else {
5370	error = security_inode_unlink(dir, dentry);
5371	if (!error) {
5372	error = try_break_deleg(inode: dir, di: delegated_inode);
5373	if (error)
5374	goto out;
5375	error = try_break_deleg(inode: target, di: delegated_inode);
5376	if (error)
5377	goto out;
5378	error = dir->i_op->unlink(dir, dentry);
5379	if (!error) {
5380	dont_mount(dentry);
5381	detach_mounts(dentry);
5382	}
5383	}
5384	}
5385	out:
5386	inode_unlock(inode: target);
5387
5388	/ We don't d_delete() NFS sillyrenamed files--they still exist. /
5389	if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
5390	fsnotify_unlink(dir, dentry);
5391	} else if (!error) {
5392	fsnotify_link_count(inode: target);
5393	d_delete_notify(dir, dentry);
5394	}
5395
5396	return error;
5397	}
5398	EXPORT_SYMBOL(vfs_unlink);
5399
5400	/*
5401	* Make sure that the actual truncation of the file will occur outside its
5402	* directory's i_rwsem. Truncate can take a long time if there is a lot of
5403	* writeout happening, and we don't want to prevent access to the directory
5404	* while waiting on the I/O.
5405	*/
5406	int do_unlinkat(int dfd, struct filename *name)
5407	{
5408	int error;
5409	struct dentry *dentry;
5410	struct path path;
5411	struct qstr last;
5412	int type;
5413	struct inode *inode;
5414	struct delegated_inode delegated_inode = { };
5415	unsigned int lookup_flags = `0`;
5416	retry:
5417	error = filename_parentat(dfd, name, flags: lookup_flags, parent: &path, last: &last, type: &type);
5418	if (error)
5419	goto exit_putname;
5420
5421	error = -EISDIR;
5422	if (type != LAST_NORM)
5423	goto exit_path_put;
5424
5425	error = mnt_want_write(mnt: path.mnt);
5426	if (error)
5427	goto exit_path_put;
5428	retry_deleg:
5429	dentry = start_dirop(parent: path.dentry, name: &last, lookup_flags);
5430	error = PTR_ERR(ptr: dentry);
5431	if (IS_ERR(ptr: dentry))
5432	goto exit_drop_write;
5433
5434	/ Why not before? Because we want correct error value /
5435	if (unlikely(last.name[last.len])) {
5436	if (d_is_dir(dentry))
5437	error = -EISDIR;
5438	else
5439	error = -ENOTDIR;
5440	end_dirop(dentry);
5441	goto exit_drop_write;
5442	}
5443	inode = dentry->d_inode;
5444	ihold(inode);
5445	error = security_path_unlink(dir: &path, dentry);
5446	if (error)
5447	goto exit_end_dirop;
5448	error = vfs_unlink(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
5449	dentry, &delegated_inode);
5450	exit_end_dirop:
5451	end_dirop(dentry);
5452	iput(inode); / truncate the inode here /
5453	if (is_delegated(di: &delegated_inode)) {
5454	error = break_deleg_wait(di: &delegated_inode);
5455	if (!error)
5456	goto retry_deleg;
5457	}
5458	exit_drop_write:
5459	mnt_drop_write(mnt: path.mnt);
5460	exit_path_put:
5461	path_put(&path);
5462	if (retry_estale(error, flags: lookup_flags)) {
5463	lookup_flags \|= LOOKUP_REVAL;
5464	goto retry;
5465	}
5466	exit_putname:
5467	putname(name);
5468	return error;
5469	}
5470
5471	SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user , pathname, int*, flag)
5472	{
5473	if ((flag & ~AT_REMOVEDIR) != `0`)
5474	return -EINVAL;
5475
5476	if (flag & AT_REMOVEDIR)
5477	return do_rmdir(dfd, name: getname(name: pathname));
5478	return do_unlinkat(dfd, name: getname(name: pathname));
5479	}
5480
5481	SYSCALL_DEFINE1(unlink, const char __user *, pathname)
5482	{
5483	return do_unlinkat(AT_FDCWD, name: getname(name: pathname));
5484	}
5485
5486	/**
5487	* vfs_symlink - create symlink
5488	* @idmap: idmap of the mount the inode was found from
5489	* @dir: inode of the parent directory
5490	* @dentry: dentry of the child symlink file
5491	* @oldname: name of the file to link to
5492	* @delegated_inode: returns victim inode, if the inode is delegated.
5493	*
5494	* Create a symlink.
5495	*
5496	* If the inode has been found through an idmapped mount the idmap of
5497	* the vfsmount must be passed through @idmap. This function will then take
5498	* care to map the inode according to @idmap before checking permissions.
5499	* On non-idmapped mounts or if permission checking is to be performed on the
5500	* raw inode simply pass @nop_mnt_idmap.
5501	*/
5502	int vfs_symlink(struct mnt_idmap idmap, struct* inode *dir,
5503	struct dentry dentry, const* char *oldname,
5504	struct delegated_inode *delegated_inode)
5505	{
5506	int error;
5507
5508	error = may_create(idmap, dir, child: dentry);
5509	if (error)
5510	return error;
5511
5512	if (!dir->i_op->symlink)
5513	return -EPERM;
5514
5515	error = security_inode_symlink(dir, dentry, old_name: oldname);
5516	if (error)
5517	return error;
5518
5519	error = try_break_deleg(inode: dir, di: delegated_inode);
5520	if (error)
5521	return error;
5522
5523	error = dir->i_op->symlink(idmap, dir, dentry, oldname);
5524	if (!error)
5525	fsnotify_create(dir, dentry);
5526	return error;
5527	}
5528	EXPORT_SYMBOL(vfs_symlink);
5529
5530	int do_symlinkat(struct filename from, int* newdfd, struct filename *to)
5531	{
5532	int error;
5533	struct dentry *dentry;
5534	struct path path;
5535	unsigned int lookup_flags = `0`;
5536	struct delegated_inode delegated_inode = { };
5537
5538	if (IS_ERR(ptr: from)) {
5539	error = PTR_ERR(ptr: from);
5540	goto out_putnames;
5541	}
5542	retry:
5543	dentry = filename_create(dfd: newdfd, name: to, path: &path, lookup_flags);
5544	error = PTR_ERR(ptr: dentry);
5545	if (IS_ERR(ptr: dentry))
5546	goto out_putnames;
5547
5548	error = security_path_symlink(dir: &path, dentry, old_name: from->name);
5549	if (!error)
5550	error = vfs_symlink(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
5551	dentry, from->name, &delegated_inode);
5552	end_creating_path(&path, dentry);
5553	if (is_delegated(di: &delegated_inode)) {
5554	error = break_deleg_wait(di: &delegated_inode);
5555	if (!error)
5556	goto retry;
5557	}
5558	if (retry_estale(error, flags: lookup_flags)) {
5559	lookup_flags \|= LOOKUP_REVAL;
5560	goto retry;
5561	}
5562	out_putnames:
5563	putname(to);
5564	putname(from);
5565	return error;
5566	}
5567
5568	SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
5569	int, newdfd, const char __user *, newname)
5570	{
5571	return do_symlinkat(from: getname(name: oldname), newdfd, to: getname(name: newname));
5572	}
5573
5574	SYSCALL_DEFINE2(symlink, const char __user , oldname, const* char __user *, newname)
5575	{
5576	return do_symlinkat(from: getname(name: oldname), AT_FDCWD, to: getname(name: newname));
5577	}
5578
5579	/**
5580	* vfs_link - create a new link
5581	* @old_dentry: object to be linked
5582	* @idmap: idmap of the mount
5583	* @dir: new parent
5584	* @new_dentry: where to create the new link
5585	* @delegated_inode: returns inode needing a delegation break
5586	*
5587	* The caller must hold dir->i_rwsem exclusively.
5588	*
5589	* If vfs_link discovers a delegation on the to-be-linked file in need
5590	* of breaking, it will return -EWOULDBLOCK and return a reference to the
5591	* inode in delegated_inode. The caller should then break the delegation
5592	* and retry. Because breaking a delegation may take a long time, the
5593	* caller should drop the i_rwsem before doing so.
5594	*
5595	* Alternatively, a caller may pass NULL for delegated_inode. This may
5596	* be appropriate for callers that expect the underlying filesystem not
5597	* to be NFS exported.
5598	*
5599	* If the inode has been found through an idmapped mount the idmap of
5600	* the vfsmount must be passed through @idmap. This function will then take
5601	* care to map the inode according to @idmap before checking permissions.
5602	* On non-idmapped mounts or if permission checking is to be performed on the
5603	* raw inode simply pass @nop_mnt_idmap.
5604	*/
5605	int vfs_link(struct dentry old_dentry, struct* mnt_idmap *idmap,
5606	struct inode dir, struct* dentry *new_dentry,
5607	struct delegated_inode *delegated_inode)
5608	{
5609	struct inode *inode = old_dentry->d_inode;
5610	unsigned max_links = dir->i_sb->s_max_links;
5611	int error;
5612
5613	if (!inode)
5614	return -ENOENT;
5615
5616	error = may_create(idmap, dir, child: new_dentry);
5617	if (error)
5618	return error;
5619
5620	if (dir->i_sb != inode->i_sb)
5621	return -EXDEV;
5622
5623	/*
5624	* A link to an append-only or immutable file cannot be created.
5625	*/
5626	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
5627	return -EPERM;
5628	/*
5629	* Updating the link count will likely cause i_uid and i_gid to
5630	* be written back improperly if their true value is unknown to
5631	* the vfs.
5632	*/
5633	if (HAS_UNMAPPED_ID(idmap, inode))
5634	return -EPERM;
5635	if (!dir->i_op->link)
5636	return -EPERM;
5637	if (S_ISDIR(inode->i_mode))
5638	return -EPERM;
5639
5640	error = security_inode_link(old_dentry, dir, new_dentry);
5641	if (error)
5642	return error;
5643
5644	inode_lock(inode);
5645	/ Make sure we don't allow creating hardlink to an unlinked file /
5646	if (inode->i_nlink == `0` && !(inode_state_read_once(inode) & I_LINKABLE))
5647	error = -ENOENT;
5648	else if (max_links && inode->i_nlink >= max_links)
5649	error = -EMLINK;
5650	else {
5651	error = try_break_deleg(inode: dir, di: delegated_inode);
5652	if (!error)
5653	error = try_break_deleg(inode, di: delegated_inode);
5654	if (!error)
5655	error = dir->i_op->link(old_dentry, dir, new_dentry);
5656	}
5657
5658	if (!error && (inode_state_read_once(inode) & I_LINKABLE)) {
5659	spin_lock(lock: &inode->i_lock);
5660	inode_state_clear(inode, flags: I_LINKABLE);
5661	spin_unlock(lock: &inode->i_lock);
5662	}
5663	inode_unlock(inode);
5664	if (!error)
5665	fsnotify_link(dir, inode, new_dentry);
5666	return error;
5667	}
5668	EXPORT_SYMBOL(vfs_link);
5669
5670	/*
5671	* Hardlinks are often used in delicate situations. We avoid
5672	* security-related surprises by not following symlinks on the
5673	* newname. --KAB
5674	*
5675	* We don't follow them on the oldname either to be compatible
5676	* with linux 2.0, and to avoid hard-linking to directories
5677	* and other special files. --ADM
5678	*/
5679	int do_linkat(int olddfd, struct filename old, int* newdfd,
5680	struct filename new, int* flags)
5681	{
5682	struct mnt_idmap *idmap;
5683	struct dentry *new_dentry;
5684	struct path old_path, new_path;
5685	struct delegated_inode delegated_inode = { };
5686	int how = `0`;
5687	int error;
5688
5689	if ((flags & ~(AT_SYMLINK_FOLLOW \| AT_EMPTY_PATH)) != `0`) {
5690	error = -EINVAL;
5691	goto out_putnames;
5692	}
5693	/*
5694	* To use null names we require CAP_DAC_READ_SEARCH or
5695	* that the open-time creds of the dfd matches current.
5696	* This ensures that not everyone will be able to create
5697	* a hardlink using the passed file descriptor.
5698	*/
5699	if (flags & AT_EMPTY_PATH)
5700	how \|= LOOKUP_LINKAT_EMPTY;
5701
5702	if (flags & AT_SYMLINK_FOLLOW)
5703	how \|= LOOKUP_FOLLOW;
5704	retry:
5705	error = filename_lookup(dfd: olddfd, name: old, flags: how, path: &old_path, NULL);
5706	if (error)
5707	goto out_putnames;
5708
5709	new_dentry = filename_create(dfd: newdfd, name: new, path: &new_path,
5710	lookup_flags: (how & LOOKUP_REVAL));
5711	error = PTR_ERR(ptr: new_dentry);
5712	if (IS_ERR(ptr: new_dentry))
5713	goto out_putpath;
5714
5715	error = -EXDEV;
5716	if (old_path.mnt != new_path.mnt)
5717	goto out_dput;
5718	idmap = mnt_idmap(mnt: new_path.mnt);
5719	error = may_linkat(idmap, link: &old_path);
5720	if (unlikely(error))
5721	goto out_dput;
5722	error = security_path_link(old_dentry: old_path.dentry, new_dir: &new_path, new_dentry);
5723	if (error)
5724	goto out_dput;
5725	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
5726	new_dentry, &delegated_inode);
5727	out_dput:
5728	end_creating_path(&new_path, new_dentry);
5729	if (is_delegated(di: &delegated_inode)) {
5730	error = break_deleg_wait(di: &delegated_inode);
5731	if (!error) {
5732	path_put(&old_path);
5733	goto retry;
5734	}
5735	}
5736	if (retry_estale(error, flags: how)) {
5737	path_put(&old_path);
5738	how \|= LOOKUP_REVAL;
5739	goto retry;
5740	}
5741	out_putpath:
5742	path_put(&old_path);
5743	out_putnames:
5744	putname(old);
5745	putname(new);
5746
5747	return error;
5748	}
5749
5750	SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
5751	int, newdfd, const char __user , newname, int*, flags)
5752	{
5753	return do_linkat(olddfd, old: getname_uflags(filename: oldname, uflags: flags),
5754	newdfd, new: getname(name: newname), flags);
5755	}
5756
5757	SYSCALL_DEFINE2(link, const char __user , oldname, const* char __user *, newname)
5758	{
5759	return do_linkat(AT_FDCWD, old: getname(name: oldname), AT_FDCWD, new: getname(name: newname), flags: `0`);
5760	}
5761
5762	/**
5763	* vfs_rename - rename a filesystem object
5764	* @rd: pointer to &struct renamedata info
5765	*
5766	* The caller must hold multiple mutexes--see lock_rename()).
5767	*
5768	* If vfs_rename discovers a delegation in need of breaking at either
5769	* the source or destination, it will return -EWOULDBLOCK and return a
5770	* reference to the inode in delegated_inode. The caller should then
5771	* break the delegation and retry. Because breaking a delegation may
5772	* take a long time, the caller should drop all locks before doing
5773	* so.
5774	*
5775	* Alternatively, a caller may pass NULL for delegated_inode. This may
5776	* be appropriate for callers that expect the underlying filesystem not
5777	* to be NFS exported.
5778	*
5779	* The worst of all namespace operations - renaming directory. "Perverted"
5780	* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
5781	* Problems:
5782	*
5783	* a) we can get into loop creation.
5784	* b) race potential - two innocent renames can create a loop together.
5785	* That's where 4.4BSD screws up. Current fix: serialization on
5786	* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
5787	* story.
5788	* c) we may have to lock up to _four_ objects - parents and victim (if it exists),
5789	* and source (if it's a non-directory or a subdirectory that moves to
5790	* different parent).
5791	* And that - after we got ->i_rwsem on parents (until then we don't know
5792	* whether the target exists). Solution: try to be smart with locking
5793	* order for inodes. We rely on the fact that tree topology may change
5794	* only under ->s_vfs_rename_mutex _and_ that parent of the object we
5795	* move will be locked. Thus we can rank directories by the tree
5796	* (ancestors first) and rank all non-directories after them.
5797	* That works since everybody except rename does "lock parent, lookup,
5798	* lock child" and rename is under ->s_vfs_rename_mutex.
5799	* HOWEVER, it relies on the assumption that any object with ->lookup()
5800	* has no more than 1 dentry. If "hybrid" objects will ever appear,
5801	* we'd better make sure that there's no link(2) for them.
5802	* d) conversion from fhandle to dentry may come in the wrong moment - when
5803	* we are removing the target. Solution: we will have to grab ->i_rwsem
5804	* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
5805	* ->i_rwsem on parents, which works but leads to some truly excessive
5806	* locking].
5807	*/
5808	int vfs_rename(struct renamedata *rd)
5809	{
5810	int error;
5811	struct inode *old_dir = d_inode(dentry: rd->old_parent);
5812	struct inode *new_dir = d_inode(dentry: rd->new_parent);
5813	struct dentry *old_dentry = rd->old_dentry;
5814	struct dentry *new_dentry = rd->new_dentry;
5815	struct delegated_inode *delegated_inode = rd->delegated_inode;
5816	unsigned int flags = rd->flags;
5817	bool is_dir = d_is_dir(dentry: old_dentry);
5818	struct inode *source = old_dentry->d_inode;
5819	struct inode *target = new_dentry->d_inode;
5820	bool new_is_dir = false;
5821	unsigned max_links = new_dir->i_sb->s_max_links;
5822	struct name_snapshot old_name;
5823	bool lock_old_subdir, lock_new_subdir;
5824
5825	if (source == target)
5826	return `0`;
5827
5828	error = may_delete(idmap: rd->mnt_idmap, dir: old_dir, victim: old_dentry, isdir: is_dir);
5829	if (error)
5830	return error;
5831
5832	if (!target) {
5833	error = may_create(idmap: rd->mnt_idmap, dir: new_dir, child: new_dentry);
5834	} else {
5835	new_is_dir = d_is_dir(dentry: new_dentry);
5836
5837	if (!(flags & RENAME_EXCHANGE))
5838	error = may_delete(idmap: rd->mnt_idmap, dir: new_dir,
5839	victim: new_dentry, isdir: is_dir);
5840	else
5841	error = may_delete(idmap: rd->mnt_idmap, dir: new_dir,
5842	victim: new_dentry, isdir: new_is_dir);
5843	}
5844	if (error)
5845	return error;
5846
5847	if (!old_dir->i_op->rename)
5848	return -EPERM;
5849
5850	/*
5851	* If we are going to change the parent - check write permissions,
5852	* we'll need to flip '..'.
5853	*/
5854	if (new_dir != old_dir) {
5855	if (is_dir) {
5856	error = inode_permission(rd->mnt_idmap, source,
5857	MAY_WRITE);
5858	if (error)
5859	return error;
5860	}
5861	if ((flags & RENAME_EXCHANGE) && new_is_dir) {
5862	error = inode_permission(rd->mnt_idmap, target,
5863	MAY_WRITE);
5864	if (error)
5865	return error;
5866	}
5867	}
5868
5869	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
5870	flags);
5871	if (error)
5872	return error;
5873
5874	take_dentry_name_snapshot(&old_name, old_dentry);
5875	dget(dentry: new_dentry);
5876	/*
5877	* Lock children.
5878	* The source subdirectory needs to be locked on cross-directory
5879	* rename or cross-directory exchange since its parent changes.
5880	* The target subdirectory needs to be locked on cross-directory
5881	* exchange due to parent change and on any rename due to becoming
5882	* a victim.
5883	* Non-directories need locking in all cases (for NFS reasons);
5884	* they get locked after any subdirectories (in inode address order).
5885	*
5886	* NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
5887	* NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
5888	*/
5889	lock_old_subdir = new_dir != old_dir;
5890	lock_new_subdir = new_dir != old_dir \|\| !(flags & RENAME_EXCHANGE);
5891	if (is_dir) {
5892	if (lock_old_subdir)
5893	inode_lock_nested(inode: source, subclass: I_MUTEX_CHILD);
5894	if (target && (!new_is_dir \|\| lock_new_subdir))
5895	inode_lock(inode: target);
5896	} else if (new_is_dir) {
5897	if (lock_new_subdir)
5898	inode_lock_nested(inode: target, subclass: I_MUTEX_CHILD);
5899	inode_lock(inode: source);
5900	} else {
5901	lock_two_nondirectories(source, target);
5902	}
5903
5904	error = -EPERM;
5905	if (IS_SWAPFILE(source) \|\| (target && IS_SWAPFILE(target)))
5906	goto out;
5907
5908	error = -EBUSY;
5909	if (is_local_mountpoint(dentry: old_dentry) \|\| is_local_mountpoint(dentry: new_dentry))
5910	goto out;
5911
5912	if (max_links && new_dir != old_dir) {
5913	error = -EMLINK;
5914	if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
5915	goto out;
5916	if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
5917	old_dir->i_nlink >= max_links)
5918	goto out;
5919	}
5920	error = try_break_deleg(inode: old_dir, di: delegated_inode);
5921	if (error)
5922	goto out;
5923	if (new_dir != old_dir) {
5924	error = try_break_deleg(inode: new_dir, di: delegated_inode);
5925	if (error)
5926	goto out;
5927	}
5928	if (!is_dir) {
5929	error = try_break_deleg(inode: source, di: delegated_inode);
5930	if (error)
5931	goto out;
5932	}
5933	if (target && !new_is_dir) {
5934	error = try_break_deleg(inode: target, di: delegated_inode);
5935	if (error)
5936	goto out;
5937	}
5938	error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry,
5939	new_dir, new_dentry, flags);
5940	if (error)
5941	goto out;
5942
5943	if (!(flags & RENAME_EXCHANGE) && target) {
5944	if (is_dir) {
5945	shrink_dcache_parent(new_dentry);
5946	target->i_flags \|= S_DEAD;
5947	}
5948	dont_mount(dentry: new_dentry);
5949	detach_mounts(dentry: new_dentry);
5950	}
5951	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
5952	if (!(flags & RENAME_EXCHANGE))
5953	d_move(old_dentry, new_dentry);
5954	else
5955	d_exchange(old_dentry, new_dentry);
5956	}
5957	out:
5958	if (!is_dir \|\| lock_old_subdir)
5959	inode_unlock(inode: source);
5960	if (target && (!new_is_dir \|\| lock_new_subdir))
5961	inode_unlock(inode: target);
5962	dput(new_dentry);
5963	if (!error) {
5964	fsnotify_move(old_dir, new_dir, old_name: &old_name.name, isdir: is_dir,
5965	target: !(flags & RENAME_EXCHANGE) ? target : NULL, moved: old_dentry);
5966	if (flags & RENAME_EXCHANGE) {
5967	fsnotify_move(old_dir: new_dir, new_dir: old_dir, old_name: &old_dentry->d_name,
5968	isdir: new_is_dir, NULL, moved: new_dentry);
5969	}
5970	}
5971	release_dentry_name_snapshot(&old_name);
5972
5973	return error;
5974	}
5975	EXPORT_SYMBOL(vfs_rename);
5976
5977	int do_renameat2(int olddfd, struct filename from, int* newdfd,
5978	struct filename to, unsigned* int flags)
5979	{
5980	struct renamedata rd;
5981	struct path old_path, new_path;
5982	struct qstr old_last, new_last;
5983	int old_type, new_type;
5984	struct delegated_inode delegated_inode = { };
5985	unsigned int lookup_flags = `0`;
5986	bool should_retry = false;
5987	int error = -EINVAL;
5988
5989	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
5990	goto put_names;
5991
5992	if ((flags & (RENAME_NOREPLACE \| RENAME_WHITEOUT)) &&
5993	(flags & RENAME_EXCHANGE))
5994	goto put_names;
5995
5996	retry:
5997	error = filename_parentat(dfd: olddfd, name: from, flags: lookup_flags, parent: &old_path,
5998	last: &old_last, type: &old_type);
5999	if (error)
6000	goto put_names;
6001
6002	error = filename_parentat(dfd: newdfd, name: to, flags: lookup_flags, parent: &new_path, last: &new_last,
6003	type: &new_type);
6004	if (error)
6005	goto exit1;
6006
6007	error = -EXDEV;
6008	if (old_path.mnt != new_path.mnt)
6009	goto exit2;
6010
6011	error = -EBUSY;
6012	if (old_type != LAST_NORM)
6013	goto exit2;
6014
6015	if (flags & RENAME_NOREPLACE)
6016	error = -EEXIST;
6017	if (new_type != LAST_NORM)
6018	goto exit2;
6019
6020	error = mnt_want_write(mnt: old_path.mnt);
6021	if (error)
6022	goto exit2;
6023
6024	retry_deleg:
6025	rd.old_parent = old_path.dentry;
6026	rd.mnt_idmap = mnt_idmap(mnt: old_path.mnt);
6027	rd.new_parent = new_path.dentry;
6028	rd.delegated_inode = &delegated_inode;
6029	rd.flags = flags;
6030
6031	error = __start_renaming(rd: &rd, lookup_flags, old_last: &old_last, new_last: &new_last);
6032	if (error)
6033	goto exit_lock_rename;
6034
6035	if (flags & RENAME_EXCHANGE) {
6036	if (!d_is_dir(dentry: rd.new_dentry)) {
6037	error = -ENOTDIR;
6038	if (new_last.name[new_last.len])
6039	goto exit_unlock;
6040	}
6041	}
6042	/ unless the source is a directory trailing slashes give -ENOTDIR /
6043	if (!d_is_dir(dentry: rd.old_dentry)) {
6044	error = -ENOTDIR;
6045	if (old_last.name[old_last.len])
6046	goto exit_unlock;
6047	if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
6048	goto exit_unlock;
6049	}
6050
6051	error = security_path_rename(old_dir: &old_path, old_dentry: rd.old_dentry,
6052	new_dir: &new_path, new_dentry: rd.new_dentry, flags);
6053	if (error)
6054	goto exit_unlock;
6055
6056	error = vfs_rename(&rd);
6057	exit_unlock:
6058	end_renaming(&rd);
6059	exit_lock_rename:
6060	if (is_delegated(di: &delegated_inode)) {
6061	error = break_deleg_wait(di: &delegated_inode);
6062	if (!error)
6063	goto retry_deleg;
6064	}
6065	mnt_drop_write(mnt: old_path.mnt);
6066	exit2:
6067	if (retry_estale(error, flags: lookup_flags))
6068	should_retry = true;
6069	path_put(&new_path);
6070	exit1:
6071	path_put(&old_path);
6072	if (should_retry) {
6073	should_retry = false;
6074	lookup_flags \|= LOOKUP_REVAL;
6075	goto retry;
6076	}
6077	put_names:
6078	putname(from);
6079	putname(to);
6080	return error;
6081	}
6082
6083	SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
6084	int, newdfd, const char __user , newname, unsigned* int, flags)
6085	{
6086	return do_renameat2(olddfd, from: getname(name: oldname), newdfd, to: getname(name: newname),
6087	flags);
6088	}
6089
6090	SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
6091	int, newdfd, const char __user *, newname)
6092	{
6093	return do_renameat2(olddfd, from: getname(name: oldname), newdfd, to: getname(name: newname),
6094	flags: `0`);
6095	}
6096
6097	SYSCALL_DEFINE2(rename, const char __user , oldname, const* char __user *, newname)
6098	{
6099	return do_renameat2(AT_FDCWD, from: getname(name: oldname), AT_FDCWD,
6100	to: getname(name: newname), flags: `0`);
6101	}
6102
6103	int readlink_copy(char __user buffer, int* buflen, const char link, int* linklen)
6104	{
6105	int copylen;
6106
6107	copylen = linklen;
6108	if (unlikely(copylen > (unsigned) buflen))
6109	copylen = buflen;
6110	if (copy_to_user(to: buffer, from: link, n: copylen))
6111	copylen = -EFAULT;
6112	return copylen;
6113	}
6114
6115	/**
6116	* vfs_readlink - copy symlink body into userspace buffer
6117	* @dentry: dentry on which to get symbolic link
6118	* @buffer: user memory pointer
6119	* @buflen: size of buffer
6120	*
6121	* Does not touch atime. That's up to the caller if necessary
6122	*
6123	* Does not call security hook.
6124	*/
6125	int vfs_readlink(struct dentry dentry, char* __user buffer, int* buflen)
6126	{
6127	struct inode *inode = d_inode(dentry);
6128	DEFINE_DELAYED_CALL(done);
6129	const char *link;
6130	int res;
6131
6132	if (inode->i_opflags & IOP_CACHED_LINK)
6133	return readlink_copy(buffer, buflen, link: inode->i_link, linklen: inode->i_linklen);
6134
6135	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
6136	if (unlikely(inode->i_op->readlink))
6137	return inode->i_op->readlink(dentry, buffer, buflen);
6138
6139	if (!d_is_symlink(dentry))
6140	return -EINVAL;
6141
6142	spin_lock(lock: &inode->i_lock);
6143	inode->i_opflags \|= IOP_DEFAULT_READLINK;
6144	spin_unlock(lock: &inode->i_lock);
6145	}
6146
6147	link = READ_ONCE(inode->i_link);
6148	if (!link) {
6149	link = inode->i_op->get_link(dentry, inode, &done);
6150	if (IS_ERR(ptr: link))
6151	return PTR_ERR(ptr: link);
6152	}
6153	res = readlink_copy(buffer, buflen, link, strlen(link));
6154	do_delayed_call(call: &done);
6155	return res;
6156	}
6157	EXPORT_SYMBOL(vfs_readlink);
6158
6159	/**
6160	* vfs_get_link - get symlink body
6161	* @dentry: dentry on which to get symbolic link
6162	* @done: caller needs to free returned data with this
6163	*
6164	* Calls security hook and i_op->get_link() on the supplied inode.
6165	*
6166	* It does not touch atime. That's up to the caller if necessary.
6167	*
6168	* Does not work on "special" symlinks like /proc/$$/fd/N
6169	*/
6170	const char vfs_get_link(struct* dentry dentry, struct* delayed_call *done)
6171	{
6172	const char *res = ERR_PTR(error: -EINVAL);
6173	struct inode *inode = d_inode(dentry);
6174
6175	if (d_is_symlink(dentry)) {
6176	res = ERR_PTR(error: security_inode_readlink(dentry));
6177	if (!res)
6178	res = inode->i_op->get_link(dentry, inode, done);
6179	}
6180	return res;
6181	}
6182	EXPORT_SYMBOL(vfs_get_link);
6183
6184	/ get the link contents into pagecache /
6185	static char __page_get_link(struct* dentry dentry, struct* inode *inode,
6186	struct delayed_call *callback)
6187	{
6188	struct folio *folio;
6189	struct address_space *mapping = inode->i_mapping;
6190
6191	if (!dentry) {
6192	folio = filemap_get_folio(mapping, index: `0`);
6193	if (IS_ERR(ptr: folio))
6194	return ERR_PTR(error: -ECHILD);
6195	if (!folio_test_uptodate(folio)) {
6196	folio_put(folio);
6197	return ERR_PTR(error: -ECHILD);
6198	}
6199	} else {
6200	folio = read_mapping_folio(mapping, index: `0`, NULL);
6201	if (IS_ERR(ptr: folio))
6202	return ERR_CAST(ptr: folio);
6203	}
6204	set_delayed_call(call: callback, fn: page_put_link, arg: folio);
6205	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
6206	return folio_address(folio);
6207	}
6208
/*
 * Exported wrapper around __page_get_link(): returns the address of the
 * symlink's folio exactly as obtained from the page cache.  Unlike
 * page_get_link() it does not call nd_terminate_link() on the result.
 */
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
			      struct delayed_call *callback)
{
	return __page_get_link(dentry, inode, callback);
}
EXPORT_SYMBOL_GPL(page_get_link_raw);
6215
6216	/**
6217	* page_get_link() - An implementation of the get_link inode_operation.
6218	* @dentry: The directory entry which is the symlink.
6219	* @inode: The inode for the symlink.
6220	* @callback: Used to drop the reference to the symlink.
6221	*
6222	* Filesystems which store their symlinks in the page cache should use
6223	* this to implement the get_link() member of their inode_operations.
6224	*
6225	* Return: A pointer to the NUL-terminated symlink.
6226	*/
6227	const char page_get_link(struct* dentry dentry, struct* inode *inode,
6228	struct delayed_call *callback)
6229	{
6230	char *kaddr = __page_get_link(dentry, inode, callback);
6231
6232	if (!IS_ERR(ptr: kaddr))
6233	nd_terminate_link(name: kaddr, len: inode->i_size, PAGE_SIZE - `1`);
6234	return kaddr;
6235	}
6236	EXPORT_SYMBOL(page_get_link);
6237
/**
 * page_put_link() - Drop the reference to the symlink.
 * @arg: The folio which contains the symlink.
 *
 * This is used internally by page_get_link().  It is exported for use
 * by filesystems which need to implement a variant of page_get_link()
 * themselves.  Despite the apparent symmetry, filesystems which use
 * page_get_link() do not need to call page_put_link().
 *
 * The argument, while it has a void pointer type, must be a pointer to
 * the folio which was retrieved from the page cache.  The delayed_call
 * infrastructure is used to drop the reference count once the caller
 * is done with the symlink.
 */
void page_put_link(void *arg)
{
	folio_put(arg);
}
EXPORT_SYMBOL(page_put_link);
6257
6258	int page_readlink(struct dentry dentry, char* __user buffer, int* buflen)
6259	{
6260	const char *link;
6261	int res;
6262
6263	DEFINE_DELAYED_CALL(done);
6264	link = page_get_link(dentry, d_inode(dentry), &done);
6265	res = PTR_ERR(ptr: link);
6266	if (!IS_ERR(ptr: link))
6267	res = readlink_copy(buffer, buflen, link, strlen(link));
6268	do_delayed_call(call: &done);
6269	return res;
6270	}
6271	EXPORT_SYMBOL(page_readlink);
6272
6273	int page_symlink(struct inode inode, const* char symname, int* len)
6274	{
6275	struct address_space *mapping = inode->i_mapping;
6276	const struct address_space_operations *aops = mapping->a_ops;
6277	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
6278	struct folio *folio;
6279	void *fsdata = NULL;
6280	int err;
6281	unsigned int flags;
6282
6283	retry:
6284	if (nofs)
6285	flags = memalloc_nofs_save();
6286	err = aops->write_begin(NULL, mapping, `0`, len-`1`, &folio, &fsdata);
6287	if (nofs)
6288	memalloc_nofs_restore(flags);
6289	if (err)
6290	goto fail;
6291
6292	memcpy(folio_address(folio), symname, len - `1`);
6293
6294	err = aops->write_end(NULL, mapping, `0`, len - `1`, len - `1`,
6295	folio, fsdata);
6296	if (err < `0`)
6297	goto fail;
6298	if (err < len-`1`)
6299	goto retry;
6300
6301	mark_inode_dirty(inode);
6302	return `0`;
6303	fail:
6304	return err;
6305	}
6306	EXPORT_SYMBOL(page_symlink);
6307
6308	const struct inode_operations page_symlink_inode_operations = {
6309	.get_link = page_get_link,
6310	};
6311	EXPORT_SYMBOL(page_symlink_inode_operations);
6312

source code of linux/fs/namei.c