tree-log.c source code [linux/fs/btrfs/tree-log.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2008 Oracle. All rights reserved.
4	*/
5
6	#include <linux/sched.h>
7	#include <linux/slab.h>
8	#include <linux/blkdev.h>
9	#include <linux/list_sort.h>
10	#include <linux/iversion.h>
11	#include "misc.h"
12	#include "ctree.h"
13	#include "tree-log.h"
14	#include "disk-io.h"
15	#include "locking.h"
16	#include "backref.h"
17	#include "compression.h"
18	#include "qgroup.h"
19	#include "block-group.h"
20	#include "space-info.h"
21	#include "inode-item.h"
22	#include "fs.h"
23	#include "accessors.h"
24	#include "extent-tree.h"
25	#include "root-tree.h"
26	#include "dir-item.h"
27	#include "file-item.h"
28	#include "file.h"
29	#include "orphan.h"
30	#include "print-tree.h"
31	#include "tree-checker.h"
32	#include "delayed-inode.h"
33
34	#define MAX_CONFLICT_INODES 10
35
/*
 * Magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
};
46
47	/*
48	* directory trouble cases
49	*
50	* 1) on rename or unlink, if the inode being unlinked isn't in the fsync
51	* log, we must force a full commit before doing an fsync of the directory
52	* where the unlink was done.
53	* ---> record transid of last unlink/rename per directory
54	*
55	* mkdir foo/some_dir
56	* normal commit
57	* rename foo/some_dir foo2/some_dir
58	* mkdir foo/some_dir
59	* fsync foo/some_dir/some_file
60	*
61	* The fsync above will unlink the original some_dir without recording
62	* it in its new location (foo2). After a crash, some_dir will be gone
63	* unless the fsync of some_file forces a full commit
64	*
65	* 2) we must log any new names for any file or dir that is in the fsync
66	* log. ---> check inode while renaming/linking.
67	*
68	* 2a) we must log any new names for any file or dir during rename
69	* when the directory they are being removed from was logged.
70	* ---> check inode and old parent dir during rename
71	*
72	* 2a is actually the more important variant. Without the extra logging
73	* a crash might unlink the old name without recreating the new one
74	*
75	* 3) after a crash, we must go through any directories with a link count
76	* of zero and redo the rm -rf
77	*
78	* mkdir f1/foo
79	* normal commit
80	* rm -rf f1/foo
81	* fsync(f1)
82	*
83	* The directory f1/foo was fully removed from the FS, but fsync was never
84	* called on f1/foo, only on its parent dir f1. After a crash the rm -rf must
85	* be replayed. This must be able to recurse down the entire
86	* directory tree. The inode link count fixup code takes care of the
87	* ugly details.
88	*/
89
/*
 * Stages for the tree walking done while processing a log tree, listed in
 * the order they are declared below.
 *
 * The first stage (LOG_WALK_PIN_ONLY) is to only pin down the blocks we find.
 * The second stage (LOG_WALK_REPLAY_INODES) is to make sure that all the
 * inodes we find in the log are created in the subvolume.
 *
 * The remaining stages (LOG_WALK_REPLAY_DIR_INDEX and LOG_WALK_REPLAY_ALL)
 * deal with directories and links and extents and all the other fun
 * semantics.
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};
105
106	/*
107	* The walk control struct is used to pass state down the chain when processing
108	* the log tree. The stage field tells us which part of the log tree processing
109	* we are currently doing.
110	*/
111	struct walk_control {
112	/*
113	* Signal that we are freeing the metadata extents of a log tree.
114	* This is used at transaction commit time while freeing a log tree.
115	*/
116	bool free;
117
118	/*
119	* Signal that we are pinning the metadata extents of a log tree and the
120	* data extents its leaves point to (if using mixed block groups).
121	* This happens in the first stage of log replay to ensure that during
122	* replay, while we are modifying subvolume trees, we don't overwrite
123	* the metadata extents of log trees.
124	*/
125	bool pin;
126
127	/ What stage of the replay code we're currently in. /
128	int stage;
129
130	/*
131	* Ignore any items from the inode currently being processed. Needs
132	* to be set every time we find a BTRFS_INODE_ITEM_KEY.
133	*/
134	bool ignore_cur_inode;
135
136	/*
137	* The root we are currently replaying to. This is NULL for the replay
138	* stage LOG_WALK_PIN_ONLY.
139	*/
140	struct btrfs_root *root;
141
142	/ The log tree we are currently processing (not NULL for any stage). /
143	struct btrfs_root *log;
144
145	/ The transaction handle used for replaying all log trees. /
146	struct btrfs_trans_handle *trans;
147
148	/*
149	* The function that gets used to process blocks we find in the tree.
150	* Note the extent_buffer might not be up to date when it is passed in,
151	* and it must be checked or read if you need the data inside it.
152	*/
153	int (process_func)(struct* extent_buffer *eb,
154	struct walk_control wc, u64 gen, int* level);
155
156	/*
157	* The following are used only when stage is >= LOG_WALK_REPLAY_INODES
158	* and by the replay_one_buffer() callback.
159	*/
160
161	/ The current log leaf being processed. /
162	struct extent_buffer *log_leaf;
163	/ The key being processed of the current log leaf. /
164	struct btrfs_key log_key;
165	/ The slot being processed of the current log leaf. /
166	int log_slot;
167
168	/ A path used for searches and modifications to subvolume trees. /
169	struct btrfs_path *subvol_path;
170	};
171
172	static void do_abort_log_replay(struct walk_control wc, const* char *function,
173	unsigned int line, int error, const char *fmt, ...)
174	{
175	struct btrfs_fs_info *fs_info = wc->trans->fs_info;
176	struct va_format vaf;
177	va_list args;
178
179	/*
180	* Do nothing if we already aborted, to avoid dumping leaves again which
181	* can be verbose. Further more, only the first call is useful since it
182	* is where we have a problem. Note that we do not use the flag
183	* BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
184	* are outside of tree-log.c that can abort transactions (such as
185	* btrfs_add_link() for example), so if that happens we still want to
186	* dump all log replay specific information below.
187	*/
188	if (test_and_set_bit(nr: BTRFS_FS_STATE_LOG_REPLAY_ABORTED, addr: &fs_info->fs_state))
189	return;
190
191	btrfs_abort_transaction(wc->trans, error);
192
193	if (wc->subvol_path && wc->subvol_path->nodes[`0`]) {
194	btrfs_crit(fs_info,
195	"subvolume (root %llu) leaf currently being processed:",
196	btrfs_root_id(wc->root));
197	btrfs_print_leaf(l: wc->subvol_path->nodes[`0`]);
198	}
199
200	if (wc->log_leaf) {
201	btrfs_crit(fs_info,
202	"log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):",
203	btrfs_root_id(wc->root), wc->log_slot,
204	BTRFS_KEY_FMT_VALUE(&wc->log_key));
205	btrfs_print_leaf(l: wc->log_leaf);
206	}
207
208	va_start(args, fmt);
209	vaf.fmt = fmt;
210	vaf.va = &args;
211
212	btrfs_crit(fs_info,
213	"log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
214	function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
215
216	va_end(args);
217	}
218
/*
 * Use this for aborting a transaction during log replay while we are down the
 * call chain of replay_one_buffer(), so that we get a lot more useful
 * information for debugging issues when compared to a plain call to
 * btrfs_abort_transaction().
 *
 * The calling function name and line number are captured at the call site
 * and forwarded to do_abort_log_replay().
 */
#define btrfs_abort_log_replay(wc, error, fmt, args...)	\
	do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
227
228	static int btrfs_log_inode(struct btrfs_trans_handle *trans,
229	struct btrfs_inode *inode,
230	int inode_only,
231	struct btrfs_log_ctx *ctx);
232	static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
233	static noinline int replay_dir_deletes(struct walk_control *wc,
234	u64 dirid, bool del_all);
235	static void wait_log_commit(struct btrfs_root root, int* transid);
236
237	/*
238	* tree logging is a special write ahead log used to make sure that
239	* fsyncs and O_SYNCs can happen without doing full tree commits.
240	*
241	* Full tree commits are expensive because they require commonly
242	* modified blocks to be recowed, creating many dirty pages in the
243	* extent tree and a 4x-6x higher write load than ext3.
244	*
245	* Instead of doing a tree commit on every fsync, we use the
246	* key ranges and transaction ids to find items for a given file or directory
247	* that have changed in this transaction. Those items are copied into
248	* a special tree (one per subvolume root), that tree is written to disk
249	* and then the fsync is considered complete.
250	*
251	* After a crash, items are copied out of the log-tree back into the
252	* subvolume tree. Any file data extents found are recorded in the extent
253	* allocation tree, and the log-tree freed.
254	*
255	* The log tree is read several times (see the LOG_WALK_* stages above): once
256	* to pin down in ram all the extents it is using, once to create all the
257	* inodes logged in the tree, and then again to replay all the other items.
258	*/
259
260	static struct btrfs_inode btrfs_iget_logging(u64 objectid, struct* btrfs_root *root)
261	{
262	unsigned int nofs_flag;
263	struct btrfs_inode *inode;
264
265	/ Only meant to be called for subvolume roots and not for log roots. /
266	ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root));
267
268	/*
269	* We're holding a transaction handle whether we are logging or
270	* replaying a log tree, so we must make sure NOFS semantics apply
271	* because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
272	* to allocate an inode, which can recurse back into the filesystem and
273	* attempt a transaction commit, resulting in a deadlock.
274	*/
275	nofs_flag = memalloc_nofs_save();
276	inode = btrfs_iget(ino: objectid, root);
277	memalloc_nofs_restore(flags: nofs_flag);
278
279	return inode;
280	}
281
282	/*
283	* start a sub transaction and setup the log tree
284	* this increments the log tree writer count to make the people
285	* syncing the tree wait for us to finish
286	*/
287	static int start_log_trans(struct btrfs_trans_handle *trans,
288	struct btrfs_root *root,
289	struct btrfs_log_ctx *ctx)
290	{
291	struct btrfs_fs_info *fs_info = root->fs_info;
292	struct btrfs_root *tree_root = fs_info->tree_root;
293	const bool zoned = btrfs_is_zoned(fs_info);
294	int ret = `0`;
295	bool created = false;
296
297	/*
298	* First check if the log root tree was already created. If not, create
299	* it before locking the root's log_mutex, just to keep lockdep happy.
300	*/
301	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
302	mutex_lock(&tree_root->log_mutex);
303	if (!fs_info->log_root_tree) {
304	ret = btrfs_init_log_root_tree(trans, fs_info);
305	if (!ret) {
306	set_bit(nr: BTRFS_ROOT_HAS_LOG_TREE, addr: &tree_root->state);
307	created = true;
308	}
309	}
310	mutex_unlock(lock: &tree_root->log_mutex);
311	if (ret)
312	return ret;
313	}
314
315	mutex_lock(&root->log_mutex);
316
317	again:
318	if (root->log_root) {
319	int index = (root->log_transid + `1`) % `2`;
320
321	if (btrfs_need_log_full_commit(trans)) {
322	ret = BTRFS_LOG_FORCE_COMMIT;
323	goto out;
324	}
325
326	if (zoned && atomic_read(v: &root->log_commit[index])) {
327	wait_log_commit(root, transid: root->log_transid - `1`);
328	goto again;
329	}
330
331	if (!root->log_start_pid) {
332	clear_bit(nr: BTRFS_ROOT_MULTI_LOG_TASKS, addr: &root->state);
333	root->log_start_pid = current->pid;
334	} else if (root->log_start_pid != current->pid) {
335	set_bit(nr: BTRFS_ROOT_MULTI_LOG_TASKS, addr: &root->state);
336	}
337	} else {
338	/*
339	* This means fs_info->log_root_tree was already created
340	* for some other FS trees. Do the full commit not to mix
341	* nodes from multiple log transactions to do sequential
342	* writing.
343	*/
344	if (zoned && !created) {
345	ret = BTRFS_LOG_FORCE_COMMIT;
346	goto out;
347	}
348
349	ret = btrfs_add_log_tree(trans, root);
350	if (ret)
351	goto out;
352
353	set_bit(nr: BTRFS_ROOT_HAS_LOG_TREE, addr: &root->state);
354	clear_bit(nr: BTRFS_ROOT_MULTI_LOG_TASKS, addr: &root->state);
355	root->log_start_pid = current->pid;
356	}
357
358	atomic_inc(v: &root->log_writers);
359	if (!ctx->logging_new_name) {
360	int index = root->log_transid % `2`;
361	list_add_tail(new: &ctx->list, head: &root->log_ctxs[index]);
362	ctx->log_transid = root->log_transid;
363	}
364
365	out:
366	mutex_unlock(lock: &root->log_mutex);
367	return ret;
368	}
369
370	/*
371	* returns 0 if there was a log transaction running and we were able
372	* to join, or returns -ENOENT if there were not transactions
373	* in progress
374	*/
375	static int join_running_log_trans(struct btrfs_root *root)
376	{
377	const bool zoned = btrfs_is_zoned(fs_info: root->fs_info);
378	int ret = -ENOENT;
379
380	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
381	return ret;
382
383	mutex_lock(&root->log_mutex);
384	again:
385	if (root->log_root) {
386	int index = (root->log_transid + `1`) % `2`;
387
388	ret = `0`;
389	if (zoned && atomic_read(v: &root->log_commit[index])) {
390	wait_log_commit(root, transid: root->log_transid - `1`);
391	goto again;
392	}
393	atomic_inc(v: &root->log_writers);
394	}
395	mutex_unlock(lock: &root->log_mutex);
396	return ret;
397	}
398
399	/*
400	* This either makes the current running log transaction wait
401	* until you call btrfs_end_log_trans() or it makes any future
402	* log transactions wait until you call btrfs_end_log_trans()
403	*/
404	void btrfs_pin_log_trans(struct btrfs_root *root)
405	{
406	atomic_inc(v: &root->log_writers);
407	}
408
409	/*
410	* indicate we're done making changes to the log tree
411	* and wake up anyone waiting to do a sync
412	*/
413	void btrfs_end_log_trans(struct btrfs_root *root)
414	{
415	if (atomic_dec_and_test(v: &root->log_writers)) {
416	/ atomic_dec_and_test implies a barrier /
417	cond_wake_up_nomb(wq: &root->log_writer_wait);
418	}
419	}
420
421	/*
422	* process_func used to pin down extents, write them or wait on them
423	*/
424	static int process_one_buffer(struct extent_buffer *eb,
425	struct walk_control wc, u64 gen, int* level)
426	{
427	struct btrfs_root *log = wc->log;
428	struct btrfs_trans_handle *trans = wc->trans;
429	struct btrfs_fs_info *fs_info = log->fs_info;
430	int ret = `0`;
431
432	/*
433	* If this fs is mixed then we need to be able to process the leaves to
434	* pin down any logged extents, so we have to read the block.
435	*/
436	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
437	struct btrfs_tree_parent_check check = {
438	.level = level,
439	.transid = gen
440	};
441
442	ret = btrfs_read_extent_buffer(buf: eb, check: &check);
443	if (unlikely(ret)) {
444	if (trans)
445	btrfs_abort_transaction(trans, ret);
446	else
447	btrfs_handle_fs_error(fs_info, ret, NULL);
448	return ret;
449	}
450	}
451
452	if (wc->pin) {
453	ASSERT(trans != NULL);
454	ret = btrfs_pin_extent_for_log_replay(trans, eb);
455	if (unlikely(ret)) {
456	btrfs_abort_transaction(trans, ret);
457	return ret;
458	}
459
460	if (btrfs_buffer_uptodate(buf: eb, parent_transid: gen, atomic: false) && level == `0`) {
461	ret = btrfs_exclude_logged_extents(eb);
462	if (ret)
463	btrfs_abort_transaction(trans, ret);
464	}
465	}
466	return ret;
467	}
468
469	/*
470	* Item overwrite used by log replay. The given log tree leaf, slot and key
471	* from the walk_control structure all refer to the source data we are copying
472	* out.
473	*
474	* The given root is for the tree we are copying into, and path is a scratch
475	* path for use in this function (it should be released on entry and will be
476	* released on exit).
477	*
478	* If the key is already in the destination tree the existing item is
479	* overwritten. If the existing item isn't big enough, it is extended.
480	* If it is too large, it is truncated.
481	*
482	* If the key isn't in the destination yet, a new item is inserted.
483	*/
484	static int overwrite_item(struct walk_control *wc)
485	{
486	struct btrfs_trans_handle *trans = wc->trans;
487	struct btrfs_root *root = wc->root;
488	int ret;
489	u32 item_size;
490	u64 saved_i_size = `0`;
491	int save_old_i_size = `0`;
492	unsigned long src_ptr;
493	unsigned long dst_ptr;
494	struct extent_buffer *dst_eb;
495	int dst_slot;
496	const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
497
498	/*
499	* This is only used during log replay, so the root is always from a
500	* fs/subvolume tree. In case we ever need to support a log root, then
501	* we'll have to clone the leaf in the path, release the path and use
502	* the leaf before writing into the log tree. See the comments at
503	* copy_items() for more details.
504	*/
505	ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root));
506
507	item_size = btrfs_item_size(eb: wc->log_leaf, slot: wc->log_slot);
508	src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
509
510	/ Look for the key in the destination tree. /
511	ret = btrfs_search_slot(NULL, root, key: &wc->log_key, p: wc->subvol_path, ins_len: `0`, cow: `0`);
512	if (ret < `0`) {
513	btrfs_abort_log_replay(wc, ret,
514	"failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
515	BTRFS_KEY_FMT_VALUE(&wc->log_key),
516	btrfs_root_id(root));
517	return ret;
518	}
519
520	dst_eb = wc->subvol_path->nodes[`0`];
521	dst_slot = wc->subvol_path->slots[`0`];
522
523	if (ret == `0`) {
524	char *src_copy;
525	const u32 dst_size = btrfs_item_size(eb: dst_eb, slot: dst_slot);
526
527	if (dst_size != item_size)
528	goto insert;
529
530	if (item_size == `0`) {
531	btrfs_release_path(p: wc->subvol_path);
532	return `0`;
533	}
534	src_copy = kmalloc(item_size, GFP_NOFS);
535	if (!src_copy) {
536	btrfs_abort_log_replay(wc, -ENOMEM,
537	"failed to allocate memory for log leaf item");
538	return -ENOMEM;
539	}
540
541	read_extent_buffer(eb: wc->log_leaf, dst: src_copy, start: src_ptr, len: item_size);
542	dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
543	ret = memcmp_extent_buffer(eb: dst_eb, ptrv: src_copy, start: dst_ptr, len: item_size);
544
545	kfree(objp: src_copy);
546	/*
547	* they have the same contents, just return, this saves
548	* us from cowing blocks in the destination tree and doing
549	* extra writes that may not have been done by a previous
550	* sync
551	*/
552	if (ret == `0`) {
553	btrfs_release_path(p: wc->subvol_path);
554	return `0`;
555	}
556
557	/*
558	* We need to load the old nbytes into the inode so when we
559	* replay the extents we've logged we get the right nbytes.
560	*/
561	if (is_inode_item) {
562	struct btrfs_inode_item *item;
563	u64 nbytes;
564	u32 mode;
565
566	item = btrfs_item_ptr(dst_eb, dst_slot,
567	struct btrfs_inode_item);
568	nbytes = btrfs_inode_nbytes(eb: dst_eb, s: item);
569	item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
570	struct btrfs_inode_item);
571	btrfs_set_inode_nbytes(eb: wc->log_leaf, s: item, val: nbytes);
572
573	/*
574	* If this is a directory we need to reset the i_size to
575	* 0 so that we can set it up properly when replaying
576	* the rest of the items in this log.
577	*/
578	mode = btrfs_inode_mode(eb: wc->log_leaf, s: item);
579	if (S_ISDIR(mode))
580	btrfs_set_inode_size(eb: wc->log_leaf, s: item, val: `0`);
581	}
582	} else if (is_inode_item) {
583	struct btrfs_inode_item *item;
584	u32 mode;
585
586	/*
587	* New inode, set nbytes to 0 so that the nbytes comes out
588	* properly when we replay the extents.
589	*/
590	item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
591	btrfs_set_inode_nbytes(eb: wc->log_leaf, s: item, val: `0`);
592
593	/*
594	* If this is a directory we need to reset the i_size to 0 so
595	* that we can set it up properly when replaying the rest of
596	* the items in this log.
597	*/
598	mode = btrfs_inode_mode(eb: wc->log_leaf, s: item);
599	if (S_ISDIR(mode))
600	btrfs_set_inode_size(eb: wc->log_leaf, s: item, val: `0`);
601	}
602	insert:
603	btrfs_release_path(p: wc->subvol_path);
604	/ try to insert the key into the destination tree /
605	wc->subvol_path->skip_release_on_error = true;
606	ret = btrfs_insert_empty_item(trans, root, path: wc->subvol_path, key: &wc->log_key, data_size: item_size);
607	wc->subvol_path->skip_release_on_error = false;
608
609	dst_eb = wc->subvol_path->nodes[`0`];
610	dst_slot = wc->subvol_path->slots[`0`];
611
612	/ make sure any existing item is the correct size /
613	if (ret == -EEXIST \|\| ret == -EOVERFLOW) {
614	const u32 found_size = btrfs_item_size(eb: dst_eb, slot: dst_slot);
615
616	if (found_size > item_size)
617	btrfs_truncate_item(trans, path: wc->subvol_path, new_size: item_size, from_end: `1`);
618	else if (found_size < item_size)
619	btrfs_extend_item(trans, path: wc->subvol_path, data_size: item_size - found_size);
620	} else if (ret) {
621	btrfs_abort_log_replay(wc, ret,
622	"failed to insert item for key " BTRFS_KEY_FMT,
623	BTRFS_KEY_FMT_VALUE(&wc->log_key));
624	return ret;
625	}
626	dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
627
628	/ don't overwrite an existing inode if the generation number*
629	* was logged as zero. This is done when the tree logging code
630	* is just logging an inode to make sure it exists after recovery.
631	*
632	* Also, don't overwrite i_size on directories during replay.
633	* log replay inserts and removes directory items based on the
634	* state of the tree found in the subvolume, and i_size is modified
635	* as it goes
636	*/
637	if (is_inode_item && ret == -EEXIST) {
638	struct btrfs_inode_item *src_item;
639	struct btrfs_inode_item *dst_item;
640
641	src_item = (struct btrfs_inode_item *)src_ptr;
642	dst_item = (struct btrfs_inode_item *)dst_ptr;
643
644	if (btrfs_inode_generation(eb: wc->log_leaf, s: src_item) == `0`) {
645	const u64 ino_size = btrfs_inode_size(eb: wc->log_leaf, s: src_item);
646
647	/*
648	* For regular files an ino_size == 0 is used only when
649	* logging that an inode exists, as part of a directory
650	* fsync, and the inode wasn't fsynced before. In this
651	* case don't set the size of the inode in the fs/subvol
652	* tree, otherwise we would be throwing valid data away.
653	*/
654	if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
655	S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
656	ino_size != `0`)
657	btrfs_set_inode_size(eb: dst_eb, s: dst_item, val: ino_size);
658	goto no_copy;
659	}
660
661	if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
662	S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
663	save_old_i_size = `1`;
664	saved_i_size = btrfs_inode_size(eb: dst_eb, s: dst_item);
665	}
666	}
667
668	copy_extent_buffer(dst: dst_eb, src: wc->log_leaf, dst_offset: dst_ptr, src_offset: src_ptr, len: item_size);
669
670	if (save_old_i_size) {
671	struct btrfs_inode_item *dst_item;
672
673	dst_item = (struct btrfs_inode_item *)dst_ptr;
674	btrfs_set_inode_size(eb: dst_eb, s: dst_item, val: saved_i_size);
675	}
676
677	/ make sure the generation is filled in /
678	if (is_inode_item) {
679	struct btrfs_inode_item *dst_item;
680
681	dst_item = (struct btrfs_inode_item *)dst_ptr;
682	if (btrfs_inode_generation(eb: dst_eb, s: dst_item) == `0`)
683	btrfs_set_inode_generation(eb: dst_eb, s: dst_item, val: trans->transid);
684	}
685	no_copy:
686	btrfs_release_path(p: wc->subvol_path);
687	return `0`;
688	}
689
690	static int read_alloc_one_name(struct extent_buffer eb, void* start, int* len,
691	struct fscrypt_str *name)
692	{
693	char *buf;
694
695	buf = kmalloc(len, GFP_NOFS);
696	if (!buf)
697	return -ENOMEM;
698
699	read_extent_buffer(eb, dst: buf, start: (unsigned long)start, len);
700	name->name = buf;
701	name->len = len;
702	return `0`;
703	}
704
705	/ replays a single extent in 'eb' at 'slot' with 'key' into the*
706	* subvolume 'root'. path is released on entry and should be released
707	* on exit.
708	*
709	* extents in the log tree have not been allocated out of the extent
710	* tree yet. So, this completes the allocation, taking a reference
711	* as required if the extent already exists or creating a new extent
712	* if it isn't in the extent allocation tree yet.
713	*
714	* The extent is inserted into the file, dropping any existing extents
715	* from the file that overlap the new one.
716	*/
717	static noinline int replay_one_extent(struct walk_control *wc)
718	{
719	struct btrfs_trans_handle *trans = wc->trans;
720	struct btrfs_root *root = wc->root;
721	struct btrfs_drop_extents_args drop_args = { `0` };
722	struct btrfs_fs_info *fs_info = root->fs_info;
723	int found_type;
724	u64 extent_end;
725	const u64 start = wc->log_key.offset;
726	u64 nbytes = `0`;
727	u64 csum_start;
728	u64 csum_end;
729	LIST_HEAD(ordered_sums);
730	u64 offset;
731	unsigned long dest_offset;
732	struct btrfs_key ins;
733	struct btrfs_file_extent_item *item;
734	struct btrfs_inode *inode = NULL;
735	int ret = `0`;
736
737	item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
738	found_type = btrfs_file_extent_type(eb: wc->log_leaf, s: item);
739
740	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
741	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
742	extent_end = start + btrfs_file_extent_num_bytes(eb: wc->log_leaf, s: item);
743	/ Holes don't take up space. /
744	if (btrfs_file_extent_disk_bytenr(eb: wc->log_leaf, s: item) != `0`)
745	nbytes = btrfs_file_extent_num_bytes(eb: wc->log_leaf, s: item);
746	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
747	nbytes = btrfs_file_extent_ram_bytes(eb: wc->log_leaf, s: item);
748	extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
749	} else {
750	btrfs_abort_log_replay(wc, -EUCLEAN,
751	"unexpected extent type=%d root=%llu inode=%llu offset=%llu",
752	found_type, btrfs_root_id(root),
753	wc->log_key.objectid, wc->log_key.offset);
754	return -EUCLEAN;
755	}
756
757	inode = btrfs_iget_logging(objectid: wc->log_key.objectid, root);
758	if (IS_ERR(ptr: inode)) {
759	ret = PTR_ERR(ptr: inode);
760	btrfs_abort_log_replay(wc, ret,
761	"failed to get inode %llu for root %llu",
762	wc->log_key.objectid, btrfs_root_id(root));
763	return ret;
764	}
765
766	/*
767	* first check to see if we already have this extent in the
768	* file. This must be done before the btrfs_drop_extents run
769	* so we don't try to drop this extent.
770	*/
771	ret = btrfs_lookup_file_extent(trans, root, path: wc->subvol_path,
772	objectid: btrfs_ino(inode), bytenr: start, mod: `0`);
773
774	if (ret == `0` &&
775	(found_type == BTRFS_FILE_EXTENT_REG \|\|
776	found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
777	struct extent_buffer *leaf = wc->subvol_path->nodes[`0`];
778	struct btrfs_file_extent_item existing;
779	unsigned long ptr;
780
781	ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[`0`]);
782	read_extent_buffer(eb: leaf, dst: &existing, start: ptr, len: sizeof(existing));
783
784	/*
785	* we already have a pointer to this exact extent,
786	* we don't have to do anything
787	*/
788	if (memcmp_extent_buffer(eb: wc->log_leaf, ptrv: &existing, start: (unsigned long)item,
789	len: sizeof(existing)) == `0`) {
790	btrfs_release_path(p: wc->subvol_path);
791	goto out;
792	}
793	}
794	btrfs_release_path(p: wc->subvol_path);
795
796	/ drop any overlapping extents /
797	drop_args.start = start;
798	drop_args.end = extent_end;
799	drop_args.drop_cache = true;
800	drop_args.path = wc->subvol_path;
801	ret = btrfs_drop_extents(trans, root, inode, args: &drop_args);
802	if (ret) {
803	btrfs_abort_log_replay(wc, ret,
804	"failed to drop extents for inode %llu range [%llu, %llu) root %llu",
805	wc->log_key.objectid, start, extent_end,
806	btrfs_root_id(root));
807	goto out;
808	}
809
810	if (found_type == BTRFS_FILE_EXTENT_INLINE) {
811	/ inline extents are easy, we just overwrite them /
812	ret = overwrite_item(wc);
813	if (ret)
814	goto out;
815	goto update_inode;
816	}
817
818	/*
819	* If not an inline extent, it can only be a regular or prealloc one.
820	* We have checked that above and returned -EUCLEAN if not.
821	*/
822
823	/ A hole and NO_HOLES feature enabled, nothing else to do. /
824	if (btrfs_file_extent_disk_bytenr(eb: wc->log_leaf, s: item) == `0` &&
825	btrfs_fs_incompat(fs_info, NO_HOLES))
826	goto update_inode;
827
828	ret = btrfs_insert_empty_item(trans, root, path: wc->subvol_path,
829	key: &wc->log_key, data_size: sizeof(*item));
830	if (ret) {
831	btrfs_abort_log_replay(wc, ret,
832	"failed to insert item with key " BTRFS_KEY_FMT " root %llu",
833	BTRFS_KEY_FMT_VALUE(&wc->log_key),
834	btrfs_root_id(root));
835	goto out;
836	}
837	dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[`0`],
838	wc->subvol_path->slots[`0`]);
839	copy_extent_buffer(dst: wc->subvol_path->nodes[`0`], src: wc->log_leaf, dst_offset: dest_offset,
840	src_offset: (unsigned long)item, len: sizeof(*item));
841
842	/*
843	* We have an explicit hole and NO_HOLES is not enabled. We have added
844	* the hole file extent item to the subvolume tree, so we don't have
845	* anything else to do other than update the file extent item range and
846	* update the inode item.
847	*/
848	if (btrfs_file_extent_disk_bytenr(eb: wc->log_leaf, s: item) == `0`) {
849	btrfs_release_path(p: wc->subvol_path);
850	goto update_inode;
851	}
852
853	ins.objectid = btrfs_file_extent_disk_bytenr(eb: wc->log_leaf, s: item);
854	ins.type = BTRFS_EXTENT_ITEM_KEY;
855	ins.offset = btrfs_file_extent_disk_num_bytes(eb: wc->log_leaf, s: item);
856	offset = wc->log_key.offset - btrfs_file_extent_offset(eb: wc->log_leaf, s: item);
857
858	/*
859	* Manually record dirty extent, as here we did a shallow file extent
860	* item copy and skip normal backref update, but modifying extent tree
861	* all by ourselves. So need to manually record dirty extent for qgroup,
862	* as the owner of the file extent changed from log tree (doesn't affect
863	* qgroup) to fs/file tree (affects qgroup).
864	*/
865	ret = btrfs_qgroup_trace_extent(trans, bytenr: ins.objectid, num_bytes: ins.offset);
866	if (ret < `0`) {
867	btrfs_abort_log_replay(wc, ret,
868	"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
869	ins.objectid, ins.offset,
870	wc->log_key.objectid, btrfs_root_id(root));
871	goto out;
872	}
873
874	/*
875	* Is this extent already allocated in the extent tree?
876	* If so, just add a reference.
877	*/
878	ret = btrfs_lookup_data_extent(fs_info, start: ins.objectid, len: ins.offset);
879	if (ret < `0`) {
880	btrfs_abort_log_replay(wc, ret,
881	"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
882	ins.objectid, ins.offset,
883	wc->log_key.objectid, btrfs_root_id(root));
884	goto out;
885	} else if (ret == `0`) {
886	struct btrfs_ref ref = {
887	.action = BTRFS_ADD_DELAYED_REF,
888	.bytenr = ins.objectid,
889	.num_bytes = ins.offset,
890	.owning_root = btrfs_root_id(root),
891	.ref_root = btrfs_root_id(root),
892	};
893
894	btrfs_init_data_ref(generic_ref: &ref, ino: wc->log_key.objectid, offset, mod_root: `0`, skip_qgroup: false);
895	ret = btrfs_inc_extent_ref(trans, generic_ref: &ref);
896	if (ret) {
897	btrfs_abort_log_replay(wc, ret,
898	"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
899	ins.objectid, ins.offset,
900	wc->log_key.objectid,
901	btrfs_root_id(root));
902	goto out;
903	}
904	} else {
905	/ Insert the extent pointer in the extent tree. /
906	ret = btrfs_alloc_logged_file_extent(trans, root_objectid: btrfs_root_id(root),
907	owner: wc->log_key.objectid, offset, ins: &ins);
908	if (ret) {
909	btrfs_abort_log_replay(wc, ret,
910	"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
911	ins.objectid, ins.offset, offset,
912	wc->log_key.objectid, btrfs_root_id(root));
913	goto out;
914	}
915	}
916
917	btrfs_release_path(p: wc->subvol_path);
918
919	if (btrfs_file_extent_compression(eb: wc->log_leaf, s: item)) {
920	csum_start = ins.objectid;
921	csum_end = csum_start + ins.offset;
922	} else {
923	csum_start = ins.objectid + btrfs_file_extent_offset(eb: wc->log_leaf, s: item);
924	csum_end = csum_start + btrfs_file_extent_num_bytes(eb: wc->log_leaf, s: item);
925	}
926
927	ret = btrfs_lookup_csums_list(root: root->log_root, start: csum_start, end: csum_end - `1`,
928	list: &ordered_sums, nowait: false);
929	if (ret < `0`) {
930	btrfs_abort_log_replay(wc, ret,
931	"failed to lookups csums for range [%llu, %llu) inode %llu root %llu",
932	csum_start, csum_end, wc->log_key.objectid,
933	btrfs_root_id(root));
934	goto out;
935	}
936	ret = `0`;
937	/*
938	* Now delete all existing csums in the csum root that cover our range.
939	* We do this because we can have an extent that is completely
940	* referenced by one file extent item and partially referenced by
941	* another file extent item (like after using the clone or extent_same
942	* ioctls). In this case if we end up doing the replay of the one that
943	* partially references the extent first, and we do not do the csum
944	* deletion below, we can get 2 csum items in the csum tree that overlap
945	* each other. For example, imagine our log has the two following file
946	* extent items:
947	*
948	* key (257 EXTENT_DATA 409600)
949	* extent data disk byte 12845056 nr 102400
950	* extent data offset 20480 nr 20480 ram 102400
951	*
952	* key (257 EXTENT_DATA 819200)
953	* extent data disk byte 12845056 nr 102400
954	* extent data offset 0 nr 102400 ram 102400
955	*
956	* Where the second one fully references the 100K extent that starts at
957	* disk byte 12845056, and the log tree has a single csum item that
958	* covers the entire range of the extent:
959	*
960	* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
961	*
962	* After the first file extent item is replayed, the csum tree gets the
963	* following csum item:
964	*
965	* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
966	*
967	* Which covers the 20K sub-range starting at offset 20K of our extent.
968	* Now when we replay the second file extent item, if we do not delete
969	* existing csum items that cover any of its blocks, we end up getting
970	* two csum items in our csum tree that overlap each other:
971	*
972	* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
973	* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
974	*
975	* Which is a problem, because after this anyone trying to lookup for
976	* the checksum of any block of our extent starting at an offset of 40K
977	* or higher, will end up looking at the second csum item only, which
978	* does not contain the checksum for any block starting at offset 40K or
979	* higher of our extent.
980	*/
981	while (!list_empty(head: &ordered_sums)) {
982	struct btrfs_ordered_sum *sums;
983	struct btrfs_root *csum_root;
984
985	sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
986	csum_root = btrfs_csum_root(fs_info, bytenr: sums->logical);
987	if (!ret) {
988	ret = btrfs_del_csums(trans, root: csum_root, bytenr: sums->logical,
989	len: sums->len);
990	if (ret)
991	btrfs_abort_log_replay(wc, ret,
992	"failed to delete csums for range [%llu, %llu) inode %llu root %llu",
993	sums->logical,
994	sums->logical + sums->len,
995	wc->log_key.objectid,
996	btrfs_root_id(root));
997	}
998	if (!ret) {
999	ret = btrfs_csum_file_blocks(trans, root: csum_root, sums);
1000	if (ret)
1001	btrfs_abort_log_replay(wc, ret,
1002	"failed to add csums for range [%llu, %llu) inode %llu root %llu",
1003	sums->logical,
1004	sums->logical + sums->len,
1005	wc->log_key.objectid,
1006	btrfs_root_id(root));
1007	}
1008	list_del(entry: &sums->list);
1009	kfree(objp: sums);
1010	}
1011	if (ret)
1012	goto out;
1013
1014	update_inode:
1015	ret = btrfs_inode_set_file_extent_range(inode, start, len: extent_end - start);
1016	if (ret) {
1017	btrfs_abort_log_replay(wc, ret,
1018	"failed to set file extent range [%llu, %llu) inode %llu root %llu",
1019	start, extent_end, wc->log_key.objectid,
1020	btrfs_root_id(root));
1021	goto out;
1022	}
1023
1024	btrfs_update_inode_bytes(inode, add_bytes: nbytes, del_bytes: drop_args.bytes_found);
1025	ret = btrfs_update_inode(trans, inode);
1026	if (ret)
1027	btrfs_abort_log_replay(wc, ret,
1028	"failed to update inode %llu root %llu",
1029	wc->log_key.objectid, btrfs_root_id(root));
1030	out:
1031	iput(&inode->vfs_inode);
1032	return ret;
1033	}
1034
1035	static int unlink_inode_for_log_replay(struct walk_control *wc,
1036	struct btrfs_inode *dir,
1037	struct btrfs_inode *inode,
1038	const struct fscrypt_str *name)
1039	{
1040	struct btrfs_trans_handle *trans = wc->trans;
1041	int ret;
1042
1043	ret = btrfs_unlink_inode(trans, dir, inode, name);
1044	if (ret) {
1045	btrfs_abort_log_replay(wc, ret,
1046	"failed to unlink inode %llu parent dir %llu name %.*s root %llu",
1047	btrfs_ino(inode), btrfs_ino(dir), name->len,
1048	name->name, btrfs_root_id(inode->root));
1049	return ret;
1050	}
1051	/*
1052	* Whenever we need to check if a name exists or not, we check the
1053	* fs/subvolume tree. So after an unlink we must run delayed items, so
1054	* that future checks for a name during log replay see that the name
1055	* does not exists anymore.
1056	*/
1057	ret = btrfs_run_delayed_items(trans);
1058	if (ret)
1059	btrfs_abort_log_replay(wc, ret,
1060	"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
1061	btrfs_ino(inode), btrfs_ino(dir), name->len,
1062	name->name, btrfs_root_id(inode->root));
1063
1064	return ret;
1065	}
1066
1067	/*
1068	* when cleaning up conflicts between the directory names in the
1069	* subvolume, directory names in the log and directory names in the
1070	* inode back references, we may have to unlink inodes from directories.
1071	*
1072	* This is a helper function to do the unlink of a specific directory
1073	* item
1074	*/
1075	static noinline int drop_one_dir_item(struct walk_control *wc,
1076	struct btrfs_inode *dir,
1077	struct btrfs_dir_item *di)
1078	{
1079	struct btrfs_root *root = dir->root;
1080	struct btrfs_inode *inode;
1081	struct fscrypt_str name;
1082	struct extent_buffer *leaf = wc->subvol_path->nodes[`0`];
1083	struct btrfs_key location;
1084	int ret;
1085
1086	btrfs_dir_item_key_to_cpu(eb: leaf, item: di, cpu_key: &location);
1087	ret = read_alloc_one_name(eb: leaf, start: di + `1`, len: btrfs_dir_name_len(eb: leaf, s: di), name: &name);
1088	if (ret) {
1089	btrfs_abort_log_replay(wc, ret,
1090	"failed to allocate name for dir %llu root %llu",
1091	btrfs_ino(dir), btrfs_root_id(root));
1092	return ret;
1093	}
1094
1095	btrfs_release_path(p: wc->subvol_path);
1096
1097	inode = btrfs_iget_logging(objectid: location.objectid, root);
1098	if (IS_ERR(ptr: inode)) {
1099	ret = PTR_ERR(ptr: inode);
1100	btrfs_abort_log_replay(wc, ret,
1101	"failed to open inode %llu parent dir %llu name %.*s root %llu",
1102	location.objectid, btrfs_ino(dir),
1103	name.len, name.name, btrfs_root_id(root));
1104	inode = NULL;
1105	goto out;
1106	}
1107
1108	ret = link_to_fixup_dir(wc, objectid: location.objectid);
1109	if (ret)
1110	goto out;
1111
1112	ret = unlink_inode_for_log_replay(wc, dir, inode, name: &name);
1113	out:
1114	kfree(objp: name.name);
1115	if (inode)
1116	iput(&inode->vfs_inode);
1117	return ret;
1118	}
1119
1120	/*
1121	* See if a given name and sequence number found in an inode back reference are
1122	* already in a directory and correctly point to this inode.
1123	*
1124	* Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
1125	* exists.
1126	*/
1127	static noinline int inode_in_dir(struct btrfs_root *root,
1128	struct btrfs_path *path,
1129	u64 dirid, u64 objectid, u64 index,
1130	struct fscrypt_str *name)
1131	{
1132	struct btrfs_dir_item *di;
1133	struct btrfs_key location;
1134	int ret = `0`;
1135
1136	di = btrfs_lookup_dir_index_item(NULL, root, path, dir: dirid,
1137	index, name, mod: `0`);
1138	if (IS_ERR(ptr: di)) {
1139	ret = PTR_ERR(ptr: di);
1140	goto out;
1141	} else if (di) {
1142	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &location);
1143	if (location.objectid != objectid)
1144	goto out;
1145	} else {
1146	goto out;
1147	}
1148
1149	btrfs_release_path(p: path);
1150	di = btrfs_lookup_dir_item(NULL, root, path, dir: dirid, name, mod: `0`);
1151	if (IS_ERR(ptr: di)) {
1152	ret = PTR_ERR(ptr: di);
1153	goto out;
1154	} else if (di) {
1155	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &location);
1156	if (location.objectid == objectid)
1157	ret = `1`;
1158	}
1159	out:
1160	btrfs_release_path(p: path);
1161	return ret;
1162	}
1163
1164	/*
1165	* helper function to check a log tree for a named back reference in
1166	* an inode. This is used to decide if a back reference that is
1167	* found in the subvolume conflicts with what we find in the log.
1168	*
1169	* inode backreferences may have multiple refs in a single item,
1170	* during replay we process one reference at a time, and we don't
1171	* want to delete valid links to a file from the subvolume if that
1172	* link is also in the log.
1173	*/
1174	static noinline int backref_in_log(struct btrfs_root *log,
1175	struct btrfs_key *key,
1176	u64 ref_objectid,
1177	const struct fscrypt_str *name)
1178	{
1179	BTRFS_PATH_AUTO_FREE(path);
1180	int ret;
1181
1182	path = btrfs_alloc_path();
1183	if (!path)
1184	return -ENOMEM;
1185
1186	ret = btrfs_search_slot(NULL, root: log, key, p: path, ins_len: `0`, cow: `0`);
1187	if (ret < `0`)
1188	return ret;
1189	if (ret == `1`)
1190	return `0`;
1191
1192	if (key->type == BTRFS_INODE_EXTREF_KEY)
1193	ret = !!btrfs_find_name_in_ext_backref(leaf: path->nodes[`0`],
1194	slot: path->slots[`0`],
1195	ref_objectid, name);
1196	else
1197	ret = !!btrfs_find_name_in_backref(leaf: path->nodes[`0`],
1198	slot: path->slots[`0`], name);
1199	return ret;
1200	}
1201
1202	static int unlink_refs_not_in_log(struct walk_control *wc,
1203	struct btrfs_key *search_key,
1204	struct btrfs_inode *dir,
1205	struct btrfs_inode *inode)
1206	{
1207	struct extent_buffer *leaf = wc->subvol_path->nodes[`0`];
1208	unsigned long ptr;
1209	unsigned long ptr_end;
1210
1211	/*
1212	* Check all the names in this back reference to see if they are in the
1213	* log. If so, we allow them to stay otherwise they must be unlinked as
1214	* a conflict.
1215	*/
1216	ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[`0`]);
1217	ptr_end = ptr + btrfs_item_size(eb: leaf, slot: wc->subvol_path->slots[`0`]);
1218	while (ptr < ptr_end) {
1219	struct fscrypt_str victim_name;
1220	struct btrfs_inode_ref *victim_ref;
1221	int ret;
1222
1223	victim_ref = (struct btrfs_inode_ref *)ptr;
1224	ret = read_alloc_one_name(eb: leaf, start: (victim_ref + `1`),
1225	len: btrfs_inode_ref_name_len(eb: leaf, s: victim_ref),
1226	name: &victim_name);
1227	if (ret) {
1228	btrfs_abort_log_replay(wc, ret,
1229	"failed to allocate name for inode %llu parent dir %llu root %llu",
1230	btrfs_ino(inode), btrfs_ino(dir),
1231	btrfs_root_id(inode->root));
1232	return ret;
1233	}
1234
1235	ret = backref_in_log(log: wc->log, key: search_key, ref_objectid: btrfs_ino(inode: dir), name: &victim_name);
1236	if (ret) {
1237	if (ret < `0`) {
1238	btrfs_abort_log_replay(wc, ret,
1239	"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
1240	btrfs_ino(inode), btrfs_ino(dir),
1241	victim_name.len, victim_name.name,
1242	btrfs_root_id(inode->root));
1243	kfree(objp: victim_name.name);
1244	return ret;
1245	}
1246	kfree(objp: victim_name.name);
1247	ptr = (unsigned long)(victim_ref + `1`) + victim_name.len;
1248	continue;
1249	}
1250
1251	inc_nlink(inode: &inode->vfs_inode);
1252	btrfs_release_path(p: wc->subvol_path);
1253
1254	ret = unlink_inode_for_log_replay(wc, dir, inode, name: &victim_name);
1255	kfree(objp: victim_name.name);
1256	if (ret)
1257	return ret;
1258	return -EAGAIN;
1259	}
1260
1261	return `0`;
1262	}
1263
1264	static int unlink_extrefs_not_in_log(struct walk_control *wc,
1265	struct btrfs_key *search_key,
1266	struct btrfs_inode *dir,
1267	struct btrfs_inode *inode)
1268	{
1269	struct extent_buffer *leaf = wc->subvol_path->nodes[`0`];
1270	const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[`0`]);
1271	const u32 item_size = btrfs_item_size(eb: leaf, slot: wc->subvol_path->slots[`0`]);
1272	u32 cur_offset = `0`;
1273
1274	while (cur_offset < item_size) {
1275	struct btrfs_root *log_root = wc->log;
1276	struct btrfs_inode_extref *extref;
1277	struct fscrypt_str victim_name;
1278	int ret;
1279
1280	extref = (struct btrfs_inode_extref *)(base + cur_offset);
1281	victim_name.len = btrfs_inode_extref_name_len(eb: leaf, s: extref);
1282
1283	if (btrfs_inode_extref_parent(eb: leaf, s: extref) != btrfs_ino(inode: dir))
1284	goto next;
1285
1286	ret = read_alloc_one_name(eb: leaf, start: &extref->name, len: victim_name.len,
1287	name: &victim_name);
1288	if (ret) {
1289	btrfs_abort_log_replay(wc, ret,
1290	"failed to allocate name for inode %llu parent dir %llu root %llu",
1291	btrfs_ino(inode), btrfs_ino(dir),
1292	btrfs_root_id(inode->root));
1293	return ret;
1294	}
1295
1296	search_key->objectid = btrfs_ino(inode);
1297	search_key->type = BTRFS_INODE_EXTREF_KEY;
1298	search_key->offset = btrfs_extref_hash(parent_objectid: btrfs_ino(inode: dir),
1299	name: victim_name.name,
1300	len: victim_name.len);
1301	ret = backref_in_log(log: log_root, key: search_key, ref_objectid: btrfs_ino(inode: dir), name: &victim_name);
1302	if (ret) {
1303	if (ret < `0`) {
1304	btrfs_abort_log_replay(wc, ret,
1305	"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
1306	btrfs_ino(inode), btrfs_ino(dir),
1307	victim_name.len, victim_name.name,
1308	btrfs_root_id(inode->root));
1309	kfree(objp: victim_name.name);
1310	return ret;
1311	}
1312	kfree(objp: victim_name.name);
1313	next:
1314	cur_offset += victim_name.len + sizeof(*extref);
1315	continue;
1316	}
1317
1318	inc_nlink(inode: &inode->vfs_inode);
1319	btrfs_release_path(p: wc->subvol_path);
1320
1321	ret = unlink_inode_for_log_replay(wc, dir, inode, name: &victim_name);
1322	kfree(objp: victim_name.name);
1323	if (ret)
1324	return ret;
1325	return -EAGAIN;
1326	}
1327
1328	return `0`;
1329	}
1330
1331	static inline int __add_inode_ref(struct walk_control *wc,
1332	struct btrfs_inode *dir,
1333	struct btrfs_inode *inode,
1334	u64 ref_index, struct fscrypt_str *name)
1335	{
1336	int ret;
1337	struct btrfs_trans_handle *trans = wc->trans;
1338	struct btrfs_root *root = wc->root;
1339	struct btrfs_dir_item *di;
1340	struct btrfs_key search_key;
1341	struct btrfs_inode_extref *extref;
1342
1343	again:
1344	/ Search old style refs /
1345	search_key.objectid = btrfs_ino(inode);
1346	search_key.type = BTRFS_INODE_REF_KEY;
1347	search_key.offset = btrfs_ino(inode: dir);
1348	ret = btrfs_search_slot(NULL, root, key: &search_key, p: wc->subvol_path, ins_len: `0`, cow: `0`);
1349	if (ret < `0`) {
1350	btrfs_abort_log_replay(wc, ret,
1351	"failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
1352	BTRFS_KEY_FMT_VALUE(&search_key),
1353	btrfs_root_id(root));
1354	return ret;
1355	} else if (ret == `0`) {
1356	/*
1357	* Are we trying to overwrite a back ref for the root directory?
1358	* If so, we're done.
1359	*/
1360	if (search_key.objectid == search_key.offset)
1361	return `1`;
1362
1363	ret = unlink_refs_not_in_log(wc, search_key: &search_key, dir, inode);
1364	if (ret == -EAGAIN)
1365	goto again;
1366	else if (ret)
1367	return ret;
1368	}
1369	btrfs_release_path(p: wc->subvol_path);
1370
1371	/ Same search but for extended refs /
1372	extref = btrfs_lookup_inode_extref(root, path: wc->subvol_path, name,
1373	inode_objectid: btrfs_ino(inode), ref_objectid: btrfs_ino(inode: dir));
1374	if (IS_ERR(ptr: extref)) {
1375	return PTR_ERR(ptr: extref);
1376	} else if (extref) {
1377	ret = unlink_extrefs_not_in_log(wc, search_key: &search_key, dir, inode);
1378	if (ret == -EAGAIN)
1379	goto again;
1380	else if (ret)
1381	return ret;
1382	}
1383	btrfs_release_path(p: wc->subvol_path);
1384
1385	/ look for a conflicting sequence number /
1386	di = btrfs_lookup_dir_index_item(trans, root, path: wc->subvol_path, dir: btrfs_ino(inode: dir),
1387	index: ref_index, name, mod: `0`);
1388	if (IS_ERR(ptr: di)) {
1389	ret = PTR_ERR(ptr: di);
1390	btrfs_abort_log_replay(wc, ret,
1391	"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
1392	btrfs_ino(dir), ref_index, name->len,
1393	name->name, btrfs_root_id(root));
1394	return ret;
1395	} else if (di) {
1396	ret = drop_one_dir_item(wc, dir, di);
1397	if (ret)
1398	return ret;
1399	}
1400	btrfs_release_path(p: wc->subvol_path);
1401
1402	/ look for a conflicting name /
1403	di = btrfs_lookup_dir_item(trans, root, path: wc->subvol_path, dir: btrfs_ino(inode: dir), name, mod: `0`);
1404	if (IS_ERR(ptr: di)) {
1405	ret = PTR_ERR(ptr: di);
1406	btrfs_abort_log_replay(wc, ret,
1407	"failed to lookup dir item for dir %llu name %.*s root %llu",
1408	btrfs_ino(dir), name->len, name->name,
1409	btrfs_root_id(root));
1410	return ret;
1411	} else if (di) {
1412	ret = drop_one_dir_item(wc, dir, di);
1413	if (ret)
1414	return ret;
1415	}
1416	btrfs_release_path(p: wc->subvol_path);
1417
1418	return `0`;
1419	}
1420
1421	static int extref_get_fields(struct extent_buffer eb, unsigned* long ref_ptr,
1422	struct fscrypt_str name, u64 index,
1423	u64 *parent_objectid)
1424	{
1425	struct btrfs_inode_extref *extref;
1426	int ret;
1427
1428	extref = (struct btrfs_inode_extref *)ref_ptr;
1429
1430	ret = read_alloc_one_name(eb, start: &extref->name,
1431	len: btrfs_inode_extref_name_len(eb, s: extref), name);
1432	if (ret)
1433	return ret;
1434
1435	if (index)
1436	*index = btrfs_inode_extref_index(eb, s: extref);
1437	if (parent_objectid)
1438	*parent_objectid = btrfs_inode_extref_parent(eb, s: extref);
1439
1440	return `0`;
1441	}
1442
1443	static int ref_get_fields(struct extent_buffer eb, unsigned* long ref_ptr,
1444	struct fscrypt_str name, u64 index)
1445	{
1446	struct btrfs_inode_ref *ref;
1447	int ret;
1448
1449	ref = (struct btrfs_inode_ref *)ref_ptr;
1450
1451	ret = read_alloc_one_name(eb, start: ref + `1`, len: btrfs_inode_ref_name_len(eb, s: ref),
1452	name);
1453	if (ret)
1454	return ret;
1455
1456	if (index)
1457	*index = btrfs_inode_ref_index(eb, s: ref);
1458
1459	return `0`;
1460	}
1461
1462	/*
1463	* Take an inode reference item from the log tree and iterate all names from the
1464	* inode reference item in the subvolume tree with the same key (if it exists).
1465	* For any name that is not in the inode reference item from the log tree, do a
1466	* proper unlink of that name (that is, remove its entry from the inode
1467	* reference item and both dir index keys).
1468	*/
1469	static int unlink_old_inode_refs(struct walk_control wc, struct* btrfs_inode *inode)
1470	{
1471	struct btrfs_root *root = wc->root;
1472	int ret;
1473	unsigned long ref_ptr;
1474	unsigned long ref_end;
1475	struct extent_buffer *eb;
1476
1477	again:
1478	btrfs_release_path(p: wc->subvol_path);
1479	ret = btrfs_search_slot(NULL, root, key: &wc->log_key, p: wc->subvol_path, ins_len: `0`, cow: `0`);
1480	if (ret > `0`) {
1481	ret = `0`;
1482	goto out;
1483	}
1484	if (ret < `0`) {
1485	btrfs_abort_log_replay(wc, ret,
1486	"failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
1487	BTRFS_KEY_FMT_VALUE(&wc->log_key),
1488	btrfs_root_id(root));
1489	goto out;
1490	}
1491
1492	eb = wc->subvol_path->nodes[`0`];
1493	ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[`0`]);
1494	ref_end = ref_ptr + btrfs_item_size(eb, slot: wc->subvol_path->slots[`0`]);
1495	while (ref_ptr < ref_end) {
1496	struct fscrypt_str name;
1497	u64 parent_id;
1498
1499	if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
1500	ret = extref_get_fields(eb, ref_ptr, name: &name,
1501	NULL, parent_objectid: &parent_id);
1502	if (ret) {
1503	btrfs_abort_log_replay(wc, ret,
1504	"failed to get extref details for inode %llu root %llu",
1505	btrfs_ino(inode),
1506	btrfs_root_id(root));
1507	goto out;
1508	}
1509	} else {
1510	parent_id = wc->log_key.offset;
1511	ret = ref_get_fields(eb, ref_ptr, name: &name, NULL);
1512	if (ret) {
1513	btrfs_abort_log_replay(wc, ret,
1514	"failed to get ref details for inode %llu parent_id %llu root %llu",
1515	btrfs_ino(inode), parent_id,
1516	btrfs_root_id(root));
1517	goto out;
1518	}
1519	}
1520
1521	if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
1522	ret = !!btrfs_find_name_in_ext_backref(leaf: wc->log_leaf, slot: wc->log_slot,
1523	ref_objectid: parent_id, name: &name);
1524	else
1525	ret = !!btrfs_find_name_in_backref(leaf: wc->log_leaf, slot: wc->log_slot,
1526	name: &name);
1527
1528	if (!ret) {
1529	struct btrfs_inode *dir;
1530
1531	btrfs_release_path(p: wc->subvol_path);
1532	dir = btrfs_iget_logging(objectid: parent_id, root);
1533	if (IS_ERR(ptr: dir)) {
1534	ret = PTR_ERR(ptr: dir);
1535	kfree(objp: name.name);
1536	btrfs_abort_log_replay(wc, ret,
1537	"failed to lookup dir inode %llu root %llu",
1538	parent_id, btrfs_root_id(root));
1539	goto out;
1540	}
1541	ret = unlink_inode_for_log_replay(wc, dir, inode, name: &name);
1542	kfree(objp: name.name);
1543	iput(&dir->vfs_inode);
1544	if (ret)
1545	goto out;
1546	goto again;
1547	}
1548
1549	kfree(objp: name.name);
1550	ref_ptr += name.len;
1551	if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
1552	ref_ptr += sizeof(struct btrfs_inode_extref);
1553	else
1554	ref_ptr += sizeof(struct btrfs_inode_ref);
1555	}
1556	ret = `0`;
1557	out:
1558	btrfs_release_path(p: wc->subvol_path);
1559	return ret;
1560	}
1561
1562	/*
1563	* Replay one inode back reference item found in the log tree.
1564	* Path is for temporary use by this function (it should be released on return).
1565	*/
1566	static noinline int add_inode_ref(struct walk_control *wc)
1567	{
1568	struct btrfs_trans_handle *trans = wc->trans;
1569	struct btrfs_root *root = wc->root;
1570	struct btrfs_inode *dir = NULL;
1571	struct btrfs_inode *inode = NULL;
1572	unsigned long ref_ptr;
1573	unsigned long ref_end;
1574	struct fscrypt_str name = { `0` };
1575	int ret;
1576	const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
1577	u64 parent_objectid;
1578	u64 inode_objectid;
1579	u64 ref_index = `0`;
1580	int ref_struct_size;
1581
1582	ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
1583	ref_end = ref_ptr + btrfs_item_size(eb: wc->log_leaf, slot: wc->log_slot);
1584
1585	if (is_extref_item) {
1586	struct btrfs_inode_extref *r;
1587
1588	ref_struct_size = sizeof(struct btrfs_inode_extref);
1589	r = (struct btrfs_inode_extref *)ref_ptr;
1590	parent_objectid = btrfs_inode_extref_parent(eb: wc->log_leaf, s: r);
1591	} else {
1592	ref_struct_size = sizeof(struct btrfs_inode_ref);
1593	parent_objectid = wc->log_key.offset;
1594	}
1595	inode_objectid = wc->log_key.objectid;
1596
1597	/*
1598	* it is possible that we didn't log all the parent directories
1599	* for a given inode. If we don't find the dir, just don't
1600	* copy the back ref in. The link count fixup code will take
1601	* care of the rest
1602	*/
1603	dir = btrfs_iget_logging(objectid: parent_objectid, root);
1604	if (IS_ERR(ptr: dir)) {
1605	ret = PTR_ERR(ptr: dir);
1606	if (ret == -ENOENT)
1607	ret = `0`;
1608	else
1609	btrfs_abort_log_replay(wc, ret,
1610	"failed to lookup dir inode %llu root %llu",
1611	parent_objectid, btrfs_root_id(root));
1612	dir = NULL;
1613	goto out;
1614	}
1615
1616	inode = btrfs_iget_logging(objectid: inode_objectid, root);
1617	if (IS_ERR(ptr: inode)) {
1618	ret = PTR_ERR(ptr: inode);
1619	btrfs_abort_log_replay(wc, ret,
1620	"failed to lookup inode %llu root %llu",
1621	inode_objectid, btrfs_root_id(root));
1622	inode = NULL;
1623	goto out;
1624	}
1625
1626	while (ref_ptr < ref_end) {
1627	if (is_extref_item) {
1628	ret = extref_get_fields(eb: wc->log_leaf, ref_ptr, name: &name,
1629	index: &ref_index, parent_objectid: &parent_objectid);
1630	if (ret) {
1631	btrfs_abort_log_replay(wc, ret,
1632	"failed to get extref details for inode %llu root %llu",
1633	btrfs_ino(inode),
1634	btrfs_root_id(root));
1635	goto out;
1636	}
1637	/*
1638	* parent object can change from one array
1639	* item to another.
1640	*/
1641	if (!dir) {
1642	dir = btrfs_iget_logging(objectid: parent_objectid, root);
1643	if (IS_ERR(ptr: dir)) {
1644	ret = PTR_ERR(ptr: dir);
1645	dir = NULL;
1646	/*
1647	* A new parent dir may have not been
1648	* logged and not exist in the subvolume
1649	* tree, see the comment above before
1650	* the loop when getting the first
1651	* parent dir.
1652	*/
1653	if (ret == -ENOENT) {
1654	/*
1655	* The next extref may refer to
1656	* another parent dir that
1657	* exists, so continue.
1658	*/
1659	ret = `0`;
1660	goto next;
1661	} else {
1662	btrfs_abort_log_replay(wc, ret,
1663	"failed to lookup dir inode %llu root %llu",
1664	parent_objectid,
1665	btrfs_root_id(root));
1666	}
1667	goto out;
1668	}
1669	}
1670	} else {
1671	ret = ref_get_fields(eb: wc->log_leaf, ref_ptr, name: &name, index: &ref_index);
1672	if (ret) {
1673	btrfs_abort_log_replay(wc, ret,
1674	"failed to get ref details for inode %llu parent_objectid %llu root %llu",
1675	btrfs_ino(inode),
1676	parent_objectid,
1677	btrfs_root_id(root));
1678	goto out;
1679	}
1680	}
1681
1682	ret = inode_in_dir(root, path: wc->subvol_path, dirid: btrfs_ino(inode: dir),
1683	objectid: btrfs_ino(inode), index: ref_index, name: &name);
1684	if (ret < `0`) {
1685	btrfs_abort_log_replay(wc, ret,
1686	"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
1687	btrfs_ino(inode), btrfs_ino(dir),
1688	ref_index, name.len, name.name,
1689	btrfs_root_id(root));
1690	goto out;
1691	} else if (ret == `0`) {
1692	/*
1693	* look for a conflicting back reference in the
1694	* metadata. if we find one we have to unlink that name
1695	* of the file before we add our new link. Later on, we
1696	* overwrite any existing back reference, and we don't
1697	* want to create dangling pointers in the directory.
1698	*/
1699	ret = __add_inode_ref(wc, dir, inode, ref_index, name: &name);
1700	if (ret) {
1701	if (ret == `1`)
1702	ret = `0`;
1703	goto out;
1704	}
1705
1706	/ insert our name /
1707	ret = btrfs_add_link(trans, parent_inode: dir, inode, name: &name, add_backref: `0`, index: ref_index);
1708	if (ret) {
1709	btrfs_abort_log_replay(wc, ret,
1710	"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
1711	btrfs_ino(inode),
1712	btrfs_ino(dir), ref_index,
1713	name.len, name.name,
1714	btrfs_root_id(root));
1715	goto out;
1716	}
1717
1718	ret = btrfs_update_inode(trans, inode);
1719	if (ret) {
1720	btrfs_abort_log_replay(wc, ret,
1721	"failed to update inode %llu root %llu",
1722	btrfs_ino(inode),
1723	btrfs_root_id(root));
1724	goto out;
1725	}
1726	}
1727	/ Else, ret == 1, we already have a perfect match, we're done. /
1728
1729	next:
1730	ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1731	kfree(objp: name.name);
1732	name.name = NULL;
1733	if (is_extref_item && dir) {
1734	iput(&dir->vfs_inode);
1735	dir = NULL;
1736	}
1737	}
1738
1739	/*
1740	* Before we overwrite the inode reference item in the subvolume tree
1741	* with the item from the log tree, we must unlink all names from the
1742	* parent directory that are in the subvolume's tree inode reference
1743	* item, otherwise we end up with an inconsistent subvolume tree where
1744	* dir index entries exist for a name but there is no inode reference
1745	* item with the same name.
1746	*/
1747	ret = unlink_old_inode_refs(wc, inode);
1748	if (ret)
1749	goto out;
1750
1751	/ finally write the back reference in the inode /
1752	ret = overwrite_item(wc);
1753	out:
1754	btrfs_release_path(p: wc->subvol_path);
1755	kfree(objp: name.name);
1756	if (dir)
1757	iput(&dir->vfs_inode);
1758	if (inode)
1759	iput(&inode->vfs_inode);
1760	return ret;
1761	}
1762
1763	static int count_inode_extrefs(struct btrfs_inode inode, struct* btrfs_path *path)
1764	{
1765	int ret = `0`;
1766	int name_len;
1767	unsigned int nlink = `0`;
1768	u32 item_size;
1769	u32 cur_offset = `0`;
1770	u64 inode_objectid = btrfs_ino(inode);
1771	u64 offset = `0`;
1772	unsigned long ptr;
1773	struct btrfs_inode_extref *extref;
1774	struct extent_buffer *leaf;
1775
1776	while (`1`) {
1777	ret = btrfs_find_one_extref(root: inode->root, inode_objectid, start_off: offset,
1778	path, ret_extref: &extref, found_off: &offset);
1779	if (ret)
1780	break;
1781
1782	leaf = path->nodes[`0`];
1783	item_size = btrfs_item_size(eb: leaf, slot: path->slots[`0`]);
1784	ptr = btrfs_item_ptr_offset(leaf, path->slots[`0`]);
1785	cur_offset = `0`;
1786
1787	while (cur_offset < item_size) {
1788	extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1789	name_len = btrfs_inode_extref_name_len(eb: leaf, s: extref);
1790
1791	nlink++;
1792
1793	cur_offset += name_len + sizeof(*extref);
1794	}
1795
1796	offset++;
1797	btrfs_release_path(p: path);
1798	}
1799	btrfs_release_path(p: path);
1800
1801	if (ret < `0` && ret != -ENOENT)
1802	return ret;
1803	return nlink;
1804	}
1805
1806	static int count_inode_refs(struct btrfs_inode inode, struct* btrfs_path *path)
1807	{
1808	int ret;
1809	struct btrfs_key key;
1810	unsigned int nlink = `0`;
1811	unsigned long ptr;
1812	unsigned long ptr_end;
1813	int name_len;
1814	u64 ino = btrfs_ino(inode);
1815
1816	key.objectid = ino;
1817	key.type = BTRFS_INODE_REF_KEY;
1818	key.offset = (u64)-`1`;
1819
1820	while (`1`) {
1821	ret = btrfs_search_slot(NULL, root: inode->root, key: &key, p: path, ins_len: `0`, cow: `0`);
1822	if (ret < `0`)
1823	break;
1824	if (ret > `0`) {
1825	if (path->slots[`0`] == `0`)
1826	break;
1827	path->slots[`0`]--;
1828	}
1829	process_slot:
1830	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key,
1831	nr: path->slots[`0`]);
1832	if (key.objectid != ino \|\|
1833	key.type != BTRFS_INODE_REF_KEY)
1834	break;
1835	ptr = btrfs_item_ptr_offset(path->nodes[`0`], path->slots[`0`]);
1836	ptr_end = ptr + btrfs_item_size(eb: path->nodes[`0`],
1837	slot: path->slots[`0`]);
1838	while (ptr < ptr_end) {
1839	struct btrfs_inode_ref *ref;
1840
1841	ref = (struct btrfs_inode_ref *)ptr;
1842	name_len = btrfs_inode_ref_name_len(eb: path->nodes[`0`],
1843	s: ref);
1844	ptr = (unsigned long)(ref + `1`) + name_len;
1845	nlink++;
1846	}
1847
1848	if (key.offset == `0`)
1849	break;
1850	if (path->slots[`0`] > `0`) {
1851	path->slots[`0`]--;
1852	goto process_slot;
1853	}
1854	key.offset--;
1855	btrfs_release_path(p: path);
1856	}
1857	btrfs_release_path(p: path);
1858
1859	return nlink;
1860	}
1861
1862	/*
1863	* There are a few corners where the link count of the file can't
1864	* be properly maintained during replay. So, instead of adding
1865	* lots of complexity to the log code, we just scan the backrefs
1866	* for any file that has been through replay.
1867	*
1868	* The scan will update the link count on the inode to reflect the
1869	* number of back refs found. If it goes down to zero, the iput
1870	* will free the inode.
1871	*/
1872	static noinline int fixup_inode_link_count(struct walk_control *wc,
1873	struct btrfs_inode *inode)
1874	{
1875	struct btrfs_trans_handle *trans = wc->trans;
1876	struct btrfs_root *root = inode->root;
1877	int ret;
1878	u64 nlink = `0`;
1879	const u64 ino = btrfs_ino(inode);
1880
1881	ret = count_inode_refs(inode, path: wc->subvol_path);
1882	if (ret < `0`)
1883	goto out;
1884
1885	nlink = ret;
1886
1887	ret = count_inode_extrefs(inode, path: wc->subvol_path);
1888	if (ret < `0`)
1889	goto out;
1890
1891	nlink += ret;
1892
1893	ret = `0`;
1894
1895	if (nlink != inode->vfs_inode.i_nlink) {
1896	set_nlink(inode: &inode->vfs_inode, nlink);
1897	ret = btrfs_update_inode(trans, inode);
1898	if (ret)
1899	goto out;
1900	}
1901	if (S_ISDIR(inode->vfs_inode.i_mode))
1902	inode->index_cnt = (u64)-`1`;
1903
1904	if (inode->vfs_inode.i_nlink == `0`) {
1905	if (S_ISDIR(inode->vfs_inode.i_mode)) {
1906	ret = replay_dir_deletes(wc, dirid: ino, del_all: true);
1907	if (ret)
1908	goto out;
1909	}
1910	ret = btrfs_insert_orphan_item(trans, root, offset: ino);
1911	if (ret == -EEXIST)
1912	ret = `0`;
1913	}
1914
1915	out:
1916	btrfs_release_path(p: wc->subvol_path);
1917	return ret;
1918	}
1919
1920	static noinline int fixup_inode_link_counts(struct walk_control *wc)
1921	{
1922	int ret;
1923	struct btrfs_key key;
1924
1925	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1926	key.type = BTRFS_ORPHAN_ITEM_KEY;
1927	key.offset = (u64)-`1`;
1928	while (`1`) {
1929	struct btrfs_trans_handle *trans = wc->trans;
1930	struct btrfs_root *root = wc->root;
1931	struct btrfs_inode *inode;
1932
1933	ret = btrfs_search_slot(trans, root, key: &key, p: wc->subvol_path, ins_len: -`1`, cow: `1`);
1934	if (ret < `0`)
1935	break;
1936
1937	if (ret == `1`) {
1938	ret = `0`;
1939	if (wc->subvol_path->slots[`0`] == `0`)
1940	break;
1941	wc->subvol_path->slots[`0`]--;
1942	}
1943
1944	btrfs_item_key_to_cpu(eb: wc->subvol_path->nodes[`0`], cpu_key: &key, nr: wc->subvol_path->slots[`0`]);
1945	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
1946	key.type != BTRFS_ORPHAN_ITEM_KEY)
1947	break;
1948
1949	ret = btrfs_del_item(trans, root, path: wc->subvol_path);
1950	if (ret)
1951	break;
1952
1953	btrfs_release_path(p: wc->subvol_path);
1954	inode = btrfs_iget_logging(objectid: key.offset, root);
1955	if (IS_ERR(ptr: inode)) {
1956	ret = PTR_ERR(ptr: inode);
1957	break;
1958	}
1959
1960	ret = fixup_inode_link_count(wc, inode);
1961	iput(&inode->vfs_inode);
1962	if (ret)
1963	break;
1964
1965	/*
1966	* fixup on a directory may create new entries,
1967	* make sure we always look for the highest possible
1968	* offset
1969	*/
1970	key.offset = (u64)-`1`;
1971	}
1972	btrfs_release_path(p: wc->subvol_path);
1973	return ret;
1974	}
1975
1976
1977	/*
1978	* record a given inode in the fixup dir so we can check its link
1979	* count when replay is done. The link count is incremented here
1980	* so the inode won't go away until we check it
1981	*/
1982	static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
1983	{
1984	struct btrfs_trans_handle *trans = wc->trans;
1985	struct btrfs_root *root = wc->root;
1986	struct btrfs_key key;
1987	int ret = `0`;
1988	struct btrfs_inode *inode;
1989	struct inode *vfs_inode;
1990
1991	inode = btrfs_iget_logging(objectid, root);
1992	if (IS_ERR(ptr: inode)) {
1993	ret = PTR_ERR(ptr: inode);
1994	btrfs_abort_log_replay(wc, ret,
1995	"failed to lookup inode %llu root %llu",
1996	objectid, btrfs_root_id(root));
1997	return ret;
1998	}
1999
2000	vfs_inode = &inode->vfs_inode;
2001	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
2002	key.type = BTRFS_ORPHAN_ITEM_KEY;
2003	key.offset = objectid;
2004
2005	ret = btrfs_insert_empty_item(trans, root, path: wc->subvol_path, key: &key, data_size: `0`);
2006
2007	btrfs_release_path(p: wc->subvol_path);
2008	if (ret == `0`) {
2009	if (!vfs_inode->i_nlink)
2010	set_nlink(inode: vfs_inode, nlink: `1`);
2011	else
2012	inc_nlink(inode: vfs_inode);
2013	ret = btrfs_update_inode(trans, inode);
2014	if (ret)
2015	btrfs_abort_log_replay(wc, ret,
2016	"failed to update inode %llu root %llu",
2017	objectid, btrfs_root_id(root));
2018	} else if (ret == -EEXIST) {
2019	ret = `0`;
2020	} else {
2021	btrfs_abort_log_replay(wc, ret,
2022	"failed to insert fixup item for inode %llu root %llu",
2023	objectid, btrfs_root_id(root));
2024	}
2025	iput(vfs_inode);
2026
2027	return ret;
2028	}
2029
2030	/*
2031	* when replaying the log for a directory, we only insert names
2032	* for inodes that actually exist. This means an fsync on a directory
2033	* does not implicitly fsync all the new files in it
2034	*/
2035	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
2036	struct btrfs_root *root,
2037	u64 dirid, u64 index,
2038	const struct fscrypt_str *name,
2039	struct btrfs_key *location)
2040	{
2041	struct btrfs_inode *inode;
2042	struct btrfs_inode *dir;
2043	int ret;
2044
2045	inode = btrfs_iget_logging(objectid: location->objectid, root);
2046	if (IS_ERR(ptr: inode))
2047	return PTR_ERR(ptr: inode);
2048
2049	dir = btrfs_iget_logging(objectid: dirid, root);
2050	if (IS_ERR(ptr: dir)) {
2051	iput(&inode->vfs_inode);
2052	return PTR_ERR(ptr: dir);
2053	}
2054
2055	ret = btrfs_add_link(trans, parent_inode: dir, inode, name, add_backref: `1`, index);
2056
2057	/ FIXME, put inode into FIXUP list /
2058
2059	iput(&inode->vfs_inode);
2060	iput(&dir->vfs_inode);
2061	return ret;
2062	}
2063
2064	static int delete_conflicting_dir_entry(struct walk_control *wc,
2065	struct btrfs_inode *dir,
2066	struct btrfs_dir_item *dst_di,
2067	const struct btrfs_key *log_key,
2068	u8 log_flags,
2069	bool exists)
2070	{
2071	struct btrfs_key found_key;
2072
2073	btrfs_dir_item_key_to_cpu(eb: wc->subvol_path->nodes[`0`], item: dst_di, cpu_key: &found_key);
2074	/ The existing dentry points to the same inode, don't delete it. /
2075	if (found_key.objectid == log_key->objectid &&
2076	found_key.type == log_key->type &&
2077	found_key.offset == log_key->offset &&
2078	btrfs_dir_flags(eb: wc->subvol_path->nodes[`0`], s: dst_di) == log_flags)
2079	return `1`;
2080
2081	/*
2082	* Don't drop the conflicting directory entry if the inode for the new
2083	* entry doesn't exist.
2084	*/
2085	if (!exists)
2086	return `0`;
2087
2088	return drop_one_dir_item(wc, dir, di: dst_di);
2089	}
2090
2091	/*
2092	* take a single entry in a log directory item and replay it into
2093	* the subvolume.
2094	*
2095	* if a conflicting item exists in the subdirectory already,
2096	* the inode it points to is unlinked and put into the link count
2097	* fix up tree.
2098	*
2099	* If a name from the log points to a file or directory that does
2100	* not exist in the FS, it is skipped. fsyncs on directories
2101	* do not force down inodes inside that directory, just changes to the
2102	* names or unlinks in a directory.
2103	*
2104	* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
2105	* non-existing inode) and 1 if the name was replayed.
2106	*/
2107	static noinline int replay_one_name(struct walk_control wc, struct* btrfs_dir_item *di)
2108	{
2109	struct btrfs_trans_handle *trans = wc->trans;
2110	struct btrfs_root *root = wc->root;
2111	struct fscrypt_str name = { `0` };
2112	struct btrfs_dir_item *dir_dst_di;
2113	struct btrfs_dir_item *index_dst_di;
2114	bool dir_dst_matches = false;
2115	bool index_dst_matches = false;
2116	struct btrfs_key log_key;
2117	struct btrfs_key search_key;
2118	struct btrfs_inode *dir;
2119	u8 log_flags;
2120	bool exists;
2121	int ret;
2122	bool update_size = true;
2123	bool name_added = false;
2124
2125	dir = btrfs_iget_logging(objectid: wc->log_key.objectid, root);
2126	if (IS_ERR(ptr: dir)) {
2127	ret = PTR_ERR(ptr: dir);
2128	btrfs_abort_log_replay(wc, ret,
2129	"failed to lookup dir inode %llu root %llu",
2130	wc->log_key.objectid, btrfs_root_id(root));
2131	return ret;
2132	}
2133
2134	ret = read_alloc_one_name(eb: wc->log_leaf, start: di + `1`,
2135	len: btrfs_dir_name_len(eb: wc->log_leaf, s: di), name: &name);
2136	if (ret) {
2137	btrfs_abort_log_replay(wc, ret,
2138	"failed to allocate name for dir %llu root %llu",
2139	btrfs_ino(dir), btrfs_root_id(root));
2140	goto out;
2141	}
2142
2143	log_flags = btrfs_dir_flags(eb: wc->log_leaf, s: di);
2144	btrfs_dir_item_key_to_cpu(eb: wc->log_leaf, item: di, cpu_key: &log_key);
2145	ret = btrfs_lookup_inode(trans, root, path: wc->subvol_path, location: &log_key, mod: `0`);
2146	btrfs_release_path(p: wc->subvol_path);
2147	if (ret < `0`) {
2148	btrfs_abort_log_replay(wc, ret,
2149	"failed to lookup inode %llu root %llu",
2150	log_key.objectid, btrfs_root_id(root));
2151	goto out;
2152	}
2153	exists = (ret == `0`);
2154	ret = `0`;
2155
2156	dir_dst_di = btrfs_lookup_dir_item(trans, root, path: wc->subvol_path,
2157	dir: wc->log_key.objectid, name: &name, mod: `1`);
2158	if (IS_ERR(ptr: dir_dst_di)) {
2159	ret = PTR_ERR(ptr: dir_dst_di);
2160	btrfs_abort_log_replay(wc, ret,
2161	"failed to lookup dir item for dir %llu name %.*s root %llu",
2162	wc->log_key.objectid, name.len, name.name,
2163	btrfs_root_id(root));
2164	goto out;
2165	} else if (dir_dst_di) {
2166	ret = delete_conflicting_dir_entry(wc, dir, dst_di: dir_dst_di,
2167	log_key: &log_key, log_flags, exists);
2168	if (ret < `0`) {
2169	btrfs_abort_log_replay(wc, ret,
2170	"failed to delete conflicting entry for dir %llu name %.*s root %llu",
2171	btrfs_ino(dir), name.len, name.name,
2172	btrfs_root_id(root));
2173	goto out;
2174	}
2175	dir_dst_matches = (ret == `1`);
2176	}
2177
2178	btrfs_release_path(p: wc->subvol_path);
2179
2180	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path: wc->subvol_path,
2181	dir: wc->log_key.objectid,
2182	index: wc->log_key.offset, name: &name, mod: `1`);
2183	if (IS_ERR(ptr: index_dst_di)) {
2184	ret = PTR_ERR(ptr: index_dst_di);
2185	btrfs_abort_log_replay(wc, ret,
2186	"failed to lookup dir index item for dir %llu name %.*s root %llu",
2187	wc->log_key.objectid, name.len, name.name,
2188	btrfs_root_id(root));
2189	goto out;
2190	} else if (index_dst_di) {
2191	ret = delete_conflicting_dir_entry(wc, dir, dst_di: index_dst_di,
2192	log_key: &log_key, log_flags, exists);
2193	if (ret < `0`) {
2194	btrfs_abort_log_replay(wc, ret,
2195	"failed to delete conflicting entry for dir %llu name %.*s root %llu",
2196	btrfs_ino(dir), name.len, name.name,
2197	btrfs_root_id(root));
2198	goto out;
2199	}
2200	index_dst_matches = (ret == `1`);
2201	}
2202
2203	btrfs_release_path(p: wc->subvol_path);
2204
2205	if (dir_dst_matches && index_dst_matches) {
2206	ret = `0`;
2207	update_size = false;
2208	goto out;
2209	}
2210
2211	/*
2212	* Check if the inode reference exists in the log for the given name,
2213	* inode and parent inode
2214	*/
2215	search_key.objectid = log_key.objectid;
2216	search_key.type = BTRFS_INODE_REF_KEY;
2217	search_key.offset = wc->log_key.objectid;
2218	ret = backref_in_log(log: root->log_root, key: &search_key, ref_objectid: `0`, name: &name);
2219	if (ret < `0`) {
2220	btrfs_abort_log_replay(wc, ret,
2221	"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
2222	search_key.objectid, btrfs_ino(dir),
2223	name.len, name.name, btrfs_root_id(root));
2224	goto out;
2225	} else if (ret) {
2226	/ The dentry will be added later. /
2227	ret = `0`;
2228	update_size = false;
2229	goto out;
2230	}
2231
2232	search_key.objectid = log_key.objectid;
2233	search_key.type = BTRFS_INODE_EXTREF_KEY;
2234	search_key.offset = btrfs_extref_hash(parent_objectid: wc->log_key.objectid, name: name.name, len: name.len);
2235	ret = backref_in_log(log: root->log_root, key: &search_key, ref_objectid: wc->log_key.objectid, name: &name);
2236	if (ret < `0`) {
2237	btrfs_abort_log_replay(wc, ret,
2238	"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
2239	search_key.objectid, btrfs_ino(dir),
2240	name.len, name.name, btrfs_root_id(root));
2241	goto out;
2242	} else if (ret) {
2243	/ The dentry will be added later. /
2244	ret = `0`;
2245	update_size = false;
2246	goto out;
2247	}
2248	ret = insert_one_name(trans, root, dirid: wc->log_key.objectid, index: wc->log_key.offset,
2249	name: &name, location: &log_key);
2250	if (ret && ret != -ENOENT && ret != -EEXIST) {
2251	btrfs_abort_log_replay(wc, ret,
2252	"failed to insert name %.*s for inode %llu dir %llu root %llu",
2253	name.len, name.name, log_key.objectid,
2254	btrfs_ino(dir), btrfs_root_id(root));
2255	goto out;
2256	}
2257	if (!ret)
2258	name_added = true;
2259	update_size = false;
2260	ret = `0`;
2261
2262	out:
2263	if (!ret && update_size) {
2264	btrfs_i_size_write(inode: dir, size: dir->vfs_inode.i_size + name.len * `2`);
2265	ret = btrfs_update_inode(trans, inode: dir);
2266	if (ret)
2267	btrfs_abort_log_replay(wc, ret,
2268	"failed to update dir inode %llu root %llu",
2269	btrfs_ino(dir), btrfs_root_id(root));
2270	}
2271	kfree(objp: name.name);
2272	iput(&dir->vfs_inode);
2273	if (!ret && name_added)
2274	ret = `1`;
2275	return ret;
2276	}
2277
2278	/ Replay one dir item from a BTRFS_DIR_INDEX_KEY key. /
2279	static noinline int replay_one_dir_item(struct walk_control *wc)
2280	{
2281	int ret;
2282	struct btrfs_dir_item *di;
2283
2284	/ We only log dir index keys, which only contain a single dir item. /
2285	ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY,
2286	"wc->log_key.type=%u", wc->log_key.type);
2287
2288	di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
2289	ret = replay_one_name(wc, di);
2290	if (ret < `0`)
2291	return ret;
2292
2293	/*
2294	* If this entry refers to a non-directory (directories can not have a
2295	* link count > 1) and it was added in the transaction that was not
2296	* committed, make sure we fixup the link count of the inode the entry
2297	* points to. Otherwise something like the following would result in a
2298	* directory pointing to an inode with a wrong link that does not account
2299	* for this dir entry:
2300	*
2301	* mkdir testdir
2302	* touch testdir/foo
2303	* touch testdir/bar
2304	* sync
2305	*
2306	* ln testdir/bar testdir/bar_link
2307	* ln testdir/foo testdir/foo_link
2308	* xfs_io -c "fsync" testdir/bar
2309	*
2310	* <power failure>
2311	*
2312	* mount fs, log replay happens
2313	*
2314	* File foo would remain with a link count of 1 when it has two entries
2315	* pointing to it in the directory testdir. This would make it impossible
2316	* to ever delete the parent directory has it would result in stale
2317	* dentries that can never be deleted.
2318	*/
2319	if (ret == `1` && btrfs_dir_ftype(eb: wc->log_leaf, item: di) != BTRFS_FT_DIR) {
2320	struct btrfs_key di_key;
2321
2322	btrfs_dir_item_key_to_cpu(eb: wc->log_leaf, item: di, cpu_key: &di_key);
2323	ret = link_to_fixup_dir(wc, objectid: di_key.objectid);
2324	}
2325
2326	return ret;
2327	}
2328
2329	/*
2330	* directory replay has two parts. There are the standard directory
2331	* items in the log copied from the subvolume, and range items
2332	* created in the log while the subvolume was logged.
2333	*
2334	* The range items tell us which parts of the key space the log
2335	* is authoritative for. During replay, if a key in the subvolume
2336	* directory is in a logged range item, but not actually in the log
2337	* that means it was deleted from the directory before the fsync
2338	* and should be removed.
2339	*/
2340	static noinline int find_dir_range(struct btrfs_root *root,
2341	struct btrfs_path *path,
2342	u64 dirid,
2343	u64 start_ret, u64 end_ret)
2344	{
2345	struct btrfs_key key;
2346	u64 found_end;
2347	struct btrfs_dir_log_item *item;
2348	int ret;
2349	int nritems;
2350
2351	if (*start_ret == (u64)-`1`)
2352	return `1`;
2353
2354	key.objectid = dirid;
2355	key.type = BTRFS_DIR_LOG_INDEX_KEY;
2356	key.offset = *start_ret;
2357
2358	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
2359	if (ret < `0`)
2360	goto out;
2361	if (ret > `0`) {
2362	if (path->slots[`0`] == `0`)
2363	goto out;
2364	path->slots[`0`]--;
2365	}
2366	if (ret != `0`)
2367	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
2368
2369	if (key.type != BTRFS_DIR_LOG_INDEX_KEY \|\| key.objectid != dirid) {
2370	ret = `1`;
2371	goto next;
2372	}
2373	item = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
2374	struct btrfs_dir_log_item);
2375	found_end = btrfs_dir_log_end(eb: path->nodes[`0`], s: item);
2376
2377	if (start_ret >= key.offset && start_ret <= found_end) {
2378	ret = `0`;
2379	*start_ret = key.offset;
2380	*end_ret = found_end;
2381	goto out;
2382	}
2383	ret = `1`;
2384	next:
2385	/ check the next slot in the tree to see if it is a valid item /
2386	nritems = btrfs_header_nritems(eb: path->nodes[`0`]);
2387	path->slots[`0`]++;
2388	if (path->slots[`0`] >= nritems) {
2389	ret = btrfs_next_leaf(root, path);
2390	if (ret)
2391	goto out;
2392	}
2393
2394	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
2395
2396	if (key.type != BTRFS_DIR_LOG_INDEX_KEY \|\| key.objectid != dirid) {
2397	ret = `1`;
2398	goto out;
2399	}
2400	item = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
2401	struct btrfs_dir_log_item);
2402	found_end = btrfs_dir_log_end(eb: path->nodes[`0`], s: item);
2403	*start_ret = key.offset;
2404	*end_ret = found_end;
2405	ret = `0`;
2406	out:
2407	btrfs_release_path(p: path);
2408	return ret;
2409	}
2410
2411	/*
2412	* this looks for a given directory item in the log. If the directory
2413	* item is not in the log, the item is removed and the inode it points
2414	* to is unlinked
2415	*/
2416	static noinline int check_item_in_log(struct walk_control *wc,
2417	struct btrfs_path *log_path,
2418	struct btrfs_inode *dir,
2419	struct btrfs_key *dir_key,
2420	bool force_remove)
2421	{
2422	struct btrfs_trans_handle *trans = wc->trans;
2423	struct btrfs_root *root = dir->root;
2424	int ret;
2425	struct extent_buffer *eb;
2426	int slot;
2427	struct btrfs_dir_item *di;
2428	struct fscrypt_str name = { `0` };
2429	struct btrfs_inode *inode = NULL;
2430	struct btrfs_key location;
2431
2432	/*
2433	* Currently we only log dir index keys. Even if we replay a log created
2434	* by an older kernel that logged both dir index and dir item keys, all
2435	* we need to do is process the dir index keys, we (and our caller) can
2436	* safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2437	*/
2438	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type);
2439
2440	eb = wc->subvol_path->nodes[`0`];
2441	slot = wc->subvol_path->slots[`0`];
2442	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2443	ret = read_alloc_one_name(eb, start: di + `1`, len: btrfs_dir_name_len(eb, s: di), name: &name);
2444	if (ret) {
2445	btrfs_abort_log_replay(wc, ret,
2446	"failed to allocate name for dir %llu index %llu root %llu",
2447	btrfs_ino(dir), dir_key->offset,
2448	btrfs_root_id(root));
2449	goto out;
2450	}
2451
2452	if (!force_remove) {
2453	struct btrfs_dir_item *log_di;
2454
2455	log_di = btrfs_lookup_dir_index_item(trans, root: wc->log, path: log_path,
2456	dir: dir_key->objectid,
2457	index: dir_key->offset, name: &name, mod: `0`);
2458	if (IS_ERR(ptr: log_di)) {
2459	ret = PTR_ERR(ptr: log_di);
2460	btrfs_abort_log_replay(wc, ret,
2461	"failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
2462	btrfs_ino(dir), dir_key->offset,
2463	name.len, name.name,
2464	btrfs_root_id(root));
2465	goto out;
2466	} else if (log_di) {
2467	/ The dentry exists in the log, we have nothing to do. /
2468	ret = `0`;
2469	goto out;
2470	}
2471	}
2472
2473	btrfs_dir_item_key_to_cpu(eb, item: di, cpu_key: &location);
2474	btrfs_release_path(p: wc->subvol_path);
2475	btrfs_release_path(p: log_path);
2476	inode = btrfs_iget_logging(objectid: location.objectid, root);
2477	if (IS_ERR(ptr: inode)) {
2478	ret = PTR_ERR(ptr: inode);
2479	inode = NULL;
2480	btrfs_abort_log_replay(wc, ret,
2481	"failed to lookup inode %llu root %llu",
2482	location.objectid, btrfs_root_id(root));
2483	goto out;
2484	}
2485
2486	ret = link_to_fixup_dir(wc, objectid: location.objectid);
2487	if (ret)
2488	goto out;
2489
2490	inc_nlink(inode: &inode->vfs_inode);
2491	ret = unlink_inode_for_log_replay(wc, dir, inode, name: &name);
2492	/*
2493	* Unlike dir item keys, dir index keys can only have one name (entry) in
2494	* them, as there are no key collisions since each key has a unique offset
2495	* (an index number), so we're done.
2496	*/
2497	out:
2498	btrfs_release_path(p: wc->subvol_path);
2499	btrfs_release_path(p: log_path);
2500	kfree(objp: name.name);
2501	if (inode)
2502	iput(&inode->vfs_inode);
2503	return ret;
2504	}
2505
2506	static int replay_xattr_deletes(struct walk_control *wc)
2507	{
2508	struct btrfs_trans_handle *trans = wc->trans;
2509	struct btrfs_root *root = wc->root;
2510	struct btrfs_root *log = wc->log;
2511	struct btrfs_key search_key;
2512	BTRFS_PATH_AUTO_FREE(log_path);
2513	const u64 ino = wc->log_key.objectid;
2514	int nritems;
2515	int ret;
2516
2517	log_path = btrfs_alloc_path();
2518	if (!log_path) {
2519	btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2520	return -ENOMEM;
2521	}
2522
2523	search_key.objectid = ino;
2524	search_key.type = BTRFS_XATTR_ITEM_KEY;
2525	search_key.offset = `0`;
2526	again:
2527	ret = btrfs_search_slot(NULL, root, key: &search_key, p: wc->subvol_path, ins_len: `0`, cow: `0`);
2528	if (ret < `0`) {
2529	btrfs_abort_log_replay(wc, ret,
2530	"failed to search xattrs for inode %llu root %llu",
2531	ino, btrfs_root_id(root));
2532	goto out;
2533	}
2534	process_leaf:
2535	nritems = btrfs_header_nritems(eb: wc->subvol_path->nodes[`0`]);
2536	for (int i = wc->subvol_path->slots[`0`]; i < nritems; i++) {
2537	struct btrfs_key key;
2538	struct btrfs_dir_item *di;
2539	struct btrfs_dir_item *log_di;
2540	u32 total_size;
2541	u32 cur;
2542
2543	btrfs_item_key_to_cpu(eb: wc->subvol_path->nodes[`0`], cpu_key: &key, nr: i);
2544	if (key.objectid != ino \|\| key.type != BTRFS_XATTR_ITEM_KEY) {
2545	ret = `0`;
2546	goto out;
2547	}
2548
2549	di = btrfs_item_ptr(wc->subvol_path->nodes[`0`], i, struct btrfs_dir_item);
2550	total_size = btrfs_item_size(eb: wc->subvol_path->nodes[`0`], slot: i);
2551	cur = `0`;
2552	while (cur < total_size) {
2553	u16 name_len = btrfs_dir_name_len(eb: wc->subvol_path->nodes[`0`], s: di);
2554	u16 data_len = btrfs_dir_data_len(eb: wc->subvol_path->nodes[`0`], s: di);
2555	u32 this_len = sizeof(*di) + name_len + data_len;
2556	char *name;
2557
2558	name = kmalloc(name_len, GFP_NOFS);
2559	if (!name) {
2560	ret = -ENOMEM;
2561	btrfs_abort_log_replay(wc, ret,
2562	"failed to allocate memory for name of length %u",
2563	name_len);
2564	goto out;
2565	}
2566	read_extent_buffer(eb: wc->subvol_path->nodes[`0`], dst: name,
2567	start: (unsigned long)(di + `1`), len: name_len);
2568
2569	log_di = btrfs_lookup_xattr(NULL, root: log, path: log_path, dir: ino,
2570	name, name_len, mod: `0`);
2571	btrfs_release_path(p: log_path);
2572	if (!log_di) {
2573	/ Doesn't exist in log tree, so delete it. /
2574	btrfs_release_path(p: wc->subvol_path);
2575	di = btrfs_lookup_xattr(trans, root, path: wc->subvol_path, dir: ino,
2576	name, name_len, mod: -`1`);
2577	if (IS_ERR(ptr: di)) {
2578	ret = PTR_ERR(ptr: di);
2579	btrfs_abort_log_replay(wc, ret,
2580	"failed to lookup xattr with name %.*s for inode %llu root %llu",
2581	name_len, name, ino,
2582	btrfs_root_id(root));
2583	kfree(objp: name);
2584	goto out;
2585	}
2586	ASSERT(di);
2587	ret = btrfs_delete_one_dir_name(trans, root,
2588	path: wc->subvol_path, di);
2589	if (ret) {
2590	btrfs_abort_log_replay(wc, ret,
2591	"failed to delete xattr with name %.*s for inode %llu root %llu",
2592	name_len, name, ino,
2593	btrfs_root_id(root));
2594	kfree(objp: name);
2595	goto out;
2596	}
2597	btrfs_release_path(p: wc->subvol_path);
2598	kfree(objp: name);
2599	search_key = key;
2600	goto again;
2601	}
2602	if (IS_ERR(ptr: log_di)) {
2603	ret = PTR_ERR(ptr: log_di);
2604	btrfs_abort_log_replay(wc, ret,
2605	"failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
2606	name_len, name, ino,
2607	btrfs_root_id(root));
2608	kfree(objp: name);
2609	goto out;
2610	}
2611	kfree(objp: name);
2612	cur += this_len;
2613	di = (struct btrfs_dir_item )((char* *)di + this_len);
2614	}
2615	}
2616	ret = btrfs_next_leaf(root, path: wc->subvol_path);
2617	if (ret > `0`)
2618	ret = `0`;
2619	else if (ret == `0`)
2620	goto process_leaf;
2621	else
2622	btrfs_abort_log_replay(wc, ret,
2623	"failed to get next leaf in subvolume root %llu",
2624	btrfs_root_id(root));
2625	out:
2626	btrfs_release_path(p: wc->subvol_path);
2627	return ret;
2628	}
2629
2630
2631	/*
2632	* deletion replay happens before we copy any new directory items
2633	* out of the log or out of backreferences from inodes. It
2634	* scans the log to find ranges of keys that log is authoritative for,
2635	* and then scans the directory to find items in those ranges that are
2636	* not present in the log.
2637	*
2638	* Anything we don't find in the log is unlinked and removed from the
2639	* directory.
2640	*/
2641	static noinline int replay_dir_deletes(struct walk_control *wc,
2642	u64 dirid, bool del_all)
2643	{
2644	struct btrfs_root *root = wc->root;
2645	struct btrfs_root *log = (del_all ? NULL : wc->log);
2646	u64 range_start;
2647	u64 range_end;
2648	int ret = `0`;
2649	struct btrfs_key dir_key;
2650	struct btrfs_key found_key;
2651	BTRFS_PATH_AUTO_FREE(log_path);
2652	struct btrfs_inode *dir;
2653
2654	dir_key.objectid = dirid;
2655	dir_key.type = BTRFS_DIR_INDEX_KEY;
2656	log_path = btrfs_alloc_path();
2657	if (!log_path) {
2658	btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2659	return -ENOMEM;
2660	}
2661
2662	dir = btrfs_iget_logging(objectid: dirid, root);
2663	/*
2664	* It isn't an error if the inode isn't there, that can happen because
2665	* we replay the deletes before we copy in the inode item from the log.
2666	*/
2667	if (IS_ERR(ptr: dir)) {
2668	ret = PTR_ERR(ptr: dir);
2669	if (ret == -ENOENT)
2670	ret = `0`;
2671	else
2672	btrfs_abort_log_replay(wc, ret,
2673	"failed to lookup dir inode %llu root %llu",
2674	dirid, btrfs_root_id(root));
2675	return ret;
2676	}
2677
2678	range_start = `0`;
2679	range_end = `0`;
2680	while (`1`) {
2681	if (del_all)
2682	range_end = (u64)-`1`;
2683	else {
2684	ret = find_dir_range(root: log, path: wc->subvol_path, dirid,
2685	start_ret: &range_start, end_ret: &range_end);
2686	if (ret < `0`) {
2687	btrfs_abort_log_replay(wc, ret,
2688	"failed to find range for dir %llu in log tree root %llu",
2689	dirid, btrfs_root_id(root));
2690	goto out;
2691	} else if (ret > `0`) {
2692	break;
2693	}
2694	}
2695
2696	dir_key.offset = range_start;
2697	while (`1`) {
2698	int nritems;
2699	ret = btrfs_search_slot(NULL, root, key: &dir_key,
2700	p: wc->subvol_path, ins_len: `0`, cow: `0`);
2701	if (ret < `0`) {
2702	btrfs_abort_log_replay(wc, ret,
2703	"failed to search root %llu for key " BTRFS_KEY_FMT,
2704	btrfs_root_id(root),
2705	BTRFS_KEY_FMT_VALUE(&dir_key));
2706	goto out;
2707	}
2708
2709	nritems = btrfs_header_nritems(eb: wc->subvol_path->nodes[`0`]);
2710	if (wc->subvol_path->slots[`0`] >= nritems) {
2711	ret = btrfs_next_leaf(root, path: wc->subvol_path);
2712	if (ret == `1`) {
2713	break;
2714	} else if (ret < `0`) {
2715	btrfs_abort_log_replay(wc, ret,
2716	"failed to get next leaf in subvolume root %llu",
2717	btrfs_root_id(root));
2718	goto out;
2719	}
2720	}
2721	btrfs_item_key_to_cpu(eb: wc->subvol_path->nodes[`0`], cpu_key: &found_key,
2722	nr: wc->subvol_path->slots[`0`]);
2723	if (found_key.objectid != dirid \|\|
2724	found_key.type != dir_key.type) {
2725	ret = `0`;
2726	goto out;
2727	}
2728
2729	if (found_key.offset > range_end)
2730	break;
2731
2732	ret = check_item_in_log(wc, log_path, dir, dir_key: &found_key, force_remove: del_all);
2733	if (ret)
2734	goto out;
2735	if (found_key.offset == (u64)-`1`)
2736	break;
2737	dir_key.offset = found_key.offset + `1`;
2738	}
2739	btrfs_release_path(p: wc->subvol_path);
2740	if (range_end == (u64)-`1`)
2741	break;
2742	range_start = range_end + `1`;
2743	}
2744	ret = `0`;
2745	out:
2746	btrfs_release_path(p: wc->subvol_path);
2747	iput(&dir->vfs_inode);
2748	return ret;
2749	}
2750
2751	/*
2752	* the process_func used to replay items from the log tree. This
2753	* gets called in two different stages. The first stage just looks
2754	* for inodes and makes sure they are all copied into the subvolume.
2755	*
2756	* The second stage copies all the other item types from the log into
2757	* the subvolume. The two stage approach is slower, but gets rid of
2758	* lots of complexity around inodes referencing other inodes that exist
2759	* only in the log (references come from either directory items or inode
2760	* back refs).
2761	*/
2762	static int replay_one_buffer(struct extent_buffer *eb,
2763	struct walk_control wc, u64 gen, int* level)
2764	{
2765	int nritems;
2766	struct btrfs_tree_parent_check check = {
2767	.transid = gen,
2768	.level = level
2769	};
2770	struct btrfs_root *root = wc->root;
2771	struct btrfs_trans_handle *trans = wc->trans;
2772	int ret;
2773
2774	if (level != `0`)
2775	return `0`;
2776
2777	/*
2778	* Set to NULL since it was not yet read and in case we abort log replay
2779	* on error, we have no valid log tree leaf to dump.
2780	*/
2781	wc->log_leaf = NULL;
2782	ret = btrfs_read_extent_buffer(buf: eb, check: &check);
2783	if (ret) {
2784	btrfs_abort_log_replay(wc, ret,
2785	"failed to read log tree leaf %llu for root %llu",
2786	eb->start, btrfs_root_id(root));
2787	return ret;
2788	}
2789
2790	ASSERT(wc->subvol_path == NULL);
2791	wc->subvol_path = btrfs_alloc_path();
2792	if (!wc->subvol_path) {
2793	btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2794	return -ENOMEM;
2795	}
2796
2797	wc->log_leaf = eb;
2798
2799	nritems = btrfs_header_nritems(eb);
2800	for (wc->log_slot = `0`; wc->log_slot < nritems; wc->log_slot++) {
2801	struct btrfs_inode_item *inode_item = NULL;
2802
2803	btrfs_item_key_to_cpu(eb, cpu_key: &wc->log_key, nr: wc->log_slot);
2804
2805	if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
2806	inode_item = btrfs_item_ptr(eb, wc->log_slot,
2807	struct btrfs_inode_item);
2808	/*
2809	* An inode with no links is either:
2810	*
2811	* 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
2812	* got linked before the fsync, skip it, as replaying
2813	* it is pointless since it would be deleted later.
2814	* We skip logging tmpfiles, but it's always possible
2815	* we are replaying a log created with a kernel that
2816	* used to log tmpfiles;
2817	*
2818	* 2) A non-tmpfile which got its last link deleted
2819	* while holding an open fd on it and later got
2820	* fsynced through that fd. We always log the
2821	* parent inodes when inode->last_unlink_trans is
2822	* set to the current transaction, so ignore all the
2823	* inode items for this inode. We will delete the
2824	* inode when processing the parent directory with
2825	* replay_dir_deletes().
2826	*/
2827	if (btrfs_inode_nlink(eb, s: inode_item) == `0`) {
2828	wc->ignore_cur_inode = true;
2829	continue;
2830	} else {
2831	wc->ignore_cur_inode = false;
2832	}
2833	}
2834
2835	/ Inode keys are done during the first stage. /
2836	if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
2837	wc->stage == LOG_WALK_REPLAY_INODES) {
2838	u32 mode;
2839
2840	ret = replay_xattr_deletes(wc);
2841	if (ret)
2842	break;
2843	mode = btrfs_inode_mode(eb, s: inode_item);
2844	if (S_ISDIR(mode)) {
2845	ret = replay_dir_deletes(wc, dirid: wc->log_key.objectid, del_all: false);
2846	if (ret)
2847	break;
2848	}
2849	ret = overwrite_item(wc);
2850	if (ret)
2851	break;
2852
2853	/*
2854	* Before replaying extents, truncate the inode to its
2855	* size. We need to do it now and not after log replay
2856	* because before an fsync we can have prealloc extents
2857	* added beyond the inode's i_size. If we did it after,
2858	* through orphan cleanup for example, we would drop
2859	* those prealloc extents just after replaying them.
2860	*/
2861	if (S_ISREG(mode)) {
2862	struct btrfs_drop_extents_args drop_args = { `0` };
2863	struct btrfs_inode *inode;
2864	u64 from;
2865
2866	inode = btrfs_iget_logging(objectid: wc->log_key.objectid, root);
2867	if (IS_ERR(ptr: inode)) {
2868	ret = PTR_ERR(ptr: inode);
2869	btrfs_abort_log_replay(wc, ret,
2870	"failed to lookup inode %llu root %llu",
2871	wc->log_key.objectid,
2872	btrfs_root_id(root));
2873	break;
2874	}
2875	from = ALIGN(i_size_read(&inode->vfs_inode),
2876	root->fs_info->sectorsize);
2877	drop_args.start = from;
2878	drop_args.end = (u64)-`1`;
2879	drop_args.drop_cache = true;
2880	drop_args.path = wc->subvol_path;
2881	ret = btrfs_drop_extents(trans, root, inode, args: &drop_args);
2882	if (ret) {
2883	btrfs_abort_log_replay(wc, ret,
2884	"failed to drop extents for inode %llu root %llu offset %llu",
2885	btrfs_ino(inode),
2886	btrfs_root_id(root),
2887	from);
2888	} else {
2889	inode_sub_bytes(inode: &inode->vfs_inode,
2890	bytes: drop_args.bytes_found);
2891	/ Update the inode's nbytes. /
2892	ret = btrfs_update_inode(trans, inode);
2893	if (ret)
2894	btrfs_abort_log_replay(wc, ret,
2895	"failed to update inode %llu root %llu",
2896	btrfs_ino(inode),
2897	btrfs_root_id(root));
2898	}
2899	iput(&inode->vfs_inode);
2900	if (ret)
2901	break;
2902	}
2903
2904	ret = link_to_fixup_dir(wc, objectid: wc->log_key.objectid);
2905	if (ret)
2906	break;
2907	}
2908
2909	if (wc->ignore_cur_inode)
2910	continue;
2911
2912	if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
2913	wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2914	ret = replay_one_dir_item(wc);
2915	if (ret)
2916	break;
2917	}
2918
2919	if (wc->stage < LOG_WALK_REPLAY_ALL)
2920	continue;
2921
2922	/* these keys are simply copied */
2923	if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
2924	ret = overwrite_item(wc);
2925	if (ret)
2926	break;
2927	} else if (wc->log_key.type == BTRFS_INODE_REF_KEY \|\|
2928	wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
2929	ret = add_inode_ref(wc);
2930	if (ret)
2931	break;
2932	} else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
2933	ret = replay_one_extent(wc);
2934	if (ret)
2935	break;
2936	}
2937	/*
2938	* We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2939	* BTRFS_DIR_INDEX_KEY items which we use to derive the
2940	* BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2941	* older kernel with such keys, ignore them.
2942	*/
2943	}
2944	btrfs_free_path(p: wc->subvol_path);
2945	wc->subvol_path = NULL;
2946	return ret;
2947	}
2948
2949	static int clean_log_buffer(struct btrfs_trans_handle *trans,
2950	struct extent_buffer *eb)
2951	{
2952	struct btrfs_fs_info *fs_info = eb->fs_info;
2953	struct btrfs_block_group *bg;
2954
2955	btrfs_tree_lock(eb);
2956	btrfs_clear_buffer_dirty(trans, buf: eb);
2957	wait_on_extent_buffer_writeback(eb);
2958	btrfs_tree_unlock(eb);
2959
2960	if (trans) {
2961	int ret;
2962
2963	ret = btrfs_pin_reserved_extent(trans, eb);
2964	if (ret)
2965	btrfs_abort_transaction(trans, ret);
2966	return ret;
2967	}
2968
2969	bg = btrfs_lookup_block_group(info: fs_info, bytenr: eb->start);
2970	if (!bg) {
2971	btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
2972	btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
2973	return -ENOENT;
2974	}
2975
2976	spin_lock(lock: &bg->space_info->lock);
2977	spin_lock(lock: &bg->lock);
2978	bg->reserved -= fs_info->nodesize;
2979	bg->space_info->bytes_reserved -= fs_info->nodesize;
2980	spin_unlock(lock: &bg->lock);
2981	spin_unlock(lock: &bg->space_info->lock);
2982
2983	btrfs_put_block_group(cache: bg);
2984
2985	return `0`;
2986	}
2987
2988	static noinline int walk_down_log_tree(struct btrfs_path path, int* *level,
2989	struct walk_control *wc)
2990	{
2991	struct btrfs_trans_handle *trans = wc->trans;
2992	struct btrfs_fs_info *fs_info = wc->log->fs_info;
2993	u64 bytenr;
2994	u64 ptr_gen;
2995	struct extent_buffer *next;
2996	struct extent_buffer *cur;
2997	int ret = `0`;
2998
2999	while (*level > `0`) {
3000	struct btrfs_tree_parent_check check = { `0` };
3001
3002	cur = path->nodes[*level];
3003
3004	WARN_ON(btrfs_header_level(cur) != *level);
3005
3006	if (path->slots[*level] >=
3007	btrfs_header_nritems(eb: cur))
3008	break;
3009
3010	bytenr = btrfs_node_blockptr(eb: cur, nr: path->slots[*level]);
3011	ptr_gen = btrfs_node_ptr_generation(eb: cur, nr: path->slots[*level]);
3012	check.transid = ptr_gen;
3013	check.level = *level - `1`;
3014	check.has_first_key = true;
3015	btrfs_node_key_to_cpu(eb: cur, cpu_key: &check.first_key, nr: path->slots[*level]);
3016
3017	next = btrfs_find_create_tree_block(fs_info, bytenr,
3018	owner_root: btrfs_header_owner(eb: cur),
3019	level: *level - `1`);
3020	if (IS_ERR(ptr: next)) {
3021	ret = PTR_ERR(ptr: next);
3022	if (trans)
3023	btrfs_abort_transaction(trans, ret);
3024	else
3025	btrfs_handle_fs_error(fs_info, ret, NULL);
3026	return ret;
3027	}
3028
3029	if (*level == `1`) {
3030	ret = wc->process_func(next, wc, ptr_gen, *level - `1`);
3031	if (ret) {
3032	free_extent_buffer(eb: next);
3033	return ret;
3034	}
3035
3036	path->slots[*level]++;
3037	if (wc->free) {
3038	ret = btrfs_read_extent_buffer(buf: next, check: &check);
3039	if (ret) {
3040	free_extent_buffer(eb: next);
3041	if (trans)
3042	btrfs_abort_transaction(trans, ret);
3043	else
3044	btrfs_handle_fs_error(fs_info, ret, NULL);
3045	return ret;
3046	}
3047
3048	ret = clean_log_buffer(trans, eb: next);
3049	if (ret) {
3050	free_extent_buffer(eb: next);
3051	return ret;
3052	}
3053	}
3054	free_extent_buffer(eb: next);
3055	continue;
3056	}
3057	ret = btrfs_read_extent_buffer(buf: next, check: &check);
3058	if (ret) {
3059	free_extent_buffer(eb: next);
3060	if (trans)
3061	btrfs_abort_transaction(trans, ret);
3062	else
3063	btrfs_handle_fs_error(fs_info, ret, NULL);
3064	return ret;
3065	}
3066
3067	if (path->nodes[*level-`1`])
3068	free_extent_buffer(eb: path->nodes[*level-`1`]);
3069	path->nodes[*level-`1`] = next;
3070	*level = btrfs_header_level(eb: next);
3071	path->slots[*level] = `0`;
3072	cond_resched();
3073	}
3074	path->slots[level] = btrfs_header_nritems(eb: path->nodes[level]);
3075
3076	cond_resched();
3077	return `0`;
3078	}
3079
3080	static noinline int walk_up_log_tree(struct btrfs_path path, int* *level,
3081	struct walk_control *wc)
3082	{
3083	int i;
3084	int slot;
3085	int ret;
3086
3087	for (i = *level; i < BTRFS_MAX_LEVEL - `1` && path->nodes[i]; i++) {
3088	slot = path->slots[i];
3089	if (slot + `1` < btrfs_header_nritems(eb: path->nodes[i])) {
3090	path->slots[i]++;
3091	*level = i;
3092	WARN_ON(*level == `0`);
3093	return `0`;
3094	} else {
3095	ret = wc->process_func(path->nodes[*level], wc,
3096	btrfs_header_generation(eb: path->nodes[*level]),
3097	*level);
3098	if (ret)
3099	return ret;
3100
3101	if (wc->free) {
3102	ret = clean_log_buffer(trans: wc->trans, eb: path->nodes[*level]);
3103	if (ret)
3104	return ret;
3105	}
3106	free_extent_buffer(eb: path->nodes[*level]);
3107	path->nodes[*level] = NULL;
3108	*level = i + `1`;
3109	}
3110	}
3111	return `1`;
3112	}
3113
3114	/*
3115	* drop the reference count on the tree rooted at 'snap'. This traverses
3116	* the tree freeing any blocks that have a ref count of zero after being
3117	* decremented.
3118	*/
3119	static int walk_log_tree(struct walk_control *wc)
3120	{
3121	struct btrfs_root *log = wc->log;
3122	int ret = `0`;
3123	int wret;
3124	int level;
3125	BTRFS_PATH_AUTO_FREE(path);
3126	int orig_level;
3127
3128	path = btrfs_alloc_path();
3129	if (!path)
3130	return -ENOMEM;
3131
3132	level = btrfs_header_level(eb: log->node);
3133	orig_level = level;
3134	path->nodes[level] = log->node;
3135	refcount_inc(r: &log->node->refs);
3136	path->slots[level] = `0`;
3137
3138	while (`1`) {
3139	wret = walk_down_log_tree(path, level: &level, wc);
3140	if (wret > `0`)
3141	break;
3142	if (wret < `0`)
3143	return wret;
3144
3145	wret = walk_up_log_tree(path, level: &level, wc);
3146	if (wret > `0`)
3147	break;
3148	if (wret < `0`)
3149	return wret;
3150	}
3151
3152	/ was the root node processed? if not, catch it here /
3153	if (path->nodes[orig_level]) {
3154	ret = wc->process_func(path->nodes[orig_level], wc,
3155	btrfs_header_generation(eb: path->nodes[orig_level]),
3156	orig_level);
3157	if (ret)
3158	return ret;
3159	if (wc->free)
3160	ret = clean_log_buffer(trans: wc->trans, eb: path->nodes[orig_level]);
3161	}
3162
3163	return ret;
3164	}
3165
3166	/*
3167	* helper function to update the item for a given subvolumes log root
3168	* in the tree of log roots
3169	*/
3170	static int update_log_root(struct btrfs_trans_handle *trans,
3171	struct btrfs_root *log,
3172	struct btrfs_root_item *root_item)
3173	{
3174	struct btrfs_fs_info *fs_info = log->fs_info;
3175	int ret;
3176
3177	if (log->log_transid == `1`) {
3178	/ insert root item on the first sync /
3179	ret = btrfs_insert_root(trans, root: fs_info->log_root_tree,
3180	key: &log->root_key, item: root_item);
3181	} else {
3182	ret = btrfs_update_root(trans, root: fs_info->log_root_tree,
3183	key: &log->root_key, item: root_item);
3184	}
3185	return ret;
3186	}
3187
3188	static void wait_log_commit(struct btrfs_root root, int* transid)
3189	{
3190	DEFINE_WAIT(wait);
3191	int index = transid % `2`;
3192
3193	/*
3194	* we only allow two pending log transactions at a time,
3195	* so we know that if ours is more than 2 older than the
3196	* current transaction, we're done
3197	*/
3198	for (;;) {
3199	prepare_to_wait(wq_head: &root->log_commit_wait[index],
3200	wq_entry: &wait, TASK_UNINTERRUPTIBLE);
3201
3202	if (!(root->log_transid_committed < transid &&
3203	atomic_read(v: &root->log_commit[index])))
3204	break;
3205
3206	mutex_unlock(lock: &root->log_mutex);
3207	schedule();
3208	mutex_lock(&root->log_mutex);
3209	}
3210	finish_wait(wq_head: &root->log_commit_wait[index], wq_entry: &wait);
3211	}
3212
3213	static void wait_for_writer(struct btrfs_root *root)
3214	{
3215	DEFINE_WAIT(wait);
3216
3217	for (;;) {
3218	prepare_to_wait(wq_head: &root->log_writer_wait, wq_entry: &wait,
3219	TASK_UNINTERRUPTIBLE);
3220	if (!atomic_read(v: &root->log_writers))
3221	break;
3222
3223	mutex_unlock(lock: &root->log_mutex);
3224	schedule();
3225	mutex_lock(&root->log_mutex);
3226	}
3227	finish_wait(wq_head: &root->log_writer_wait, wq_entry: &wait);
3228	}
3229
3230	void btrfs_init_log_ctx(struct btrfs_log_ctx ctx, struct* btrfs_inode *inode)
3231	{
3232	ctx->log_ret = `0`;
3233	ctx->log_transid = `0`;
3234	ctx->log_new_dentries = false;
3235	ctx->logging_new_name = false;
3236	ctx->logging_new_delayed_dentries = false;
3237	ctx->logged_before = false;
3238	ctx->inode = inode;
3239	INIT_LIST_HEAD(list: &ctx->list);
3240	INIT_LIST_HEAD(list: &ctx->ordered_extents);
3241	INIT_LIST_HEAD(list: &ctx->conflict_inodes);
3242	ctx->num_conflict_inodes = `0`;
3243	ctx->logging_conflict_inodes = false;
3244	ctx->scratch_eb = NULL;
3245	}
3246
3247	void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
3248	{
3249	struct btrfs_inode *inode = ctx->inode;
3250
3251	if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3252	!test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
3253	return;
3254
3255	/*
3256	* Don't care about allocation failure. This is just for optimization,
3257	* if we fail to allocate here, we will try again later if needed.
3258	*/
3259	ctx->scratch_eb = alloc_dummy_extent_buffer(fs_info: inode->root->fs_info, start: `0`);
3260	}
3261
3262	void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
3263	{
3264	struct btrfs_ordered_extent *ordered;
3265	struct btrfs_ordered_extent *tmp;
3266
3267	btrfs_assert_inode_locked(inode: ctx->inode);
3268
3269	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
3270	list_del_init(entry: &ordered->log_list);
3271	btrfs_put_ordered_extent(entry: ordered);
3272	}
3273	}
3274
3275
3276	static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3277	struct btrfs_log_ctx *ctx)
3278	{
3279	mutex_lock(&root->log_mutex);
3280	list_del_init(entry: &ctx->list);
3281	mutex_unlock(lock: &root->log_mutex);
3282	}
3283
3284	/*
3285	* Invoked in log mutex context, or be sure there is no other task which
3286	* can access the list.
3287	*/
3288	static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3289	int index, int error)
3290	{
3291	struct btrfs_log_ctx *ctx;
3292	struct btrfs_log_ctx *safe;
3293
3294	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3295	list_del_init(entry: &ctx->list);
3296	ctx->log_ret = error;
3297	}
3298	}
3299
3300	/*
3301	* Sends a given tree log down to the disk and updates the super blocks to
3302	* record it. When this call is done, you know that any inodes previously
3303	* logged are safely on disk only if it returns 0.
3304	*
3305	* Any other return value means you need to call btrfs_commit_transaction.
3306	* Some of the edge cases for fsyncing directories that have had unlinks
3307	* or renames done in the past mean that sometimes the only safe
3308	* fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
3309	* that has happened.
3310	*/
3311	int btrfs_sync_log(struct btrfs_trans_handle *trans,
3312	struct btrfs_root root, struct* btrfs_log_ctx *ctx)
3313	{
3314	int index1;
3315	int index2;
3316	int mark;
3317	int ret;
3318	struct btrfs_fs_info *fs_info = root->fs_info;
3319	struct btrfs_root *log = root->log_root;
3320	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3321	struct btrfs_root_item new_root_item;
3322	int log_transid = `0`;
3323	struct btrfs_log_ctx root_log_ctx;
3324	struct blk_plug plug;
3325	u64 log_root_start;
3326	u64 log_root_level;
3327
3328	mutex_lock(&root->log_mutex);
3329	log_transid = ctx->log_transid;
3330	if (root->log_transid_committed >= log_transid) {
3331	mutex_unlock(lock: &root->log_mutex);
3332	return ctx->log_ret;
3333	}
3334
3335	index1 = log_transid % `2`;
3336	if (atomic_read(v: &root->log_commit[index1])) {
3337	wait_log_commit(root, transid: log_transid);
3338	mutex_unlock(lock: &root->log_mutex);
3339	return ctx->log_ret;
3340	}
3341	ASSERT(log_transid == root->log_transid,
3342	"log_transid=%d root->log_transid=%d", log_transid, root->log_transid);
3343	atomic_set(v: &root->log_commit[index1], i: `1`);
3344
3345	/ wait for previous tree log sync to complete /
3346	if (atomic_read(v: &root->log_commit[(index1 + `1`) % `2`]))
3347	wait_log_commit(root, transid: log_transid - `1`);
3348
3349	while (`1`) {
3350	int batch = atomic_read(v: &root->log_batch);
3351	/ when we're on an ssd, just kick the log commit out /
3352	if (!btrfs_test_opt(fs_info, SSD) &&
3353	test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3354	mutex_unlock(lock: &root->log_mutex);
3355	schedule_timeout_uninterruptible(timeout: `1`);
3356	mutex_lock(&root->log_mutex);
3357	}
3358	wait_for_writer(root);
3359	if (batch == atomic_read(v: &root->log_batch))
3360	break;
3361	}
3362
3363	/ bail out if we need to do a full commit /
3364	if (btrfs_need_log_full_commit(trans)) {
3365	ret = BTRFS_LOG_FORCE_COMMIT;
3366	mutex_unlock(lock: &root->log_mutex);
3367	goto out;
3368	}
3369
3370	if (log_transid % `2` == `0`)
3371	mark = EXTENT_DIRTY_LOG1;
3372	else
3373	mark = EXTENT_DIRTY_LOG2;
3374
3375	/ we start IO on all the marked extents here, but we don't actually*
3376	* wait for them until later.
3377	*/
3378	blk_start_plug(&plug);
3379	ret = btrfs_write_marked_extents(fs_info, dirty_pages: &log->dirty_log_pages, mark);
3380	/*
3381	* -EAGAIN happens when someone, e.g., a concurrent transaction
3382	* commit, writes a dirty extent in this tree-log commit. This
3383	* concurrent write will create a hole writing out the extents,
3384	* and we cannot proceed on a zoned filesystem, requiring
3385	* sequential writing. While we can bail out to a full commit
3386	* here, but we can continue hoping the concurrent writing fills
3387	* the hole.
3388	*/
3389	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3390	ret = `0`;
3391	if (ret) {
3392	blk_finish_plug(&plug);
3393	btrfs_set_log_full_commit(trans);
3394	mutex_unlock(lock: &root->log_mutex);
3395	goto out;
3396	}
3397
3398	/*
3399	* We _must_ update under the root->log_mutex in order to make sure we
3400	* have a consistent view of the log root we are trying to commit at
3401	* this moment.
3402	*
3403	* We _must_ copy this into a local copy, because we are not holding the
3404	* log_root_tree->log_mutex yet. This is important because when we
3405	* commit the log_root_tree we must have a consistent view of the
3406	* log_root_tree when we update the super block to point at the
3407	* log_root_tree bytenr. If we update the log_root_tree here we'll race
3408	* with the commit and possibly point at the new block which we may not
3409	* have written out.
3410	*/
3411	btrfs_set_root_node(item: &log->root_item, node: log->node);
3412	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3413
3414	btrfs_set_root_log_transid(root, log_transid: root->log_transid + `1`);
3415	log->log_transid = root->log_transid;
3416	root->log_start_pid = `0`;
3417	/*
3418	* IO has been started, blocks of the log tree have WRITTEN flag set
3419	* in their headers. new modifications of the log will be written to
3420	* new positions. so it's safe to allow log writers to go in.
3421	*/
3422	mutex_unlock(lock: &root->log_mutex);
3423
3424	if (btrfs_is_zoned(fs_info)) {
3425	mutex_lock(&fs_info->tree_root->log_mutex);
3426	if (!log_root_tree->node) {
3427	ret = btrfs_alloc_log_tree_node(trans, root: log_root_tree);
3428	if (ret) {
3429	mutex_unlock(lock: &fs_info->tree_root->log_mutex);
3430	blk_finish_plug(&plug);
3431	goto out;
3432	}
3433	}
3434	mutex_unlock(lock: &fs_info->tree_root->log_mutex);
3435	}
3436
3437	btrfs_init_log_ctx(ctx: &root_log_ctx, NULL);
3438
3439	mutex_lock(&log_root_tree->log_mutex);
3440
3441	index2 = log_root_tree->log_transid % `2`;
3442	list_add_tail(new: &root_log_ctx.list, head: &log_root_tree->log_ctxs[index2]);
3443	root_log_ctx.log_transid = log_root_tree->log_transid;
3444
3445	/*
3446	* Now we are safe to update the log_root_tree because we're under the
3447	* log_mutex, and we're a current writer so we're holding the commit
3448	* open until we drop the log_mutex.
3449	*/
3450	ret = update_log_root(trans, log, root_item: &new_root_item);
3451	if (ret) {
3452	list_del_init(entry: &root_log_ctx.list);
3453	blk_finish_plug(&plug);
3454	btrfs_set_log_full_commit(trans);
3455	if (ret != -ENOSPC)
3456	btrfs_err(fs_info,
3457	"failed to update log for root %llu ret %d",
3458	btrfs_root_id(root), ret);
3459	btrfs_wait_tree_log_extents(root: log, mark);
3460	mutex_unlock(lock: &log_root_tree->log_mutex);
3461	goto out;
3462	}
3463
3464	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3465	blk_finish_plug(&plug);
3466	list_del_init(entry: &root_log_ctx.list);
3467	mutex_unlock(lock: &log_root_tree->log_mutex);
3468	ret = root_log_ctx.log_ret;
3469	goto out;
3470	}
3471
3472	if (atomic_read(v: &log_root_tree->log_commit[index2])) {
3473	blk_finish_plug(&plug);
3474	ret = btrfs_wait_tree_log_extents(root: log, mark);
3475	wait_log_commit(root: log_root_tree,
3476	transid: root_log_ctx.log_transid);
3477	mutex_unlock(lock: &log_root_tree->log_mutex);
3478	if (!ret)
3479	ret = root_log_ctx.log_ret;
3480	goto out;
3481	}
3482	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid,
3483	"root_log_ctx.log_transid=%d log_root_tree->log_transid=%d",
3484	root_log_ctx.log_transid, log_root_tree->log_transid);
3485	atomic_set(v: &log_root_tree->log_commit[index2], i: `1`);
3486
3487	if (atomic_read(v: &log_root_tree->log_commit[(index2 + `1`) % `2`])) {
3488	wait_log_commit(root: log_root_tree,
3489	transid: root_log_ctx.log_transid - `1`);
3490	}
3491
3492	/*
3493	* now that we've moved on to the tree of log tree roots,
3494	* check the full commit flag again
3495	*/
3496	if (btrfs_need_log_full_commit(trans)) {
3497	blk_finish_plug(&plug);
3498	btrfs_wait_tree_log_extents(root: log, mark);
3499	mutex_unlock(lock: &log_root_tree->log_mutex);
3500	ret = BTRFS_LOG_FORCE_COMMIT;
3501	goto out_wake_log_root;
3502	}
3503
3504	ret = btrfs_write_marked_extents(fs_info,
3505	dirty_pages: &log_root_tree->dirty_log_pages,
3506	mark: EXTENT_DIRTY_LOG1 \| EXTENT_DIRTY_LOG2);
3507	blk_finish_plug(&plug);
3508	/*
3509	* As described above, -EAGAIN indicates a hole in the extents. We
3510	* cannot wait for these write outs since the waiting cause a
3511	* deadlock. Bail out to the full commit instead.
3512	*/
3513	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3514	btrfs_set_log_full_commit(trans);
3515	btrfs_wait_tree_log_extents(root: log, mark);
3516	mutex_unlock(lock: &log_root_tree->log_mutex);
3517	goto out_wake_log_root;
3518	} else if (ret) {
3519	btrfs_set_log_full_commit(trans);
3520	mutex_unlock(lock: &log_root_tree->log_mutex);
3521	goto out_wake_log_root;
3522	}
3523	ret = btrfs_wait_tree_log_extents(root: log, mark);
3524	if (!ret)
3525	ret = btrfs_wait_tree_log_extents(root: log_root_tree,
3526	mark: EXTENT_DIRTY_LOG1 \| EXTENT_DIRTY_LOG2);
3527	if (ret) {
3528	btrfs_set_log_full_commit(trans);
3529	mutex_unlock(lock: &log_root_tree->log_mutex);
3530	goto out_wake_log_root;
3531	}
3532
3533	log_root_start = log_root_tree->node->start;
3534	log_root_level = btrfs_header_level(eb: log_root_tree->node);
3535	log_root_tree->log_transid++;
3536	mutex_unlock(lock: &log_root_tree->log_mutex);
3537
3538	/*
3539	* Here we are guaranteed that nobody is going to write the superblock
3540	* for the current transaction before us and that neither we do write
3541	* our superblock before the previous transaction finishes its commit
3542	* and writes its superblock, because:
3543	*
3544	* 1) We are holding a handle on the current transaction, so no body
3545	* can commit it until we release the handle;
3546	*
3547	* 2) Before writing our superblock we acquire the tree_log_mutex, so
3548	* if the previous transaction is still committing, and hasn't yet
3549	* written its superblock, we wait for it to do it, because a
3550	* transaction commit acquires the tree_log_mutex when the commit
3551	* begins and releases it only after writing its superblock.
3552	*/
3553	mutex_lock(&fs_info->tree_log_mutex);
3554
3555	/*
3556	* The previous transaction writeout phase could have failed, and thus
3557	* marked the fs in an error state. We must not commit here, as we
3558	* could have updated our generation in the super_for_commit and
3559	* writing the super here would result in transid mismatches. If there
3560	* is an error here just bail.
3561	*/
3562	if (BTRFS_FS_ERROR(fs_info)) {
3563	ret = -EIO;
3564	btrfs_set_log_full_commit(trans);
3565	btrfs_abort_transaction(trans, ret);
3566	mutex_unlock(lock: &fs_info->tree_log_mutex);
3567	goto out_wake_log_root;
3568	}
3569
3570	btrfs_set_super_log_root(s: fs_info->super_for_commit, val: log_root_start);
3571	btrfs_set_super_log_root_level(s: fs_info->super_for_commit, val: log_root_level);
3572	ret = write_all_supers(fs_info, max_mirrors: `1`);
3573	mutex_unlock(lock: &fs_info->tree_log_mutex);
3574	if (unlikely(ret)) {
3575	btrfs_set_log_full_commit(trans);
3576	btrfs_abort_transaction(trans, ret);
3577	goto out_wake_log_root;
3578	}
3579
3580	/*
3581	* We know there can only be one task here, since we have not yet set
3582	* root->log_commit[index1] to 0 and any task attempting to sync the
3583	* log must wait for the previous log transaction to commit if it's
3584	* still in progress or wait for the current log transaction commit if
3585	* someone else already started it. We use <= and not < because the
3586	* first log transaction has an ID of 0.
3587	*/
3588	ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid,
3589	"last_log_commit(root)=%d log_transid=%d",
3590	btrfs_get_root_last_log_commit(root), log_transid);
3591	btrfs_set_root_last_log_commit(root, commit_id: log_transid);
3592
3593	out_wake_log_root:
3594	mutex_lock(&log_root_tree->log_mutex);
3595	btrfs_remove_all_log_ctxs(root: log_root_tree, index: index2, error: ret);
3596
3597	log_root_tree->log_transid_committed++;
3598	atomic_set(v: &log_root_tree->log_commit[index2], i: `0`);
3599	mutex_unlock(lock: &log_root_tree->log_mutex);
3600
3601	/*
3602	* The barrier before waitqueue_active (in cond_wake_up) is needed so
3603	* all the updates above are seen by the woken threads. It might not be
3604	* necessary, but proving that seems to be hard.
3605	*/
3606	cond_wake_up(wq: &log_root_tree->log_commit_wait[index2]);
3607	out:
3608	mutex_lock(&root->log_mutex);
3609	btrfs_remove_all_log_ctxs(root, index: index1, error: ret);
3610	root->log_transid_committed++;
3611	atomic_set(v: &root->log_commit[index1], i: `0`);
3612	mutex_unlock(lock: &root->log_mutex);
3613
3614	/*
3615	* The barrier before waitqueue_active (in cond_wake_up) is needed so
3616	* all the updates above are seen by the woken threads. It might not be
3617	* necessary, but proving that seems to be hard.
3618	*/
3619	cond_wake_up(wq: &root->log_commit_wait[index1]);
3620	return ret;
3621	}
3622
3623	static void free_log_tree(struct btrfs_trans_handle *trans,
3624	struct btrfs_root *log)
3625	{
3626	int ret;
3627	struct walk_control wc = {
3628	.free = true,
3629	.process_func = process_one_buffer,
3630	.log = log,
3631	.trans = trans,
3632	};
3633
3634	if (log->node) {
3635	ret = walk_log_tree(wc: &wc);
3636	if (ret) {
3637	/*
3638	* We weren't able to traverse the entire log tree, the
3639	* typical scenario is getting an -EIO when reading an
3640	* extent buffer of the tree, due to a previous writeback
3641	* failure of it.
3642	*/
3643	set_bit(nr: BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3644	addr: &log->fs_info->fs_state);
3645
3646	/*
3647	* Some extent buffers of the log tree may still be dirty
3648	* and not yet written back to storage, because we may
3649	* have updates to a log tree without syncing a log tree,
3650	* such as during rename and link operations. So flush
3651	* them out and wait for their writeback to complete, so
3652	* that we properly cleanup their state and pages.
3653	*/
3654	btrfs_write_marked_extents(fs_info: log->fs_info,
3655	dirty_pages: &log->dirty_log_pages,
3656	mark: EXTENT_DIRTY_LOG1 \| EXTENT_DIRTY_LOG2);
3657	btrfs_wait_tree_log_extents(root: log,
3658	mark: EXTENT_DIRTY_LOG1 \| EXTENT_DIRTY_LOG2);
3659
3660	if (trans)
3661	btrfs_abort_transaction(trans, ret);
3662	else
3663	btrfs_handle_fs_error(log->fs_info, ret, NULL);
3664	}
3665	}
3666
3667	btrfs_extent_io_tree_release(tree: &log->dirty_log_pages);
3668	btrfs_extent_io_tree_release(tree: &log->log_csum_range);
3669
3670	btrfs_put_root(root: log);
3671	}
3672
3673	/*
3674	* free all the extents used by the tree log. This should be called
3675	* at commit time of the full transaction
3676	*/
3677	int btrfs_free_log(struct btrfs_trans_handle trans, struct* btrfs_root *root)
3678	{
3679	if (root->log_root) {
3680	free_log_tree(trans, log: root->log_root);
3681	root->log_root = NULL;
3682	clear_bit(nr: BTRFS_ROOT_HAS_LOG_TREE, addr: &root->state);
3683	}
3684	return `0`;
3685	}
3686
3687	int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3688	struct btrfs_fs_info *fs_info)
3689	{
3690	if (fs_info->log_root_tree) {
3691	free_log_tree(trans, log: fs_info->log_root_tree);
3692	fs_info->log_root_tree = NULL;
3693	clear_bit(nr: BTRFS_ROOT_HAS_LOG_TREE, addr: &fs_info->tree_root->state);
3694	}
3695	return `0`;
3696	}
3697
3698	static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
3699	struct btrfs_inode *inode)
3700	{
3701	bool ret = false;
3702
3703	/*
3704	* Do this only if ->logged_trans is still 0 to prevent races with
3705	* concurrent logging as we may see the inode not logged when
3706	* inode_logged() is called but it gets logged after inode_logged() did
3707	* not find it in the log tree and we end up setting ->logged_trans to a
3708	* value less than trans->transid after the concurrent logging task has
3709	* set it to trans->transid. As a consequence, subsequent rename, unlink
3710	* and link operations may end up not logging new names and removing old
3711	* names from the log.
3712	*/
3713	spin_lock(lock: &inode->lock);
3714	if (inode->logged_trans == `0`)
3715	inode->logged_trans = trans->transid - `1`;
3716	else if (inode->logged_trans == trans->transid)
3717	ret = true;
3718	spin_unlock(lock: &inode->lock);
3719
3720	return ret;
3721	}
3722
3723	/*
3724	* Check if an inode was logged in the current transaction. This correctly deals
3725	* with the case where the inode was logged but has a logged_trans of 0, which
3726	* happens if the inode is evicted and loaded again, as logged_trans is an in
3727	* memory only field (not persisted).
3728	*
3729	* Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3730	* and < 0 on error.
3731	*/
3732	static int inode_logged(const struct btrfs_trans_handle *trans,
3733	struct btrfs_inode *inode,
3734	struct btrfs_path *path_in)
3735	{
3736	struct btrfs_path *path = path_in;
3737	struct btrfs_key key;
3738	int ret;
3739
3740	/*
3741	* Quick lockless call, since once ->logged_trans is set to the current
3742	* transaction, we never set it to a lower value anywhere else.
3743	*/
3744	if (data_race(inode->logged_trans) == trans->transid)
3745	return `1`;
3746
3747	/*
3748	* If logged_trans is not 0 and not trans->transid, then we know the
3749	* inode was not logged in this transaction, so we can return false
3750	* right away. We take the lock to avoid a race caused by load/store
3751	* tearing with a concurrent btrfs_log_inode() call or a concurrent task
3752	* in this function further below - an update to trans->transid can be
3753	* teared into two 32 bits updates for example, in which case we could
3754	* see a positive value that is not trans->transid and assume the inode
3755	* was not logged when it was.
3756	*/
3757	spin_lock(lock: &inode->lock);
3758	if (inode->logged_trans == trans->transid) {
3759	spin_unlock(lock: &inode->lock);
3760	return `1`;
3761	} else if (inode->logged_trans > `0`) {
3762	spin_unlock(lock: &inode->lock);
3763	return `0`;
3764	}
3765	spin_unlock(lock: &inode->lock);
3766
3767	/*
3768	* If no log tree was created for this root in this transaction, then
3769	* the inode can not have been logged in this transaction. In that case
3770	* set logged_trans to anything greater than 0 and less than the current
3771	* transaction's ID, to avoid the search below in a future call in case
3772	* a log tree gets created after this.
3773	*/
3774	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
3775	return mark_inode_as_not_logged(trans, inode);
3776
3777	/*
3778	* We have a log tree and the inode's logged_trans is 0. We can't tell
3779	* for sure if the inode was logged before in this transaction by looking
3780	* only at logged_trans. We could be pessimistic and assume it was, but
3781	* that can lead to unnecessarily logging an inode during rename and link
3782	* operations, and then further updating the log in followup rename and
3783	* link operations, specially if it's a directory, which adds latency
3784	* visible to applications doing a series of rename or link operations.
3785	*
3786	* A logged_trans of 0 here can mean several things:
3787	*
3788	* 1) The inode was never logged since the filesystem was mounted, and may
3789	* or may have not been evicted and loaded again;
3790	*
3791	* 2) The inode was logged in a previous transaction, then evicted and
3792	* then loaded again;
3793	*
3794	* 3) The inode was logged in the current transaction, then evicted and
3795	* then loaded again.
3796	*
3797	* For cases 1) and 2) we don't want to return true, but we need to detect
3798	* case 3) and return true. So we do a search in the log root for the inode
3799	* item.
3800	*/
3801	key.objectid = btrfs_ino(inode);
3802	key.type = BTRFS_INODE_ITEM_KEY;
3803	key.offset = `0`;
3804
3805	if (!path) {
3806	path = btrfs_alloc_path();
3807	if (!path)
3808	return -ENOMEM;
3809	}
3810
3811	ret = btrfs_search_slot(NULL, root: inode->root->log_root, key: &key, p: path, ins_len: `0`, cow: `0`);
3812
3813	if (path_in)
3814	btrfs_release_path(p: path);
3815	else
3816	btrfs_free_path(p: path);
3817
3818	/*
3819	* Logging an inode always results in logging its inode item. So if we
3820	* did not find the item we know the inode was not logged for sure.
3821	*/
3822	if (ret < `0`) {
3823	return ret;
3824	} else if (ret > `0`) {
3825	/*
3826	* Set logged_trans to a value greater than 0 and less then the
3827	* current transaction to avoid doing the search in future calls.
3828	*/
3829	return mark_inode_as_not_logged(trans, inode);
3830	}
3831
3832	/*
3833	* The inode was previously logged and then evicted, set logged_trans to
3834	* the current transaction's ID, to avoid future tree searches as long as
3835	* the inode is not evicted again.
3836	*/
3837	spin_lock(lock: &inode->lock);
3838	inode->logged_trans = trans->transid;
3839	spin_unlock(lock: &inode->lock);
3840
3841	return `1`;
3842	}
3843
3844	/*
3845	* Delete a directory entry from the log if it exists.
3846	*
3847	* Returns < 0 on error
3848	* 1 if the entry does not exists
3849	* 0 if the entry existed and was successfully deleted
3850	*/
3851	static int del_logged_dentry(struct btrfs_trans_handle *trans,
3852	struct btrfs_root *log,
3853	struct btrfs_path *path,
3854	u64 dir_ino,
3855	const struct fscrypt_str *name,
3856	u64 index)
3857	{
3858	struct btrfs_dir_item *di;
3859
3860	/*
3861	* We only log dir index items of a directory, so we don't need to look
3862	* for dir item keys.
3863	*/
3864	di = btrfs_lookup_dir_index_item(trans, root: log, path, dir: dir_ino,
3865	index, name, mod: -`1`);
3866	if (IS_ERR(ptr: di))
3867	return PTR_ERR(ptr: di);
3868	else if (!di)
3869	return `1`;
3870
3871	/*
3872	* We do not need to update the size field of the directory's
3873	* inode item because on log replay we update the field to reflect
3874	* all existing entries in the directory (see overwrite_item()).
3875	*/
3876	return btrfs_del_item(trans, root: log, path);
3877	}
3878
3879	/*
3880	* If both a file and directory are logged, and unlinks or renames are
3881	* mixed in, we have a few interesting corners:
3882	*
3883	* create file X in dir Y
3884	* link file X to X.link in dir Y
3885	* fsync file X
3886	* unlink file X but leave X.link
3887	* fsync dir Y
3888	*
3889	* After a crash we would expect only X.link to exist. But file X
3890	* didn't get fsync'd again so the log has back refs for X and X.link.
3891	*
3892	* We solve this by removing directory entries and inode backrefs from the
3893	* log when a file that was logged in the current transaction is
3894	* unlinked. Any later fsync will include the updated log entries, and
3895	* we'll be able to reconstruct the proper directory items from backrefs.
3896	*
3897	* This optimizations allows us to avoid relogging the entire inode
3898	* or the entire directory.
3899	*/
3900	void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3901	const struct fscrypt_str *name,
3902	struct btrfs_inode *dir, u64 index)
3903	{
3904	struct btrfs_root *root = dir->root;
3905	BTRFS_PATH_AUTO_FREE(path);
3906	int ret;
3907
3908	ret = inode_logged(trans, inode: dir, NULL);
3909	if (ret == `0`)
3910	return;
3911	if (ret < `0`) {
3912	btrfs_set_log_full_commit(trans);
3913	return;
3914	}
3915
3916	path = btrfs_alloc_path();
3917	if (!path) {
3918	btrfs_set_log_full_commit(trans);
3919	return;
3920	}
3921
3922	ret = join_running_log_trans(root);
3923	ASSERT(ret == `0`, "join_running_log_trans() ret=%d", ret);
3924	if (WARN_ON(ret))
3925	return;
3926
3927	mutex_lock(&dir->log_mutex);
3928
3929	ret = del_logged_dentry(trans, log: root->log_root, path, dir_ino: btrfs_ino(inode: dir),
3930	name, index);
3931	mutex_unlock(lock: &dir->log_mutex);
3932	if (ret < `0`)
3933	btrfs_set_log_full_commit(trans);
3934	btrfs_end_log_trans(root);
3935	}
3936
3937	/ see comments for btrfs_del_dir_entries_in_log /
3938	void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3939	const struct fscrypt_str *name,
3940	struct btrfs_inode *inode,
3941	struct btrfs_inode *dir)
3942	{
3943	struct btrfs_root *root = dir->root;
3944	int ret;
3945
3946	ret = inode_logged(trans, inode, NULL);
3947	if (ret == `0`)
3948	return;
3949	else if (ret < `0`) {
3950	btrfs_set_log_full_commit(trans);
3951	return;
3952	}
3953
3954	ret = join_running_log_trans(root);
3955	ASSERT(ret == `0`, "join_running_log_trans() ret=%d", ret);
3956	if (WARN_ON(ret))
3957	return;
3958	mutex_lock(&inode->log_mutex);
3959
3960	ret = btrfs_del_inode_ref(trans, root: root->log_root, name, inode_objectid: btrfs_ino(inode),
3961	ref_objectid: btrfs_ino(inode: dir), NULL);
3962	mutex_unlock(lock: &inode->log_mutex);
3963	if (ret < `0` && ret != -ENOENT)
3964	btrfs_set_log_full_commit(trans);
3965	btrfs_end_log_trans(root);
3966	}
3967
3968	/*
3969	* creates a range item in the log for 'dirid'. first_offset and
3970	* last_offset tell us which parts of the key space the log should
3971	* be considered authoritative for.
3972	*/
3973	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3974	struct btrfs_root *log,
3975	struct btrfs_path *path,
3976	u64 dirid,
3977	u64 first_offset, u64 last_offset)
3978	{
3979	int ret;
3980	struct btrfs_key key;
3981	struct btrfs_dir_log_item *item;
3982
3983	key.objectid = dirid;
3984	key.type = BTRFS_DIR_LOG_INDEX_KEY;
3985	key.offset = first_offset;
3986	ret = btrfs_insert_empty_item(trans, root: log, path, key: &key, data_size: sizeof(*item));
3987	/*
3988	* -EEXIST is fine and can happen sporadically when we are logging a
3989	* directory and have concurrent insertions in the subvolume's tree for
3990	* items from other inodes and that result in pushing off some dir items
3991	* from one leaf to another in order to accommodate for the new items.
3992	* This results in logging the same dir index range key.
3993	*/
3994	if (ret && ret != -EEXIST)
3995	return ret;
3996
3997	item = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
3998	struct btrfs_dir_log_item);
3999	if (ret == -EEXIST) {
4000	const u64 curr_end = btrfs_dir_log_end(eb: path->nodes[`0`], s: item);
4001
4002	/*
4003	* btrfs_del_dir_entries_in_log() might have been called during
4004	* an unlink between the initial insertion of this key and the
4005	* current update, or we might be logging a single entry deletion
4006	* during a rename, so set the new last_offset to the max value.
4007	*/
4008	last_offset = max(last_offset, curr_end);
4009	}
4010	btrfs_set_dir_log_end(eb: path->nodes[`0`], s: item, val: last_offset);
4011	btrfs_release_path(p: path);
4012	return `0`;
4013	}
4014
4015	static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
4016	struct btrfs_inode *inode,
4017	struct extent_buffer *src,
4018	struct btrfs_path *dst_path,
4019	int start_slot,
4020	int count)
4021	{
4022	struct btrfs_root *log = inode->root->log_root;
4023	char AUTO_KFREE(ins_data);
4024	struct btrfs_item_batch batch;
4025	struct extent_buffer *dst;
4026	unsigned long src_offset;
4027	unsigned long dst_offset;
4028	u64 last_index;
4029	struct btrfs_key key;
4030	u32 item_size;
4031	int ret;
4032	int i;
4033
4034	ASSERT(count > `0`, "count=%d", count);
4035	batch.nr = count;
4036
4037	if (count == `1`) {
4038	btrfs_item_key_to_cpu(eb: src, cpu_key: &key, nr: start_slot);
4039	item_size = btrfs_item_size(eb: src, slot: start_slot);
4040	batch.keys = &key;
4041	batch.data_sizes = &item_size;
4042	batch.total_data_size = item_size;
4043	} else {
4044	struct btrfs_key *ins_keys;
4045	u32 *ins_sizes;
4046
4047	ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
4048	if (!ins_data)
4049	return -ENOMEM;
4050
4051	ins_sizes = (u32 *)ins_data;
4052	ins_keys = (struct btrfs_key )(ins_data + count sizeof(u32));
4053	batch.keys = ins_keys;
4054	batch.data_sizes = ins_sizes;
4055	batch.total_data_size = `0`;
4056
4057	for (i = `0`; i < count; i++) {
4058	const int slot = start_slot + i;
4059
4060	btrfs_item_key_to_cpu(eb: src, cpu_key: &ins_keys[i], nr: slot);
4061	ins_sizes[i] = btrfs_item_size(eb: src, slot);
4062	batch.total_data_size += ins_sizes[i];
4063	}
4064	}
4065
4066	ret = btrfs_insert_empty_items(trans, root: log, path: dst_path, batch: &batch);
4067	if (ret)
4068	return ret;
4069
4070	dst = dst_path->nodes[`0`];
4071	/*
4072	* Copy all the items in bulk, in a single copy operation. Item data is
4073	* organized such that it's placed at the end of a leaf and from right
4074	* to left. For example, the data for the second item ends at an offset
4075	* that matches the offset where the data for the first item starts, the
4076	* data for the third item ends at an offset that matches the offset
4077	* where the data of the second items starts, and so on.
4078	* Therefore our source and destination start offsets for copy match the
4079	* offsets of the last items (highest slots).
4080	*/
4081	dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[`0`] + count - `1`);
4082	src_offset = btrfs_item_ptr_offset(src, start_slot + count - `1`);
4083	copy_extent_buffer(dst, src, dst_offset, src_offset, len: batch.total_data_size);
4084	btrfs_release_path(p: dst_path);
4085
4086	last_index = batch.keys[count - `1`].offset;
4087	ASSERT(last_index > inode->last_dir_index_offset,
4088	"last_index=%llu inode->last_dir_index_offset=%llu",
4089	last_index, inode->last_dir_index_offset);
4090
4091	/*
4092	* If for some unexpected reason the last item's index is not greater
4093	* than the last index we logged, warn and force a transaction commit.
4094	*/
4095	if (WARN_ON(last_index <= inode->last_dir_index_offset))
4096	ret = BTRFS_LOG_FORCE_COMMIT;
4097	else
4098	inode->last_dir_index_offset = last_index;
4099
4100	if (btrfs_get_first_dir_index_to_log(inode) == `0`)
4101	btrfs_set_first_dir_index_to_log(inode, index: batch.keys[`0`].offset);
4102
4103	return ret;
4104	}
4105
4106	static int clone_leaf(struct btrfs_path path, struct* btrfs_log_ctx *ctx)
4107	{
4108	const int slot = path->slots[`0`];
4109
4110	if (ctx->scratch_eb) {
4111	copy_extent_buffer_full(dst: ctx->scratch_eb, src: path->nodes[`0`]);
4112	} else {
4113	ctx->scratch_eb = btrfs_clone_extent_buffer(src: path->nodes[`0`]);
4114	if (!ctx->scratch_eb)
4115	return -ENOMEM;
4116	}
4117
4118	btrfs_release_path(p: path);
4119	path->nodes[`0`] = ctx->scratch_eb;
4120	path->slots[`0`] = slot;
4121	/*
4122	* Add extra ref to scratch eb so that it is not freed when callers
4123	* release the path, so we can reuse it later if needed.
4124	*/
4125	refcount_inc(r: &ctx->scratch_eb->refs);
4126
4127	return `0`;
4128	}
4129
4130	static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
4131	struct btrfs_inode *inode,
4132	struct btrfs_path *path,
4133	struct btrfs_path *dst_path,
4134	struct btrfs_log_ctx *ctx,
4135	u64 *last_old_dentry_offset)
4136	{
4137	struct btrfs_root *log = inode->root->log_root;
4138	struct extent_buffer *src;
4139	const int nritems = btrfs_header_nritems(eb: path->nodes[`0`]);
4140	const u64 ino = btrfs_ino(inode);
4141	bool last_found = false;
4142	int batch_start = `0`;
4143	int batch_size = `0`;
4144	int ret;
4145
4146	/*
4147	* We need to clone the leaf, release the read lock on it, and use the
4148	* clone before modifying the log tree. See the comment at copy_items()
4149	* about why we need to do this.
4150	*/
4151	ret = clone_leaf(path, ctx);
4152	if (ret < `0`)
4153	return ret;
4154
4155	src = path->nodes[`0`];
4156
4157	for (int i = path->slots[`0`]; i < nritems; i++) {
4158	struct btrfs_dir_item *di;
4159	struct btrfs_key key;
4160
4161	btrfs_item_key_to_cpu(eb: src, cpu_key: &key, nr: i);
4162
4163	if (key.objectid != ino \|\| key.type != BTRFS_DIR_INDEX_KEY) {
4164	last_found = true;
4165	break;
4166	}
4167
4168	di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
4169
4170	/*
4171	* Skip ranges of items that consist only of dir item keys created
4172	* in past transactions. However if we find a gap, we must log a
4173	* dir index range item for that gap, so that index keys in that
4174	* gap are deleted during log replay.
4175	*/
4176	if (btrfs_dir_transid(eb: src, s: di) < trans->transid) {
4177	if (key.offset > *last_old_dentry_offset + `1`) {
4178	ret = insert_dir_log_key(trans, log, path: dst_path,
4179	dirid: ino, first_offset: *last_old_dentry_offset + `1`,
4180	last_offset: key.offset - `1`);
4181	if (ret < `0`)
4182	return ret;
4183	}
4184
4185	*last_old_dentry_offset = key.offset;
4186	continue;
4187	}
4188
4189	/ If we logged this dir index item before, we can skip it. /
4190	if (key.offset <= inode->last_dir_index_offset)
4191	continue;
4192
4193	/*
4194	* We must make sure that when we log a directory entry, the
4195	* corresponding inode, after log replay, has a matching link
4196	* count. For example:
4197	*
4198	* touch foo
4199	* mkdir mydir
4200	* sync
4201	* ln foo mydir/bar
4202	* xfs_io -c "fsync" mydir
4203	* <crash>
4204	* <mount fs and log replay>
4205	*
4206	* Would result in a fsync log that when replayed, our file inode
4207	* would have a link count of 1, but we get two directory entries
4208	* pointing to the same inode. After removing one of the names,
4209	* it would not be possible to remove the other name, which
4210	* resulted always in stale file handle errors, and would not be
4211	* possible to rmdir the parent directory, since its i_size could
4212	* never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
4213	* resulting in -ENOTEMPTY errors.
4214	*/
4215	if (!ctx->log_new_dentries) {
4216	struct btrfs_key di_key;
4217
4218	btrfs_dir_item_key_to_cpu(eb: src, item: di, cpu_key: &di_key);
4219	if (di_key.type != BTRFS_ROOT_ITEM_KEY)
4220	ctx->log_new_dentries = true;
4221	}
4222
4223	if (batch_size == `0`)
4224	batch_start = i;
4225	batch_size++;
4226	}
4227
4228	if (batch_size > `0`) {
4229	ret = flush_dir_items_batch(trans, inode, src, dst_path,
4230	start_slot: batch_start, count: batch_size);
4231	if (ret < `0`)
4232	return ret;
4233	}
4234
4235	return last_found ? `1` : `0`;
4236	}
4237
4238	/*
4239	* log all the items included in the current transaction for a given
4240	* directory. This also creates the range items in the log tree required
4241	* to replay anything deleted before the fsync
4242	*/
4243	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
4244	struct btrfs_inode *inode,
4245	struct btrfs_path *path,
4246	struct btrfs_path *dst_path,
4247	struct btrfs_log_ctx *ctx,
4248	u64 min_offset, u64 *last_offset_ret)
4249	{
4250	struct btrfs_key min_key;
4251	struct btrfs_root *root = inode->root;
4252	struct btrfs_root *log = root->log_root;
4253	int ret;
4254	u64 last_old_dentry_offset = min_offset - `1`;
4255	u64 last_offset = (u64)-`1`;
4256	u64 ino = btrfs_ino(inode);
4257
4258	min_key.objectid = ino;
4259	min_key.type = BTRFS_DIR_INDEX_KEY;
4260	min_key.offset = min_offset;
4261
4262	ret = btrfs_search_forward(root, min_key: &min_key, path, min_trans: trans->transid);
4263
4264	/*
4265	* we didn't find anything from this transaction, see if there
4266	* is anything at all
4267	*/
4268	if (ret != `0` \|\| min_key.objectid != ino \|\|
4269	min_key.type != BTRFS_DIR_INDEX_KEY) {
4270	min_key.objectid = ino;
4271	min_key.type = BTRFS_DIR_INDEX_KEY;
4272	min_key.offset = (u64)-`1`;
4273	btrfs_release_path(p: path);
4274	ret = btrfs_search_slot(NULL, root, key: &min_key, p: path, ins_len: `0`, cow: `0`);
4275	if (ret < `0`) {
4276	btrfs_release_path(p: path);
4277	return ret;
4278	}
4279	ret = btrfs_previous_item(root, path, min_objectid: ino, BTRFS_DIR_INDEX_KEY);
4280
4281	/ if ret == 0 there are items for this type,*
4282	* create a range to tell us the last key of this type.
4283	* otherwise, there are no items in this directory after
4284	* *min_offset, and we create a range to indicate that.
4285	*/
4286	if (ret == `0`) {
4287	struct btrfs_key tmp;
4288
4289	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &tmp,
4290	nr: path->slots[`0`]);
4291	if (tmp.type == BTRFS_DIR_INDEX_KEY)
4292	last_old_dentry_offset = tmp.offset;
4293	} else if (ret > `0`) {
4294	ret = `0`;
4295	}
4296
4297	goto done;
4298	}
4299
4300	/ go backward to find any previous key /
4301	ret = btrfs_previous_item(root, path, min_objectid: ino, BTRFS_DIR_INDEX_KEY);
4302	if (ret == `0`) {
4303	struct btrfs_key tmp;
4304
4305	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &tmp, nr: path->slots[`0`]);
4306	/*
4307	* The dir index key before the first one we found that needs to
4308	* be logged might be in a previous leaf, and there might be a
4309	* gap between these keys, meaning that we had deletions that
4310	* happened. So the key range item we log (key type
4311	* BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
4312	* previous key's offset plus 1, so that those deletes are replayed.
4313	*/
4314	if (tmp.type == BTRFS_DIR_INDEX_KEY)
4315	last_old_dentry_offset = tmp.offset;
4316	} else if (ret < `0`) {
4317	goto done;
4318	}
4319
4320	btrfs_release_path(p: path);
4321
4322	/*
4323	* Find the first key from this transaction again or the one we were at
4324	* in the loop below in case we had to reschedule. We may be logging the
4325	* directory without holding its VFS lock, which happen when logging new
4326	* dentries (through log_new_dir_dentries()) or in some cases when we
4327	* need to log the parent directory of an inode. This means a dir index
4328	* key might be deleted from the inode's root, and therefore we may not
4329	* find it anymore. If we can't find it, just move to the next key. We
4330	* can not bail out and ignore, because if we do that we will simply
4331	* not log dir index keys that come after the one that was just deleted
4332	* and we can end up logging a dir index range that ends at (u64)-1
4333	* (@last_offset is initialized to that), resulting in removing dir
4334	* entries we should not remove at log replay time.
4335	*/
4336	search:
4337	ret = btrfs_search_slot(NULL, root, key: &min_key, p: path, ins_len: `0`, cow: `0`);
4338	if (ret > `0`) {
4339	ret = btrfs_next_item(root, p: path);
4340	if (ret > `0`) {
4341	/ There are no more keys in the inode's root. /
4342	ret = `0`;
4343	goto done;
4344	}
4345	}
4346	if (ret < `0`)
4347	goto done;
4348
4349	/*
4350	* we have a block from this transaction, log every item in it
4351	* from our directory
4352	*/
4353	while (`1`) {
4354	ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
4355	last_old_dentry_offset: &last_old_dentry_offset);
4356	if (ret != `0`) {
4357	if (ret > `0`)
4358	ret = `0`;
4359	goto done;
4360	}
4361	path->slots[`0`] = btrfs_header_nritems(eb: path->nodes[`0`]);
4362
4363	/*
4364	* look ahead to the next item and see if it is also
4365	* from this directory and from this transaction
4366	*/
4367	ret = btrfs_next_leaf(root, path);
4368	if (ret) {
4369	if (ret == `1`) {
4370	last_offset = (u64)-`1`;
4371	ret = `0`;
4372	}
4373	goto done;
4374	}
4375	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &min_key, nr: path->slots[`0`]);
4376	if (min_key.objectid != ino \|\| min_key.type != BTRFS_DIR_INDEX_KEY) {
4377	last_offset = (u64)-`1`;
4378	goto done;
4379	}
4380	if (btrfs_header_generation(eb: path->nodes[`0`]) != trans->transid) {
4381	/*
4382	* The next leaf was not changed in the current transaction
4383	* and has at least one dir index key.
4384	* We check for the next key because there might have been
4385	* one or more deletions between the last key we logged and
4386	* that next key. So the key range item we log (key type
4387	* BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
4388	* offset minus 1, so that those deletes are replayed.
4389	*/
4390	last_offset = min_key.offset - `1`;
4391	goto done;
4392	}
4393	if (need_resched()) {
4394	btrfs_release_path(p: path);
4395	cond_resched();
4396	goto search;
4397	}
4398	}
4399	done:
4400	btrfs_release_path(p: path);
4401	btrfs_release_path(p: dst_path);
4402
4403	if (ret == `0`) {
4404	*last_offset_ret = last_offset;
4405	/*
4406	* In case the leaf was changed in the current transaction but
4407	* all its dir items are from a past transaction, the last item
4408	* in the leaf is a dir item and there's no gap between that last
4409	* dir item and the first one on the next leaf (which did not
4410	* change in the current transaction), then we don't need to log
4411	* a range, last_old_dentry_offset is == to last_offset.
4412	*/
4413	ASSERT(last_old_dentry_offset <= last_offset,
4414	"last_old_dentry_offset=%llu last_offset=%llu",
4415	last_old_dentry_offset, last_offset);
4416	if (last_old_dentry_offset < last_offset)
4417	ret = insert_dir_log_key(trans, log, path, dirid: ino,
4418	first_offset: last_old_dentry_offset + `1`,
4419	last_offset);
4420	}
4421
4422	return ret;
4423	}
4424
4425	/*
4426	* If the inode was logged before and it was evicted, then its
4427	* last_dir_index_offset is 0, so we don't know the value of the last index
4428	* key offset. If that's the case, search for it and update the inode. This
4429	* is to avoid lookups in the log tree every time we try to insert a dir index
4430	* key from a leaf changed in the current transaction, and to allow us to always
4431	* do batch insertions of dir index keys.
4432	*/
4433	static int update_last_dir_index_offset(struct btrfs_inode *inode,
4434	struct btrfs_path *path,
4435	const struct btrfs_log_ctx *ctx)
4436	{
4437	const u64 ino = btrfs_ino(inode);
4438	struct btrfs_key key;
4439	int ret;
4440
4441	lockdep_assert_held(&inode->log_mutex);
4442
4443	if (inode->last_dir_index_offset != `0`)
4444	return `0`;
4445
4446	if (!ctx->logged_before) {
4447	inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - `1`;
4448	return `0`;
4449	}
4450
4451	key.objectid = ino;
4452	key.type = BTRFS_DIR_INDEX_KEY;
4453	key.offset = (u64)-`1`;
4454
4455	ret = btrfs_search_slot(NULL, root: inode->root->log_root, key: &key, p: path, ins_len: `0`, cow: `0`);
4456	/*
4457	* An error happened or we actually have an index key with an offset
4458	* value of (u64)-1. Bail out, we're done.
4459	*/
4460	if (ret <= `0`)
4461	goto out;
4462
4463	ret = `0`;
4464	inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - `1`;
4465
4466	/*
4467	* No dir index items, bail out and leave last_dir_index_offset with
4468	* the value right before the first valid index value.
4469	*/
4470	if (path->slots[`0`] == `0`)
4471	goto out;
4472
4473	/*
4474	* btrfs_search_slot() left us at one slot beyond the slot with the last
4475	* index key, or beyond the last key of the directory that is not an
4476	* index key. If we have an index key before, set last_dir_index_offset
4477	* to its offset value, otherwise leave it with a value right before the
4478	* first valid index value, as it means we have an empty directory.
4479	*/
4480	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`] - `1`);
4481	if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4482	inode->last_dir_index_offset = key.offset;
4483
4484	out:
4485	btrfs_release_path(p: path);
4486
4487	return ret;
4488	}
4489
4490	/*
4491	* logging directories is very similar to logging inodes, We find all the items
4492	* from the current transaction and write them to the log.
4493	*
4494	* The recovery code scans the directory in the subvolume, and if it finds a
4495	* key in the range logged that is not present in the log tree, then it means
4496	* that dir entry was unlinked during the transaction.
4497	*
4498	* In order for that scan to work, we must include one key smaller than
4499	* the smallest logged by this transaction and one key larger than the largest
4500	* key logged by this transaction.
4501	*/
4502	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4503	struct btrfs_inode *inode,
4504	struct btrfs_path *path,
4505	struct btrfs_path *dst_path,
4506	struct btrfs_log_ctx *ctx)
4507	{
4508	u64 min_key;
4509	u64 max_key;
4510	int ret;
4511
4512	ret = update_last_dir_index_offset(inode, path, ctx);
4513	if (ret)
4514	return ret;
4515
4516	min_key = BTRFS_DIR_START_INDEX;
4517	max_key = `0`;
4518
4519	while (`1`) {
4520	ret = log_dir_items(trans, inode, path, dst_path,
4521	ctx, min_offset: min_key, last_offset_ret: &max_key);
4522	if (ret)
4523	return ret;
4524	if (max_key == (u64)-`1`)
4525	break;
4526	min_key = max_key + `1`;
4527	}
4528
4529	return `0`;
4530	}
4531
4532	/*
4533	* a helper function to drop items from the log before we relog an
4534	* inode. max_key_type indicates the highest item type to remove.
4535	* This cannot be run for file data extents because it does not
4536	* free the extents they point to.
4537	*/
4538	static int drop_inode_items(struct btrfs_trans_handle *trans,
4539	struct btrfs_root *log,
4540	struct btrfs_path *path,
4541	struct btrfs_inode *inode,
4542	int max_key_type)
4543	{
4544	int ret;
4545	struct btrfs_key key;
4546	struct btrfs_key found_key;
4547	int start_slot;
4548
4549	key.objectid = btrfs_ino(inode);
4550	key.type = max_key_type;
4551	key.offset = (u64)-`1`;
4552
4553	while (`1`) {
4554	ret = btrfs_search_slot(trans, root: log, key: &key, p: path, ins_len: -`1`, cow: `1`);
4555	if (ret < `0`) {
4556	break;
4557	} else if (ret > `0`) {
4558	if (path->slots[`0`] == `0`)
4559	break;
4560	path->slots[`0`]--;
4561	}
4562
4563	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &found_key,
4564	nr: path->slots[`0`]);
4565
4566	if (found_key.objectid != key.objectid)
4567	break;
4568
4569	found_key.offset = `0`;
4570	found_key.type = `0`;
4571	ret = btrfs_bin_search(eb: path->nodes[`0`], first_slot: `0`, key: &found_key, slot: &start_slot);
4572	if (ret < `0`)
4573	break;
4574
4575	ret = btrfs_del_items(trans, root: log, path, slot: start_slot,
4576	nr: path->slots[`0`] - start_slot + `1`);
4577	/*
4578	* If start slot isn't 0 then we don't need to re-search, we've
4579	* found the last guy with the objectid in this tree.
4580	*/
4581	if (ret \|\| start_slot != `0`)
4582	break;
4583	btrfs_release_path(p: path);
4584	}
4585	btrfs_release_path(p: path);
4586	if (ret > `0`)
4587	ret = `0`;
4588	return ret;
4589	}
4590
4591	static int truncate_inode_items(struct btrfs_trans_handle *trans,
4592	struct btrfs_root *log_root,
4593	struct btrfs_inode *inode,
4594	u64 new_size, u32 min_type)
4595	{
4596	struct btrfs_truncate_control control = {
4597	.new_size = new_size,
4598	.ino = btrfs_ino(inode),
4599	.min_type = min_type,
4600	.skip_ref_updates = true,
4601	};
4602
4603	return btrfs_truncate_inode_items(trans, root: log_root, control: &control);
4604	}
4605
4606	static void fill_inode_item(struct btrfs_trans_handle *trans,
4607	struct extent_buffer *leaf,
4608	struct btrfs_inode_item *item,
4609	struct inode *inode, bool log_inode_only,
4610	u64 logged_isize)
4611	{
4612	u64 flags;
4613
4614	if (log_inode_only) {
4615	/ set the generation to zero so the recover code*
4616	* can tell the difference between an logging
4617	* just to say 'this inode exists' and a logging
4618	* to say 'update this inode with these values'
4619	*/
4620	btrfs_set_inode_generation(eb: leaf, s: item, val: `0`);
4621	btrfs_set_inode_size(eb: leaf, s: item, val: logged_isize);
4622	} else {
4623	btrfs_set_inode_generation(eb: leaf, s: item, BTRFS_I(inode)->generation);
4624	btrfs_set_inode_size(eb: leaf, s: item, val: inode->i_size);
4625	}
4626
4627	btrfs_set_inode_uid(eb: leaf, s: item, val: i_uid_read(inode));
4628	btrfs_set_inode_gid(eb: leaf, s: item, val: i_gid_read(inode));
4629	btrfs_set_inode_mode(eb: leaf, s: item, val: inode->i_mode);
4630	btrfs_set_inode_nlink(eb: leaf, s: item, val: inode->i_nlink);
4631
4632	btrfs_set_timespec_sec(eb: leaf, s: &item->atime, val: inode_get_atime_sec(inode));
4633	btrfs_set_timespec_nsec(eb: leaf, s: &item->atime, val: inode_get_atime_nsec(inode));
4634
4635	btrfs_set_timespec_sec(eb: leaf, s: &item->mtime, val: inode_get_mtime_sec(inode));
4636	btrfs_set_timespec_nsec(eb: leaf, s: &item->mtime, val: inode_get_mtime_nsec(inode));
4637
4638	btrfs_set_timespec_sec(eb: leaf, s: &item->ctime, val: inode_get_ctime_sec(inode));
4639	btrfs_set_timespec_nsec(eb: leaf, s: &item->ctime, val: inode_get_ctime_nsec(inode));
4640
4641	btrfs_set_timespec_sec(eb: leaf, s: &item->otime, BTRFS_I(inode)->i_otime_sec);
4642	btrfs_set_timespec_nsec(eb: leaf, s: &item->otime, BTRFS_I(inode)->i_otime_nsec);
4643
4644	/*
4645	* We do not need to set the nbytes field, in fact during a fast fsync
4646	* its value may not even be correct, since a fast fsync does not wait
4647	* for ordered extent completion, which is where we update nbytes, it
4648	* only waits for writeback to complete. During log replay as we find
4649	* file extent items and replay them, we adjust the nbytes field of the
4650	* inode item in subvolume tree as needed (see overwrite_item()).
4651	*/
4652
4653	btrfs_set_inode_sequence(eb: leaf, s: item, val: inode_peek_iversion(inode));
4654	btrfs_set_inode_transid(eb: leaf, s: item, val: trans->transid);
4655	btrfs_set_inode_rdev(eb: leaf, s: item, val: inode->i_rdev);
4656	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4657	BTRFS_I(inode)->ro_flags);
4658	btrfs_set_inode_flags(eb: leaf, s: item, val: flags);
4659	btrfs_set_inode_block_group(eb: leaf, s: item, val: `0`);
4660	}
4661
4662	static int log_inode_item(struct btrfs_trans_handle *trans,
4663	struct btrfs_root log, struct* btrfs_path *path,
4664	struct btrfs_inode *inode, bool inode_item_dropped)
4665	{
4666	struct btrfs_inode_item *inode_item;
4667	struct btrfs_key key;
4668	int ret;
4669
4670	btrfs_get_inode_key(inode, key: &key);
4671	/*
4672	* If we are doing a fast fsync and the inode was logged before in the
4673	* current transaction, then we know the inode was previously logged and
4674	* it exists in the log tree. For performance reasons, in this case use
4675	* btrfs_search_slot() directly with ins_len set to 0 so that we never
4676	* attempt a write lock on the leaf's parent, which adds unnecessary lock
4677	* contention in case there are concurrent fsyncs for other inodes of the
4678	* same subvolume. Using btrfs_insert_empty_item() when the inode item
4679	* already exists can also result in unnecessarily splitting a leaf.
4680	*/
4681	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4682	ret = btrfs_search_slot(trans, root: log, key: &key, p: path, ins_len: `0`, cow: `1`);
4683	ASSERT(ret <= `0`);
4684	if (ret > `0`)
4685	ret = -ENOENT;
4686	} else {
4687	/*
4688	* This means it is the first fsync in the current transaction,
4689	* so the inode item is not in the log and we need to insert it.
4690	* We can never get -EEXIST because we are only called for a fast
4691	* fsync and in case an inode eviction happens after the inode was
4692	* logged before in the current transaction, when we load again
4693	* the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4694	* flags and set ->logged_trans to 0.
4695	*/
4696	ret = btrfs_insert_empty_item(trans, root: log, path, key: &key,
4697	data_size: sizeof(*inode_item));
4698	ASSERT(ret != -EEXIST);
4699	}
4700	if (ret)
4701	return ret;
4702	inode_item = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
4703	struct btrfs_inode_item);
4704	fill_inode_item(trans, leaf: path->nodes[`0`], item: inode_item, inode: &inode->vfs_inode,
4705	log_inode_only: false, logged_isize: `0`);
4706	btrfs_release_path(p: path);
4707	return `0`;
4708	}
4709
4710	static int log_csums(struct btrfs_trans_handle *trans,
4711	struct btrfs_inode *inode,
4712	struct btrfs_root *log_root,
4713	struct btrfs_ordered_sum *sums)
4714	{
4715	const u64 lock_end = sums->logical + sums->len - `1`;
4716	struct extent_state *cached_state = NULL;
4717	int ret;
4718
4719	/*
4720	* If this inode was not used for reflink operations in the current
4721	* transaction with new extents, then do the fast path, no need to
4722	* worry about logging checksum items with overlapping ranges.
4723	*/
4724	if (inode->last_reflink_trans < trans->transid)
4725	return btrfs_csum_file_blocks(trans, root: log_root, sums);
4726
4727	/*
4728	* Serialize logging for checksums. This is to avoid racing with the
4729	* same checksum being logged by another task that is logging another
4730	* file which happens to refer to the same extent as well. Such races
4731	* can leave checksum items in the log with overlapping ranges.
4732	*/
4733	ret = btrfs_lock_extent(tree: &log_root->log_csum_range, start: sums->logical, end: lock_end,
4734	cached: &cached_state);
4735	if (ret)
4736	return ret;
4737	/*
4738	* Due to extent cloning, we might have logged a csum item that covers a
4739	* subrange of a cloned extent, and later we can end up logging a csum
4740	* item for a larger subrange of the same extent or the entire range.
4741	* This would leave csum items in the log tree that cover the same range
4742	* and break the searches for checksums in the log tree, resulting in
4743	* some checksums missing in the fs/subvolume tree. So just delete (or
4744	* trim and adjust) any existing csum items in the log for this range.
4745	*/
4746	ret = btrfs_del_csums(trans, root: log_root, bytenr: sums->logical, len: sums->len);
4747	if (!ret)
4748	ret = btrfs_csum_file_blocks(trans, root: log_root, sums);
4749
4750	btrfs_unlock_extent(tree: &log_root->log_csum_range, start: sums->logical, end: lock_end,
4751	cached: &cached_state);
4752
4753	return ret;
4754	}
4755
4756	static noinline int copy_items(struct btrfs_trans_handle *trans,
4757	struct btrfs_inode *inode,
4758	struct btrfs_path *dst_path,
4759	struct btrfs_path *src_path,
4760	int start_slot, int nr, int inode_only,
4761	u64 logged_isize, struct btrfs_log_ctx *ctx)
4762	{
4763	struct btrfs_root *log = inode->root->log_root;
4764	struct btrfs_file_extent_item *extent;
4765	struct extent_buffer *src;
4766	int ret;
4767	struct btrfs_key *ins_keys;
4768	u32 *ins_sizes;
4769	struct btrfs_item_batch batch;
4770	char AUTO_KFREE(ins_data);
4771	int dst_index;
4772	const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4773	const u64 i_size = i_size_read(inode: &inode->vfs_inode);
4774
4775	/*
4776	* To keep lockdep happy and avoid deadlocks, clone the source leaf and
4777	* use the clone. This is because otherwise we would be changing the log
4778	* tree, to insert items from the subvolume tree or insert csum items,
4779	* while holding a read lock on a leaf from the subvolume tree, which
4780	* creates a nasty lock dependency when COWing log tree nodes/leaves:
4781	*
4782	* 1) Modifying the log tree triggers an extent buffer allocation while
4783	* holding a write lock on a parent extent buffer from the log tree.
4784	* Allocating the pages for an extent buffer, or the extent buffer
4785	* struct, can trigger inode eviction and finally the inode eviction
4786	* will trigger a release/remove of a delayed node, which requires
4787	* taking the delayed node's mutex;
4788	*
4789	* 2) Allocating a metadata extent for a log tree can trigger the async
4790	* reclaim thread and make us wait for it to release enough space and
4791	* unblock our reservation ticket. The reclaim thread can start
4792	* flushing delayed items, and that in turn results in the need to
4793	* lock delayed node mutexes and in the need to write lock extent
4794	* buffers of a subvolume tree - all this while holding a write lock
4795	* on the parent extent buffer in the log tree.
4796	*
4797	* So one task in scenario 1) running in parallel with another task in
4798	* scenario 2) could lead to a deadlock, one wanting to lock a delayed
4799	* node mutex while having a read lock on a leaf from the subvolume,
4800	* while the other is holding the delayed node's mutex and wants to
4801	* write lock the same subvolume leaf for flushing delayed items.
4802	*/
4803	ret = clone_leaf(path: src_path, ctx);
4804	if (ret < `0`)
4805	return ret;
4806
4807	src = src_path->nodes[`0`];
4808
4809	ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
4810	if (!ins_data)
4811	return -ENOMEM;
4812
4813	ins_sizes = (u32 *)ins_data;
4814	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
4815	batch.keys = ins_keys;
4816	batch.data_sizes = ins_sizes;
4817	batch.total_data_size = `0`;
4818	batch.nr = `0`;
4819
4820	dst_index = `0`;
4821	for (int i = `0`; i < nr; i++) {
4822	const int src_slot = start_slot + i;
4823	struct btrfs_root *csum_root;
4824	struct btrfs_ordered_sum *sums;
4825	struct btrfs_ordered_sum *sums_next;
4826	LIST_HEAD(ordered_sums);
4827	u64 disk_bytenr;
4828	u64 disk_num_bytes;
4829	u64 extent_offset;
4830	u64 extent_num_bytes;
4831	bool is_old_extent;
4832
4833	btrfs_item_key_to_cpu(eb: src, cpu_key: &ins_keys[dst_index], nr: src_slot);
4834
4835	if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4836	goto add_to_batch;
4837
4838	extent = btrfs_item_ptr(src, src_slot,
4839	struct btrfs_file_extent_item);
4840
4841	is_old_extent = (btrfs_file_extent_generation(eb: src, s: extent) <
4842	trans->transid);
4843
4844	/*
4845	* Don't copy extents from past generations. That would make us
4846	* log a lot more metadata for common cases like doing only a
4847	* few random writes into a file and then fsync it for the first
4848	* time or after the full sync flag is set on the inode. We can
4849	* get leaves full of extent items, most of which are from past
4850	* generations, so we can skip them - as long as the inode has
4851	* not been the target of a reflink operation in this transaction,
4852	* as in that case it might have had file extent items with old
4853	* generations copied into it. We also must always log prealloc
4854	* extents that start at or beyond eof, otherwise we would lose
4855	* them on log replay.
4856	*/
4857	if (is_old_extent &&
4858	ins_keys[dst_index].offset < i_size &&
4859	inode->last_reflink_trans < trans->transid)
4860	continue;
4861
4862	if (skip_csum)
4863	goto add_to_batch;
4864
4865	/ Only regular extents have checksums. /
4866	if (btrfs_file_extent_type(eb: src, s: extent) != BTRFS_FILE_EXTENT_REG)
4867	goto add_to_batch;
4868
4869	/*
4870	* If it's an extent created in a past transaction, then its
4871	* checksums are already accessible from the committed csum tree,
4872	* no need to log them.
4873	*/
4874	if (is_old_extent)
4875	goto add_to_batch;
4876
4877	disk_bytenr = btrfs_file_extent_disk_bytenr(eb: src, s: extent);
4878	/ If it's an explicit hole, there are no checksums. /
4879	if (disk_bytenr == `0`)
4880	goto add_to_batch;
4881
4882	disk_num_bytes = btrfs_file_extent_disk_num_bytes(eb: src, s: extent);
4883
4884	if (btrfs_file_extent_compression(eb: src, s: extent)) {
4885	extent_offset = `0`;
4886	extent_num_bytes = disk_num_bytes;
4887	} else {
4888	extent_offset = btrfs_file_extent_offset(eb: src, s: extent);
4889	extent_num_bytes = btrfs_file_extent_num_bytes(eb: src, s: extent);
4890	}
4891
4892	csum_root = btrfs_csum_root(fs_info: trans->fs_info, bytenr: disk_bytenr);
4893	disk_bytenr += extent_offset;
4894	ret = btrfs_lookup_csums_list(root: csum_root, start: disk_bytenr,
4895	end: disk_bytenr + extent_num_bytes - `1`,
4896	list: &ordered_sums, nowait: false);
4897	if (ret < `0`)
4898	return ret;
4899	ret = `0`;
4900
4901	list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4902	if (!ret)
4903	ret = log_csums(trans, inode, log_root: log, sums);
4904	list_del(entry: &sums->list);
4905	kfree(objp: sums);
4906	}
4907	if (ret)
4908	return ret;
4909
4910	add_to_batch:
4911	ins_sizes[dst_index] = btrfs_item_size(eb: src, slot: src_slot);
4912	batch.total_data_size += ins_sizes[dst_index];
4913	batch.nr++;
4914	dst_index++;
4915	}
4916
4917	/*
4918	* We have a leaf full of old extent items that don't need to be logged,
4919	* so we don't need to do anything.
4920	*/
4921	if (batch.nr == `0`)
4922	return `0`;
4923
4924	ret = btrfs_insert_empty_items(trans, root: log, path: dst_path, batch: &batch);
4925	if (ret)
4926	return ret;
4927
4928	dst_index = `0`;
4929	for (int i = `0`; i < nr; i++) {
4930	const int src_slot = start_slot + i;
4931	const int dst_slot = dst_path->slots[`0`] + dst_index;
4932	struct btrfs_key key;
4933	unsigned long src_offset;
4934	unsigned long dst_offset;
4935
4936	/*
4937	* We're done, all the remaining items in the source leaf
4938	* correspond to old file extent items.
4939	*/
4940	if (dst_index >= batch.nr)
4941	break;
4942
4943	btrfs_item_key_to_cpu(eb: src, cpu_key: &key, nr: src_slot);
4944
4945	if (key.type != BTRFS_EXTENT_DATA_KEY)
4946	goto copy_item;
4947
4948	extent = btrfs_item_ptr(src, src_slot,
4949	struct btrfs_file_extent_item);
4950
4951	/ See the comment in the previous loop, same logic. /
4952	if (btrfs_file_extent_generation(eb: src, s: extent) < trans->transid &&
4953	key.offset < i_size &&
4954	inode->last_reflink_trans < trans->transid)
4955	continue;
4956
4957	copy_item:
4958	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[`0`], dst_slot);
4959	src_offset = btrfs_item_ptr_offset(src, src_slot);
4960
4961	if (key.type == BTRFS_INODE_ITEM_KEY) {
4962	struct btrfs_inode_item *inode_item;
4963
4964	inode_item = btrfs_item_ptr(dst_path->nodes[`0`], dst_slot,
4965	struct btrfs_inode_item);
4966	fill_inode_item(trans, leaf: dst_path->nodes[`0`], item: inode_item,
4967	inode: &inode->vfs_inode,
4968	log_inode_only: inode_only == LOG_INODE_EXISTS,
4969	logged_isize);
4970	} else {
4971	copy_extent_buffer(dst: dst_path->nodes[`0`], src, dst_offset,
4972	src_offset, len: ins_sizes[dst_index]);
4973	}
4974
4975	dst_index++;
4976	}
4977
4978	btrfs_release_path(p: dst_path);
4979
4980	return ret;
4981	}
4982
4983	static int extent_cmp(void priv, const* struct list_head *a,
4984	const struct list_head *b)
4985	{
4986	const struct extent_map em1, em2;
4987
4988	em1 = list_entry(a, struct extent_map, list);
4989	em2 = list_entry(b, struct extent_map, list);
4990
4991	if (em1->start < em2->start)
4992	return -`1`;
4993	else if (em1->start > em2->start)
4994	return `1`;
4995	return `0`;
4996	}
4997
4998	static int log_extent_csums(struct btrfs_trans_handle *trans,
4999	struct btrfs_inode *inode,
5000	struct btrfs_root *log_root,
5001	const struct extent_map *em,
5002	struct btrfs_log_ctx *ctx)
5003	{
5004	struct btrfs_ordered_extent *ordered;
5005	struct btrfs_root *csum_root;
5006	u64 block_start;
5007	u64 csum_offset;
5008	u64 csum_len;
5009	u64 mod_start = em->start;
5010	u64 mod_len = em->len;
5011	LIST_HEAD(ordered_sums);
5012	int ret = `0`;
5013
5014	if (inode->flags & BTRFS_INODE_NODATASUM \|\|
5015	(em->flags & EXTENT_FLAG_PREALLOC) \|\|
5016	em->disk_bytenr == EXTENT_MAP_HOLE)
5017	return `0`;
5018
5019	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
5020	const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
5021	const u64 mod_end = mod_start + mod_len;
5022	struct btrfs_ordered_sum *sums;
5023
5024	if (mod_len == `0`)
5025	break;
5026
5027	if (ordered_end <= mod_start)
5028	continue;
5029	if (mod_end <= ordered->file_offset)
5030	break;
5031
5032	/*
5033	* We are going to copy all the csums on this ordered extent, so
5034	* go ahead and adjust mod_start and mod_len in case this ordered
5035	* extent has already been logged.
5036	*/
5037	if (ordered->file_offset > mod_start) {
5038	if (ordered_end >= mod_end)
5039	mod_len = ordered->file_offset - mod_start;
5040	/*
5041	* If we have this case
5042	*
5043	* \|--------- logged extent ---------\|
5044	* \|----- ordered extent ----\|
5045	*
5046	* Just don't mess with mod_start and mod_len, we'll
5047	* just end up logging more csums than we need and it
5048	* will be ok.
5049	*/
5050	} else {
5051	if (ordered_end < mod_end) {
5052	mod_len = mod_end - ordered_end;
5053	mod_start = ordered_end;
5054	} else {
5055	mod_len = `0`;
5056	}
5057	}
5058
5059	/*
5060	* To keep us from looping for the above case of an ordered
5061	* extent that falls inside of the logged extent.
5062	*/
5063	if (test_and_set_bit(nr: BTRFS_ORDERED_LOGGED_CSUM, addr: &ordered->flags))
5064	continue;
5065
5066	list_for_each_entry(sums, &ordered->list, list) {
5067	ret = log_csums(trans, inode, log_root, sums);
5068	if (ret)
5069	return ret;
5070	}
5071	}
5072
5073	/ We're done, found all csums in the ordered extents. /
5074	if (mod_len == `0`)
5075	return `0`;
5076
5077	/ If we're compressed we have to save the entire range of csums. /
5078	if (btrfs_extent_map_is_compressed(em)) {
5079	csum_offset = `0`;
5080	csum_len = em->disk_num_bytes;
5081	} else {
5082	csum_offset = mod_start - em->start;
5083	csum_len = mod_len;
5084	}
5085
5086	/ block start is already adjusted for the file extent offset. /
5087	block_start = btrfs_extent_map_block_start(em);
5088	csum_root = btrfs_csum_root(fs_info: trans->fs_info, bytenr: block_start);
5089	ret = btrfs_lookup_csums_list(root: csum_root, start: block_start + csum_offset,
5090	end: block_start + csum_offset + csum_len - `1`,
5091	list: &ordered_sums, nowait: false);
5092	if (ret < `0`)
5093	return ret;
5094	ret = `0`;
5095
5096	while (!list_empty(head: &ordered_sums)) {
5097	struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
5098	struct btrfs_ordered_sum,
5099	list);
5100	if (!ret)
5101	ret = log_csums(trans, inode, log_root, sums);
5102	list_del(entry: &sums->list);
5103	kfree(objp: sums);
5104	}
5105
5106	return ret;
5107	}
5108
5109	static int log_one_extent(struct btrfs_trans_handle *trans,
5110	struct btrfs_inode *inode,
5111	const struct extent_map *em,
5112	struct btrfs_path *path,
5113	struct btrfs_log_ctx *ctx)
5114	{
5115	struct btrfs_drop_extents_args drop_args = { `0` };
5116	struct btrfs_root *log = inode->root->log_root;
5117	struct btrfs_file_extent_item fi = { `0` };
5118	struct extent_buffer *leaf;
5119	struct btrfs_key key;
5120	enum btrfs_compression_type compress_type;
5121	u64 extent_offset = em->offset;
5122	u64 block_start = btrfs_extent_map_block_start(em);
5123	u64 block_len;
5124	int ret;
5125
5126	btrfs_set_stack_file_extent_generation(s: &fi, val: trans->transid);
5127	if (em->flags & EXTENT_FLAG_PREALLOC)
5128	btrfs_set_stack_file_extent_type(s: &fi, val: BTRFS_FILE_EXTENT_PREALLOC);
5129	else
5130	btrfs_set_stack_file_extent_type(s: &fi, val: BTRFS_FILE_EXTENT_REG);
5131
5132	block_len = em->disk_num_bytes;
5133	compress_type = btrfs_extent_map_compression(em);
5134	if (compress_type != BTRFS_COMPRESS_NONE) {
5135	btrfs_set_stack_file_extent_disk_bytenr(s: &fi, val: block_start);
5136	btrfs_set_stack_file_extent_disk_num_bytes(s: &fi, val: block_len);
5137	} else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
5138	btrfs_set_stack_file_extent_disk_bytenr(s: &fi, val: block_start - extent_offset);
5139	btrfs_set_stack_file_extent_disk_num_bytes(s: &fi, val: block_len);
5140	}
5141
5142	btrfs_set_stack_file_extent_offset(s: &fi, val: extent_offset);
5143	btrfs_set_stack_file_extent_num_bytes(s: &fi, val: em->len);
5144	btrfs_set_stack_file_extent_ram_bytes(s: &fi, val: em->ram_bytes);
5145	btrfs_set_stack_file_extent_compression(s: &fi, val: compress_type);
5146
5147	ret = log_extent_csums(trans, inode, log_root: log, em, ctx);
5148	if (ret)
5149	return ret;
5150
5151	/*
5152	* If this is the first time we are logging the inode in the current
5153	* transaction, we can avoid btrfs_drop_extents(), which is expensive
5154	* because it does a deletion search, which always acquires write locks
5155	* for extent buffers at levels 2, 1 and 0. This not only wastes time
5156	* but also adds significant contention in a log tree, since log trees
5157	* are small, with a root at level 2 or 3 at most, due to their short
5158	* life span.
5159	*/
5160	if (ctx->logged_before) {
5161	drop_args.path = path;
5162	drop_args.start = em->start;
5163	drop_args.end = em->start + em->len;
5164	drop_args.replace_extent = true;
5165	drop_args.extent_item_size = sizeof(fi);
5166	ret = btrfs_drop_extents(trans, root: log, inode, args: &drop_args);
5167	if (ret)
5168	return ret;
5169	}
5170
5171	if (!drop_args.extent_inserted) {
5172	key.objectid = btrfs_ino(inode);
5173	key.type = BTRFS_EXTENT_DATA_KEY;
5174	key.offset = em->start;
5175
5176	ret = btrfs_insert_empty_item(trans, root: log, path, key: &key,
5177	data_size: sizeof(fi));
5178	if (ret)
5179	return ret;
5180	}
5181	leaf = path->nodes[`0`];
5182	write_extent_buffer(eb: leaf, src: &fi,
5183	btrfs_item_ptr_offset(leaf, path->slots[`0`]),
5184	len: sizeof(fi));
5185
5186	btrfs_release_path(p: path);
5187
5188	return ret;
5189	}
5190
5191	/*
5192	* Log all prealloc extents beyond the inode's i_size to make sure we do not
5193	* lose them after doing a full/fast fsync and replaying the log. We scan the
5194	* subvolume's root instead of iterating the inode's extent map tree because
5195	* otherwise we can log incorrect extent items based on extent map conversion.
5196	* That can happen due to the fact that extent maps are merged when they
5197	* are not in the extent map tree's list of modified extents.
5198	*/
5199	static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
5200	struct btrfs_inode *inode,
5201	struct btrfs_path *path,
5202	struct btrfs_log_ctx *ctx)
5203	{
5204	struct btrfs_root *root = inode->root;
5205	struct btrfs_key key;
5206	const u64 i_size = i_size_read(inode: &inode->vfs_inode);
5207	const u64 ino = btrfs_ino(inode);
5208	BTRFS_PATH_AUTO_FREE(dst_path);
5209	bool dropped_extents = false;
5210	u64 truncate_offset = i_size;
5211	struct extent_buffer *leaf;
5212	int slot;
5213	int ins_nr = `0`;
5214	int start_slot = `0`;
5215	int ret;
5216
5217	if (!(inode->flags & BTRFS_INODE_PREALLOC))
5218	return `0`;
5219
5220	key.objectid = ino;
5221	key.type = BTRFS_EXTENT_DATA_KEY;
5222	key.offset = i_size;
5223	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
5224	if (ret < `0`)
5225	goto out;
5226
5227	/*
5228	* We must check if there is a prealloc extent that starts before the
5229	* i_size and crosses the i_size boundary. This is to ensure later we
5230	* truncate down to the end of that extent and not to the i_size, as
5231	* otherwise we end up losing part of the prealloc extent after a log
5232	* replay and with an implicit hole if there is another prealloc extent
5233	* that starts at an offset beyond i_size.
5234	*/
5235	ret = btrfs_previous_item(root, path, min_objectid: ino, BTRFS_EXTENT_DATA_KEY);
5236	if (ret < `0`)
5237	goto out;
5238
5239	if (ret == `0`) {
5240	struct btrfs_file_extent_item *ei;
5241
5242	leaf = path->nodes[`0`];
5243	slot = path->slots[`0`];
5244	ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5245
5246	if (btrfs_file_extent_type(eb: leaf, s: ei) ==
5247	BTRFS_FILE_EXTENT_PREALLOC) {
5248	u64 extent_end;
5249
5250	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
5251	extent_end = key.offset +
5252	btrfs_file_extent_num_bytes(eb: leaf, s: ei);
5253
5254	if (extent_end > i_size)
5255	truncate_offset = extent_end;
5256	}
5257	} else {
5258	ret = `0`;
5259	}
5260
5261	while (true) {
5262	leaf = path->nodes[`0`];
5263	slot = path->slots[`0`];
5264
5265	if (slot >= btrfs_header_nritems(eb: leaf)) {
5266	if (ins_nr > `0`) {
5267	ret = copy_items(trans, inode, dst_path, src_path: path,
5268	start_slot, nr: ins_nr, inode_only: `1`, logged_isize: `0`, ctx);
5269	if (ret < `0`)
5270	goto out;
5271	ins_nr = `0`;
5272	}
5273	ret = btrfs_next_leaf(root, path);
5274	if (ret < `0`)
5275	goto out;
5276	if (ret > `0`) {
5277	ret = `0`;
5278	break;
5279	}
5280	continue;
5281	}
5282
5283	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
5284	if (key.objectid > ino)
5285	break;
5286	if (WARN_ON_ONCE(key.objectid < ino) \|\|
5287	key.type < BTRFS_EXTENT_DATA_KEY \|\|
5288	key.offset < i_size) {
5289	path->slots[`0`]++;
5290	continue;
5291	}
5292	/*
5293	* Avoid overlapping items in the log tree. The first time we
5294	* get here, get rid of everything from a past fsync. After
5295	* that, if the current extent starts before the end of the last
5296	* extent we copied, truncate the last one. This can happen if
5297	* an ordered extent completion modifies the subvolume tree
5298	* while btrfs_next_leaf() has the tree unlocked.
5299	*/
5300	if (!dropped_extents \|\| key.offset < truncate_offset) {
5301	ret = truncate_inode_items(trans, log_root: root->log_root, inode,
5302	min(key.offset, truncate_offset),
5303	BTRFS_EXTENT_DATA_KEY);
5304	if (ret)
5305	goto out;
5306	dropped_extents = true;
5307	}
5308	truncate_offset = btrfs_file_extent_end(path);
5309	if (ins_nr == `0`)
5310	start_slot = slot;
5311	ins_nr++;
5312	path->slots[`0`]++;
5313	if (!dst_path) {
5314	dst_path = btrfs_alloc_path();
5315	if (!dst_path) {
5316	ret = -ENOMEM;
5317	goto out;
5318	}
5319	}
5320	}
5321	if (ins_nr > `0`)
5322	ret = copy_items(trans, inode, dst_path, src_path: path,
5323	start_slot, nr: ins_nr, inode_only: `1`, logged_isize: `0`, ctx);
5324	out:
5325	btrfs_release_path(p: path);
5326	return ret;
5327	}
5328
5329	static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
5330	struct btrfs_inode *inode,
5331	struct btrfs_path *path,
5332	struct btrfs_log_ctx *ctx)
5333	{
5334	struct btrfs_ordered_extent *ordered;
5335	struct btrfs_ordered_extent *tmp;
5336	struct extent_map em, n;
5337	LIST_HEAD(extents);
5338	struct extent_map_tree *tree = &inode->extent_tree;
5339	int ret = `0`;
5340	int num = `0`;
5341
5342	write_lock(&tree->lock);
5343
5344	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
5345	list_del_init(entry: &em->list);
5346	/*
5347	* Just an arbitrary number, this can be really CPU intensive
5348	* once we start getting a lot of extents, and really once we
5349	* have a bunch of extents we just want to commit since it will
5350	* be faster.
5351	*/
5352	if (++num > `32768`) {
5353	list_del_init(entry: &tree->modified_extents);
5354	ret = -EFBIG;
5355	goto process;
5356	}
5357
5358	if (em->generation < trans->transid)
5359	continue;
5360
5361	/ We log prealloc extents beyond eof later. /
5362	if ((em->flags & EXTENT_FLAG_PREALLOC) &&
5363	em->start >= i_size_read(inode: &inode->vfs_inode))
5364	continue;
5365
5366	/ Need a ref to keep it from getting evicted from cache /
5367	refcount_inc(r: &em->refs);
5368	em->flags \|= EXTENT_FLAG_LOGGING;
5369	list_add_tail(new: &em->list, head: &extents);
5370	num++;
5371	}
5372
5373	list_sort(NULL, head: &extents, cmp: extent_cmp);
5374	process:
5375	while (!list_empty(head: &extents)) {
5376	em = list_first_entry(&extents, struct extent_map, list);
5377
5378	list_del_init(entry: &em->list);
5379
5380	/*
5381	* If we had an error we just need to delete everybody from our
5382	* private list.
5383	*/
5384	if (ret) {
5385	btrfs_clear_em_logging(inode, em);
5386	btrfs_free_extent_map(em);
5387	continue;
5388	}
5389
5390	write_unlock(&tree->lock);
5391
5392	ret = log_one_extent(trans, inode, em, path, ctx);
5393	write_lock(&tree->lock);
5394	btrfs_clear_em_logging(inode, em);
5395	btrfs_free_extent_map(em);
5396	}
5397	WARN_ON(!list_empty(&extents));
5398	write_unlock(&tree->lock);
5399
5400	if (!ret)
5401	ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
5402	if (ret)
5403	return ret;
5404
5405	/*
5406	* We have logged all extents successfully, now make sure the commit of
5407	* the current transaction waits for the ordered extents to complete
5408	* before it commits and wipes out the log trees, otherwise we would
5409	* lose data if an ordered extents completes after the transaction
5410	* commits and a power failure happens after the transaction commit.
5411	*/
5412	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
5413	list_del_init(entry: &ordered->log_list);
5414	set_bit(nr: BTRFS_ORDERED_LOGGED, addr: &ordered->flags);
5415
5416	if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5417	spin_lock(lock: &inode->ordered_tree_lock);
5418	if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5419	set_bit(nr: BTRFS_ORDERED_PENDING, addr: &ordered->flags);
5420	atomic_inc(v: &trans->transaction->pending_ordered);
5421	}
5422	spin_unlock(lock: &inode->ordered_tree_lock);
5423	}
5424	btrfs_put_ordered_extent(entry: ordered);
5425	}
5426
5427	return `0`;
5428	}
5429
5430	static int logged_inode_size(struct btrfs_root log, struct* btrfs_inode *inode,
5431	struct btrfs_path path, u64 size_ret)
5432	{
5433	struct btrfs_key key;
5434	int ret;
5435
5436	key.objectid = btrfs_ino(inode);
5437	key.type = BTRFS_INODE_ITEM_KEY;
5438	key.offset = `0`;
5439
5440	ret = btrfs_search_slot(NULL, root: log, key: &key, p: path, ins_len: `0`, cow: `0`);
5441	if (ret < `0`) {
5442	return ret;
5443	} else if (ret > `0`) {
5444	*size_ret = `0`;
5445	} else {
5446	struct btrfs_inode_item *item;
5447
5448	item = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
5449	struct btrfs_inode_item);
5450	*size_ret = btrfs_inode_size(eb: path->nodes[`0`], s: item);
5451	/*
5452	* If the in-memory inode's i_size is smaller then the inode
5453	* size stored in the btree, return the inode's i_size, so
5454	* that we get a correct inode size after replaying the log
5455	* when before a power failure we had a shrinking truncate
5456	* followed by addition of a new name (rename / new hard link).
5457	* Otherwise return the inode size from the btree, to avoid
5458	* data loss when replaying a log due to previously doing a
5459	* write that expands the inode's size and logging a new name
5460	* immediately after.
5461	*/
5462	if (*size_ret > inode->vfs_inode.i_size)
5463	*size_ret = inode->vfs_inode.i_size;
5464	}
5465
5466	btrfs_release_path(p: path);
5467	return `0`;
5468	}
5469
5470	/*
5471	* At the moment we always log all xattrs. This is to figure out at log replay
5472	* time which xattrs must have their deletion replayed. If a xattr is missing
5473	* in the log tree and exists in the fs/subvol tree, we delete it. This is
5474	* because if a xattr is deleted, the inode is fsynced and a power failure
5475	* happens, causing the log to be replayed the next time the fs is mounted,
5476	* we want the xattr to not exist anymore (same behaviour as other filesystems
5477	* with a journal, ext3/4, xfs, f2fs, etc).
5478	*/
5479	static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5480	struct btrfs_inode *inode,
5481	struct btrfs_path *path,
5482	struct btrfs_path *dst_path,
5483	struct btrfs_log_ctx *ctx)
5484	{
5485	struct btrfs_root *root = inode->root;
5486	int ret;
5487	struct btrfs_key key;
5488	const u64 ino = btrfs_ino(inode);
5489	int ins_nr = `0`;
5490	int start_slot = `0`;
5491	bool found_xattrs = false;
5492
5493	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5494	return `0`;
5495
5496	key.objectid = ino;
5497	key.type = BTRFS_XATTR_ITEM_KEY;
5498	key.offset = `0`;
5499
5500	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
5501	if (ret < `0`)
5502	return ret;
5503
5504	while (true) {
5505	int slot = path->slots[`0`];
5506	struct extent_buffer *leaf = path->nodes[`0`];
5507	int nritems = btrfs_header_nritems(eb: leaf);
5508
5509	if (slot >= nritems) {
5510	if (ins_nr > `0`) {
5511	ret = copy_items(trans, inode, dst_path, src_path: path,
5512	start_slot, nr: ins_nr, inode_only: `1`, logged_isize: `0`, ctx);
5513	if (ret < `0`)
5514	return ret;
5515	ins_nr = `0`;
5516	}
5517	ret = btrfs_next_leaf(root, path);
5518	if (ret < `0`)
5519	return ret;
5520	else if (ret > `0`)
5521	break;
5522	continue;
5523	}
5524
5525	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
5526	if (key.objectid != ino \|\| key.type != BTRFS_XATTR_ITEM_KEY)
5527	break;
5528
5529	if (ins_nr == `0`)
5530	start_slot = slot;
5531	ins_nr++;
5532	path->slots[`0`]++;
5533	found_xattrs = true;
5534	cond_resched();
5535	}
5536	if (ins_nr > `0`) {
5537	ret = copy_items(trans, inode, dst_path, src_path: path,
5538	start_slot, nr: ins_nr, inode_only: `1`, logged_isize: `0`, ctx);
5539	if (ret < `0`)
5540	return ret;
5541	}
5542
5543	if (!found_xattrs)
5544	set_bit(nr: BTRFS_INODE_NO_XATTRS, addr: &inode->runtime_flags);
5545
5546	return `0`;
5547	}
5548
5549	/*
5550	* When using the NO_HOLES feature if we punched a hole that causes the
5551	* deletion of entire leafs or all the extent items of the first leaf (the one
5552	* that contains the inode item and references) we may end up not processing
5553	* any extents, because there are no leafs with a generation matching the
5554	* current transaction that have extent items for our inode. So we need to find
5555	* if any holes exist and then log them. We also need to log holes after any
5556	* truncate operation that changes the inode's size.
5557	*/
5558	static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5559	struct btrfs_inode *inode,
5560	struct btrfs_path *path)
5561	{
5562	struct btrfs_root *root = inode->root;
5563	struct btrfs_fs_info *fs_info = root->fs_info;
5564	struct btrfs_key key;
5565	const u64 ino = btrfs_ino(inode);
5566	const u64 i_size = i_size_read(inode: &inode->vfs_inode);
5567	u64 prev_extent_end = `0`;
5568	int ret;
5569
5570	if (!btrfs_fs_incompat(fs_info, NO_HOLES) \|\| i_size == `0`)
5571	return `0`;
5572
5573	key.objectid = ino;
5574	key.type = BTRFS_EXTENT_DATA_KEY;
5575	key.offset = `0`;
5576
5577	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
5578	if (ret < `0`)
5579	return ret;
5580
5581	while (true) {
5582	struct extent_buffer *leaf = path->nodes[`0`];
5583
5584	if (path->slots[`0`] >= btrfs_header_nritems(eb: path->nodes[`0`])) {
5585	ret = btrfs_next_leaf(root, path);
5586	if (ret < `0`)
5587	return ret;
5588	if (ret > `0`) {
5589	ret = `0`;
5590	break;
5591	}
5592	leaf = path->nodes[`0`];
5593	}
5594
5595	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
5596	if (key.objectid != ino \|\| key.type != BTRFS_EXTENT_DATA_KEY)
5597	break;
5598
5599	/ We have a hole, log it. /
5600	if (prev_extent_end < key.offset) {
5601	const u64 hole_len = key.offset - prev_extent_end;
5602
5603	/*
5604	* Release the path to avoid deadlocks with other code
5605	* paths that search the root while holding locks on
5606	* leafs from the log root.
5607	*/
5608	btrfs_release_path(p: path);
5609	ret = btrfs_insert_hole_extent(trans, root: root->log_root,
5610	objectid: ino, pos: prev_extent_end,
5611	num_bytes: hole_len);
5612	if (ret < `0`)
5613	return ret;
5614
5615	/*
5616	* Search for the same key again in the root. Since it's
5617	* an extent item and we are holding the inode lock, the
5618	* key must still exist. If it doesn't just emit warning
5619	* and return an error to fall back to a transaction
5620	* commit.
5621	*/
5622	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
5623	if (ret < `0`)
5624	return ret;
5625	if (WARN_ON(ret > `0`))
5626	return -ENOENT;
5627	leaf = path->nodes[`0`];
5628	}
5629
5630	prev_extent_end = btrfs_file_extent_end(path);
5631	path->slots[`0`]++;
5632	cond_resched();
5633	}
5634
5635	if (prev_extent_end < i_size) {
5636	u64 hole_len;
5637
5638	btrfs_release_path(p: path);
5639	hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5640	ret = btrfs_insert_hole_extent(trans, root: root->log_root, objectid: ino,
5641	pos: prev_extent_end, num_bytes: hole_len);
5642	if (ret < `0`)
5643	return ret;
5644	}
5645
5646	return `0`;
5647	}
5648
5649	/*
5650	* When we are logging a new inode X, check if it doesn't have a reference that
5651	* matches the reference from some other inode Y created in a past transaction
5652	* and that was renamed in the current transaction. If we don't do this, then at
5653	* log replay time we can lose inode Y (and all its files if it's a directory):
5654	*
5655	* mkdir /mnt/x
5656	* echo "hello world" > /mnt/x/foobar
5657	* sync
5658	* mv /mnt/x /mnt/y
5659	* mkdir /mnt/x # or touch /mnt/x
5660	* xfs_io -c fsync /mnt/x
5661	* <power fail>
5662	* mount fs, trigger log replay
5663	*
5664	* After the log replay procedure, we would lose the first directory and all its
5665	* files (file foobar).
5666	* For the case where inode Y is not a directory we simply end up losing it:
5667	*
5668	* echo "123" > /mnt/foo
5669	* sync
5670	* mv /mnt/foo /mnt/bar
5671	* echo "abc" > /mnt/foo
5672	* xfs_io -c fsync /mnt/foo
5673	* <power fail>
5674	*
5675	* We also need this for cases where a snapshot entry is replaced by some other
5676	* entry (file or directory) otherwise we end up with an unreplayable log due to
5677	* attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5678	* if it were a regular entry:
5679	*
5680	* mkdir /mnt/x
5681	* btrfs subvolume snapshot /mnt /mnt/x/snap
5682	* btrfs subvolume delete /mnt/x/snap
5683	* rmdir /mnt/x
5684	* mkdir /mnt/x
5685	* fsync /mnt/x or fsync some new file inside it
5686	* <power fail>
5687	*
5688	* The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5689	* the same transaction.
5690	*/
5691	static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5692	const int slot,
5693	const struct btrfs_key *key,
5694	struct btrfs_inode *inode,
5695	u64 other_ino, u64 other_parent)
5696	{
5697	BTRFS_PATH_AUTO_FREE(search_path);
5698	char AUTO_KFREE(name);
5699	u32 name_len = `0`;
5700	u32 item_size = btrfs_item_size(eb, slot);
5701	u32 cur_offset = `0`;
5702	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5703
5704	search_path = btrfs_alloc_path();
5705	if (!search_path)
5706	return -ENOMEM;
5707	search_path->search_commit_root = true;
5708	search_path->skip_locking = true;
5709
5710	while (cur_offset < item_size) {
5711	u64 parent;
5712	u32 this_name_len;
5713	u32 this_len;
5714	unsigned long name_ptr;
5715	struct btrfs_dir_item *di;
5716	struct fscrypt_str name_str;
5717
5718	if (key->type == BTRFS_INODE_REF_KEY) {
5719	struct btrfs_inode_ref *iref;
5720
5721	iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5722	parent = key->offset;
5723	this_name_len = btrfs_inode_ref_name_len(eb, s: iref);
5724	name_ptr = (unsigned long)(iref + `1`);
5725	this_len = sizeof(*iref) + this_name_len;
5726	} else {
5727	struct btrfs_inode_extref *extref;
5728
5729	extref = (struct btrfs_inode_extref *)(ptr +
5730	cur_offset);
5731	parent = btrfs_inode_extref_parent(eb, s: extref);
5732	this_name_len = btrfs_inode_extref_name_len(eb, s: extref);
5733	name_ptr = (unsigned long)&extref->name;
5734	this_len = sizeof(*extref) + this_name_len;
5735	}
5736
5737	if (this_name_len > name_len) {
5738	char *new_name;
5739
5740	new_name = krealloc(name, this_name_len, GFP_NOFS);
5741	if (!new_name)
5742	return -ENOMEM;
5743	name_len = this_name_len;
5744	name = new_name;
5745	}
5746
5747	read_extent_buffer(eb, dst: name, start: name_ptr, len: this_name_len);
5748
5749	name_str.name = name;
5750	name_str.len = this_name_len;
5751	di = btrfs_lookup_dir_item(NULL, root: inode->root, path: search_path,
5752	dir: parent, name: &name_str, mod: `0`);
5753	if (di && !IS_ERR(ptr: di)) {
5754	struct btrfs_key di_key;
5755
5756	btrfs_dir_item_key_to_cpu(eb: search_path->nodes[`0`],
5757	item: di, cpu_key: &di_key);
5758	if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5759	if (di_key.objectid != key->objectid) {
5760	*other_ino = di_key.objectid;
5761	*other_parent = parent;
5762	return `1`;
5763	} else {
5764	return `0`;
5765	}
5766	} else {
5767	return -EAGAIN;
5768	}
5769	} else if (IS_ERR(ptr: di)) {
5770	return PTR_ERR(ptr: di);
5771	}
5772	btrfs_release_path(p: search_path);
5773
5774	cur_offset += this_len;
5775	}
5776
5777	return `0`;
5778	}
5779
5780	/*
5781	* Check if we need to log an inode. This is used in contexts where while
5782	* logging an inode we need to log another inode (either that it exists or in
5783	* full mode). This is used instead of btrfs_inode_in_log() because the later
5784	* requires the inode to be in the log and have the log transaction committed,
5785	* while here we do not care if the log transaction was already committed - our
5786	* caller will commit the log later - and we want to avoid logging an inode
5787	* multiple times when multiple tasks have joined the same log transaction.
5788	*/
5789	static bool need_log_inode(const struct btrfs_trans_handle *trans,
5790	struct btrfs_inode *inode)
5791	{
5792	/*
5793	* If a directory was not modified, no dentries added or removed, we can
5794	* and should avoid logging it.
5795	*/
5796	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5797	return false;
5798
5799	/*
5800	* If this inode does not have new/updated/deleted xattrs since the last
5801	* time it was logged and is flagged as logged in the current transaction,
5802	* we can skip logging it. As for new/deleted names, those are updated in
5803	* the log by link/unlink/rename operations.
5804	* In case the inode was logged and then evicted and reloaded, its
5805	* logged_trans will be 0, in which case we have to fully log it since
5806	* logged_trans is a transient field, not persisted.
5807	*/
5808	if (inode_logged(trans, inode, NULL) == `1` &&
5809	!test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5810	return false;
5811
5812	return true;
5813	}
5814
5815	struct btrfs_dir_list {
5816	u64 ino;
5817	struct list_head list;
5818	};
5819
5820	/*
5821	* Log the inodes of the new dentries of a directory.
5822	* See process_dir_items_leaf() for details about why it is needed.
5823	* This is a recursive operation - if an existing dentry corresponds to a
5824	* directory, that directory's new entries are logged too (same behaviour as
5825	* ext3/4, xfs, f2fs, nilfs2). Note that when logging the inodes
5826	* the dentries point to we do not acquire their VFS lock, otherwise lockdep
5827	* complains about the following circular lock dependency / possible deadlock:
5828	*
 *        CPU0                    CPU1
 *        ----                    ----
 *   lock(&type->i_mutex_dir_key#3/2);
 *                                lock(sb_internal#2);
 *                                lock(&type->i_mutex_dir_key#3/2);
 *   lock(&sb->s_type->i_mutex_key#14);
5835	*
5836	* Where sb_internal is the lock (a counter that works as a lock) acquired by
5837	* sb_start_intwrite() in btrfs_start_transaction().
5838	* Not acquiring the VFS lock of the inodes is still safe because:
5839	*
5840	* 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5841	* that while logging the inode new references (names) are added or removed
5842	* from the inode, leaving the logged inode item with a link count that does
5843	* not match the number of logged inode reference items. This is fine because
5844	* at log replay time we compute the real number of links and correct the
5845	* link count in the inode item (see replay_one_buffer() and
5846	* link_to_fixup_dir());
5847	*
5848	* 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5849	* while logging the inode's items new index items (key type
5850	* BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5851	* has a size that doesn't match the sum of the lengths of all the logged
5852	* names - this is ok, not a problem, because at log replay time we set the
5853	* directory's i_size to the correct value (see replay_one_name() and
5854	* overwrite_item()).
5855	*/
5856	static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5857	struct btrfs_inode *start_inode,
5858	struct btrfs_log_ctx *ctx)
5859	{
5860	struct btrfs_root *root = start_inode->root;
5861	struct btrfs_path *path;
5862	LIST_HEAD(dir_list);
5863	struct btrfs_dir_list *dir_elem;
5864	u64 ino = btrfs_ino(inode: start_inode);
5865	struct btrfs_inode *curr_inode = start_inode;
5866	int ret = `0`;
5867
5868	path = btrfs_alloc_path();
5869	if (!path)
5870	return -ENOMEM;
5871
5872	/ Pairs with btrfs_add_delayed_iput below. /
5873	ihold(inode: &curr_inode->vfs_inode);
5874
5875	while (true) {
5876	struct btrfs_key key;
5877	struct btrfs_key found_key;
5878	u64 next_index;
5879	bool continue_curr_inode = true;
5880	int iter_ret;
5881
5882	key.objectid = ino;
5883	key.type = BTRFS_DIR_INDEX_KEY;
5884	key.offset = btrfs_get_first_dir_index_to_log(inode: curr_inode);
5885	next_index = key.offset;
5886	again:
5887	btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5888	struct extent_buffer *leaf = path->nodes[`0`];
5889	struct btrfs_dir_item *di;
5890	struct btrfs_key di_key;
5891	struct btrfs_inode *di_inode;
5892	int log_mode = LOG_INODE_EXISTS;
5893	int type;
5894
5895	if (found_key.objectid != ino \|\|
5896	found_key.type != BTRFS_DIR_INDEX_KEY) {
5897	continue_curr_inode = false;
5898	break;
5899	}
5900
5901	next_index = found_key.offset + `1`;
5902
5903	di = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_dir_item);
5904	type = btrfs_dir_ftype(eb: leaf, item: di);
5905	if (btrfs_dir_transid(eb: leaf, s: di) < trans->transid)
5906	continue;
5907	btrfs_dir_item_key_to_cpu(eb: leaf, item: di, cpu_key: &di_key);
5908	if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5909	continue;
5910
5911	btrfs_release_path(p: path);
5912	di_inode = btrfs_iget_logging(objectid: di_key.objectid, root);
5913	if (IS_ERR(ptr: di_inode)) {
5914	ret = PTR_ERR(ptr: di_inode);
5915	goto out;
5916	}
5917
5918	if (!need_log_inode(trans, inode: di_inode)) {
5919	btrfs_add_delayed_iput(inode: di_inode);
5920	break;
5921	}
5922
5923	ctx->log_new_dentries = false;
5924	if (type == BTRFS_FT_DIR)
5925	log_mode = LOG_INODE_ALL;
5926	ret = btrfs_log_inode(trans, inode: di_inode, inode_only: log_mode, ctx);
5927	btrfs_add_delayed_iput(inode: di_inode);
5928	if (ret)
5929	goto out;
5930	if (ctx->log_new_dentries) {
5931	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5932	if (!dir_elem) {
5933	ret = -ENOMEM;
5934	goto out;
5935	}
5936	dir_elem->ino = di_key.objectid;
5937	list_add_tail(new: &dir_elem->list, head: &dir_list);
5938	}
5939	break;
5940	}
5941
5942	btrfs_release_path(p: path);
5943
5944	if (iter_ret < `0`) {
5945	ret = iter_ret;
5946	goto out;
5947	} else if (iter_ret > `0`) {
5948	continue_curr_inode = false;
5949	} else {
5950	key = found_key;
5951	}
5952
5953	if (continue_curr_inode && key.offset < (u64)-`1`) {
5954	key.offset++;
5955	goto again;
5956	}
5957
5958	btrfs_set_first_dir_index_to_log(inode: curr_inode, index: next_index);
5959
5960	if (list_empty(head: &dir_list))
5961	break;
5962
5963	dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5964	ino = dir_elem->ino;
5965	list_del(entry: &dir_elem->list);
5966	kfree(objp: dir_elem);
5967
5968	btrfs_add_delayed_iput(inode: curr_inode);
5969
5970	curr_inode = btrfs_iget_logging(objectid: ino, root);
5971	if (IS_ERR(ptr: curr_inode)) {
5972	ret = PTR_ERR(ptr: curr_inode);
5973	curr_inode = NULL;
5974	break;
5975	}
5976	}
5977	out:
5978	btrfs_free_path(p: path);
5979	if (curr_inode)
5980	btrfs_add_delayed_iput(inode: curr_inode);
5981
5982	if (ret) {
5983	struct btrfs_dir_list *next;
5984
5985	list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5986	kfree(objp: dir_elem);
5987	}
5988
5989	return ret;
5990	}
5991
5992	struct btrfs_ino_list {
5993	u64 ino;
5994	u64 parent;
5995	struct list_head list;
5996	};
5997
5998	static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5999	{
6000	struct btrfs_ino_list *curr;
6001	struct btrfs_ino_list *next;
6002
6003	list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
6004	list_del(entry: &curr->list);
6005	kfree(objp: curr);
6006	}
6007	}
6008
6009	static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
6010	struct btrfs_path *path)
6011	{
6012	struct btrfs_key key;
6013	int ret;
6014
6015	key.objectid = ino;
6016	key.type = BTRFS_INODE_ITEM_KEY;
6017	key.offset = `0`;
6018
6019	path->search_commit_root = true;
6020	path->skip_locking = true;
6021
6022	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
6023	if (WARN_ON_ONCE(ret > `0`)) {
6024	/*
6025	* We have previously found the inode through the commit root
6026	* so this should not happen. If it does, just error out and
6027	* fallback to a transaction commit.
6028	*/
6029	ret = -ENOENT;
6030	} else if (ret == `0`) {
6031	struct btrfs_inode_item *item;
6032
6033	item = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
6034	struct btrfs_inode_item);
6035	if (S_ISDIR(btrfs_inode_mode(path->nodes[`0`], item)))
6036	ret = `1`;
6037	}
6038
6039	btrfs_release_path(p: path);
6040	path->search_commit_root = false;
6041	path->skip_locking = false;
6042
6043	return ret;
6044	}
6045
6046	static bool can_log_conflicting_inode(const struct btrfs_trans_handle *trans,
6047	const struct btrfs_inode *inode)
6048	{
6049	if (!S_ISDIR(inode->vfs_inode.i_mode))
6050	return true;
6051
6052	if (inode->last_unlink_trans < trans->transid)
6053	return true;
6054
6055	/*
6056	* If this is a directory and its unlink_trans is not from a past
6057	* transaction then we must fallback to a transaction commit in order
6058	* to avoid getting a directory with 2 hard links after log replay.
6059	*
6060	* This happens if a directory A is renamed, moved from one parent
6061	* directory to another one, a new file is created in the old parent
6062	* directory with the old name of our directory A, the new file is
6063	* fsynced, then we moved the new file to some other parent directory
6064	* and fsync again the new file. This results in a log tree where we
6065	* logged that directory A existed, with the INODE_REF item for the
6066	* new location but without having logged its old parent inode, so
6067	* that on log replay we add a new link for the new location but the
6068	* old link remains, resulting in a link count of 2.
6069	*/
6070	return false;
6071	}
6072
6073	static int add_conflicting_inode(struct btrfs_trans_handle *trans,
6074	struct btrfs_root *root,
6075	struct btrfs_path *path,
6076	u64 ino, u64 parent,
6077	struct btrfs_log_ctx *ctx)
6078	{
6079	struct btrfs_ino_list *ino_elem;
6080	struct btrfs_inode *inode;
6081
6082	/*
6083	* It's rare to have a lot of conflicting inodes, in practice it is not
6084	* common to have more than 1 or 2. We don't want to collect too many,
6085	* as we could end up logging too many inodes (even if only in
6086	* LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
6087	* commits.
6088	*/
6089	if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
6090	return BTRFS_LOG_FORCE_COMMIT;
6091
6092	inode = btrfs_iget_logging(objectid: ino, root);
6093	/*
6094	* If the other inode that had a conflicting dir entry was deleted in
6095	* the current transaction then we either:
6096	*
6097	* 1) Log the parent directory (later after adding it to the list) if
6098	* the inode is a directory. This is because it may be a deleted
6099	* subvolume/snapshot or it may be a regular directory that had
6100	* deleted subvolumes/snapshots (or subdirectories that had them),
6101	* and at the moment we can't deal with dropping subvolumes/snapshots
6102	* during log replay. So we just log the parent, which will result in
6103	* a fallback to a transaction commit if we are dealing with those
6104	* cases (last_unlink_trans will match the current transaction);
6105	*
6106	* 2) Do nothing if it's not a directory. During log replay we simply
6107	* unlink the conflicting dentry from the parent directory and then
6108	* add the dentry for our inode. Like this we can avoid logging the
6109	* parent directory (and maybe fallback to a transaction commit in
6110	* case it has a last_unlink_trans == trans->transid, due to moving
6111	* some inode from it to some other directory).
6112	*/
6113	if (IS_ERR(ptr: inode)) {
6114	int ret = PTR_ERR(ptr: inode);
6115
6116	if (ret != -ENOENT)
6117	return ret;
6118
6119	ret = conflicting_inode_is_dir(root, ino, path);
6120	/ Not a directory or we got an error. /
6121	if (ret <= `0`)
6122	return ret;
6123
6124	/ Conflicting inode is a directory, so we'll log its parent. /
6125	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
6126	if (!ino_elem)
6127	return -ENOMEM;
6128	ino_elem->ino = ino;
6129	ino_elem->parent = parent;
6130	list_add_tail(new: &ino_elem->list, head: &ctx->conflict_inodes);
6131	ctx->num_conflict_inodes++;
6132
6133	return `0`;
6134	}
6135
6136	/*
6137	* If the inode was already logged skip it - otherwise we can hit an
6138	* infinite loop. Example:
6139	*
6140	* From the commit root (previous transaction) we have the following
6141	* inodes:
6142	*
6143	* inode 257 a directory
6144	* inode 258 with references "zz" and "zz_link" on inode 257
6145	* inode 259 with reference "a" on inode 257
6146	*
6147	* And in the current (uncommitted) transaction we have:
6148	*
6149	* inode 257 a directory, unchanged
6150	* inode 258 with references "a" and "a2" on inode 257
6151	* inode 259 with reference "zz_link" on inode 257
6152	* inode 261 with reference "zz" on inode 257
6153	*
6154	* When logging inode 261 the following infinite loop could
6155	* happen if we don't skip already logged inodes:
6156	*
6157	* - we detect inode 258 as a conflicting inode, with inode 261
6158	* on reference "zz", and log it;
6159	*
6160	* - we detect inode 259 as a conflicting inode, with inode 258
6161	* on reference "a", and log it;
6162	*
6163	* - we detect inode 258 as a conflicting inode, with inode 259
6164	* on reference "zz_link", and log it - again! After this we
6165	* repeat the above steps forever.
6166	*
6167	* Here we can use need_log_inode() because we only need to log the
6168	* inode in LOG_INODE_EXISTS mode and rename operations update the log,
6169	* so that the log ends up with the new name and without the old name.
6170	*/
6171	if (!need_log_inode(trans, inode)) {
6172	btrfs_add_delayed_iput(inode);
6173	return `0`;
6174	}
6175
6176	if (!can_log_conflicting_inode(trans, inode)) {
6177	btrfs_add_delayed_iput(inode);
6178	return BTRFS_LOG_FORCE_COMMIT;
6179	}
6180
6181	btrfs_add_delayed_iput(inode);
6182
6183	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
6184	if (!ino_elem)
6185	return -ENOMEM;
6186	ino_elem->ino = ino;
6187	ino_elem->parent = parent;
6188	list_add_tail(new: &ino_elem->list, head: &ctx->conflict_inodes);
6189	ctx->num_conflict_inodes++;
6190
6191	return `0`;
6192	}
6193
6194	static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
6195	struct btrfs_root *root,
6196	struct btrfs_log_ctx *ctx)
6197	{
6198	int ret = `0`;
6199
6200	/*
6201	* Conflicting inodes are logged by the first call to btrfs_log_inode(),
6202	* otherwise we could have unbounded recursion of btrfs_log_inode()
6203	* calls. This check guarantees we can have only 1 level of recursion.
6204	*/
6205	if (ctx->logging_conflict_inodes)
6206	return `0`;
6207
6208	ctx->logging_conflict_inodes = true;
6209
6210	/*
6211	* New conflicting inodes may be found and added to the list while we
6212	* are logging a conflicting inode, so keep iterating while the list is
6213	* not empty.
6214	*/
6215	while (!list_empty(head: &ctx->conflict_inodes)) {
6216	struct btrfs_ino_list *curr;
6217	struct btrfs_inode *inode;
6218	u64 ino;
6219	u64 parent;
6220
6221	curr = list_first_entry(&ctx->conflict_inodes,
6222	struct btrfs_ino_list, list);
6223	ino = curr->ino;
6224	parent = curr->parent;
6225	list_del(entry: &curr->list);
6226	kfree(objp: curr);
6227
6228	inode = btrfs_iget_logging(objectid: ino, root);
6229	/*
6230	* If the other inode that had a conflicting dir entry was
6231	* deleted in the current transaction, we need to log its parent
6232	* directory. See the comment at add_conflicting_inode().
6233	*/
6234	if (IS_ERR(ptr: inode)) {
6235	ret = PTR_ERR(ptr: inode);
6236	if (ret != -ENOENT)
6237	break;
6238
6239	inode = btrfs_iget_logging(objectid: parent, root);
6240	if (IS_ERR(ptr: inode)) {
6241	ret = PTR_ERR(ptr: inode);
6242	break;
6243	}
6244
6245	if (!can_log_conflicting_inode(trans, inode)) {
6246	btrfs_add_delayed_iput(inode);
6247	ret = BTRFS_LOG_FORCE_COMMIT;
6248	break;
6249	}
6250
6251	/*
6252	* Always log the directory, we cannot make this
6253	* conditional on need_log_inode() because the directory
6254	* might have been logged in LOG_INODE_EXISTS mode or
6255	* the dir index of the conflicting inode is not in a
6256	* dir index key range logged for the directory. So we
6257	* must make sure the deletion is recorded.
6258	*/
6259	ret = btrfs_log_inode(trans, inode, inode_only: LOG_INODE_ALL, ctx);
6260	btrfs_add_delayed_iput(inode);
6261	if (ret)
6262	break;
6263	continue;
6264	}
6265
6266	/*
6267	* Here we can use need_log_inode() because we only need to log
6268	* the inode in LOG_INODE_EXISTS mode and rename operations
6269	* update the log, so that the log ends up with the new name and
6270	* without the old name.
6271	*
6272	* We did this check at add_conflicting_inode(), but here we do
6273	* it again because if some other task logged the inode after
6274	* that, we can avoid doing it again.
6275	*/
6276	if (!need_log_inode(trans, inode)) {
6277	btrfs_add_delayed_iput(inode);
6278	continue;
6279	}
6280
6281	/*
6282	* We are safe logging the other inode without acquiring its
6283	* lock as long as we log with the LOG_INODE_EXISTS mode. We
6284	* are safe against concurrent renames of the other inode as
6285	* well because during a rename we pin the log and update the
6286	* log with the new name before we unpin it.
6287	*/
6288	ret = btrfs_log_inode(trans, inode, inode_only: LOG_INODE_EXISTS, ctx);
6289	btrfs_add_delayed_iput(inode);
6290	if (ret)
6291	break;
6292	}
6293
6294	ctx->logging_conflict_inodes = false;
6295	if (ret)
6296	free_conflicting_inodes(ctx);
6297
6298	return ret;
6299	}
6300
6301	static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
6302	struct btrfs_inode *inode,
6303	struct btrfs_key *min_key,
6304	const struct btrfs_key *max_key,
6305	struct btrfs_path *path,
6306	struct btrfs_path *dst_path,
6307	const u64 logged_isize,
6308	const int inode_only,
6309	struct btrfs_log_ctx *ctx,
6310	bool *need_log_inode_item)
6311	{
6312	const u64 i_size = i_size_read(inode: &inode->vfs_inode);
6313	struct btrfs_root *root = inode->root;
6314	int ins_start_slot = `0`;
6315	int ins_nr = `0`;
6316	int ret;
6317
6318	while (`1`) {
6319	ret = btrfs_search_forward(root, min_key, path, min_trans: trans->transid);
6320	if (ret < `0`)
6321	return ret;
6322	if (ret > `0`) {
6323	ret = `0`;
6324	break;
6325	}
6326	again:
6327	/ Note, ins_nr might be > 0 here, cleanup outside the loop /
6328	if (min_key->objectid != max_key->objectid)
6329	break;
6330	if (min_key->type > max_key->type)
6331	break;
6332
6333	if (min_key->type == BTRFS_INODE_ITEM_KEY) {
6334	*need_log_inode_item = false;
6335	} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
6336	min_key->offset >= i_size) {
6337	/*
6338	* Extents at and beyond eof are logged with
6339	* btrfs_log_prealloc_extents().
6340	* Only regular files have BTRFS_EXTENT_DATA_KEY keys,
6341	* and no keys greater than that, so bail out.
6342	*/
6343	break;
6344	} else if (min_key->type == BTRFS_INODE_REF_KEY \|\|
6345	min_key->type == BTRFS_INODE_EXTREF_KEY) {
6346	u64 other_ino = `0`;
6347	u64 other_parent = `0`;
6348
6349	ret = btrfs_check_ref_name_override(eb: path->nodes[`0`],
6350	slot: path->slots[`0`], key: min_key, inode,
6351	other_ino: &other_ino, other_parent: &other_parent);
6352	if (ret < `0`) {
6353	return ret;
6354	} else if (ret > `0` &&
6355	other_ino != btrfs_ino(inode: ctx->inode)) {
6356	if (ins_nr > `0`) {
6357	ins_nr++;
6358	} else {
6359	ins_nr = `1`;
6360	ins_start_slot = path->slots[`0`];
6361	}
6362	ret = copy_items(trans, inode, dst_path, src_path: path,
6363	start_slot: ins_start_slot, nr: ins_nr,
6364	inode_only, logged_isize, ctx);
6365	if (ret < `0`)
6366	return ret;
6367	ins_nr = `0`;
6368
6369	btrfs_release_path(p: path);
6370	ret = add_conflicting_inode(trans, root, path,
6371	ino: other_ino,
6372	parent: other_parent, ctx);
6373	if (ret)
6374	return ret;
6375	goto next_key;
6376	}
6377	} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
6378	/ Skip xattrs, logged later with btrfs_log_all_xattrs() /
6379	if (ins_nr == `0`)
6380	goto next_slot;
6381	ret = copy_items(trans, inode, dst_path, src_path: path,
6382	start_slot: ins_start_slot,
6383	nr: ins_nr, inode_only, logged_isize, ctx);
6384	if (ret < `0`)
6385	return ret;
6386	ins_nr = `0`;
6387	goto next_slot;
6388	}
6389
6390	if (ins_nr && ins_start_slot + ins_nr == path->slots[`0`]) {
6391	ins_nr++;
6392	goto next_slot;
6393	} else if (!ins_nr) {
6394	ins_start_slot = path->slots[`0`];
6395	ins_nr = `1`;
6396	goto next_slot;
6397	}
6398
6399	ret = copy_items(trans, inode, dst_path, src_path: path, start_slot: ins_start_slot,
6400	nr: ins_nr, inode_only, logged_isize, ctx);
6401	if (ret < `0`)
6402	return ret;
6403	ins_nr = `1`;
6404	ins_start_slot = path->slots[`0`];
6405	next_slot:
6406	path->slots[`0`]++;
6407	if (path->slots[`0`] < btrfs_header_nritems(eb: path->nodes[`0`])) {
6408	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: min_key,
6409	nr: path->slots[`0`]);
6410	goto again;
6411	}
6412	if (ins_nr) {
6413	ret = copy_items(trans, inode, dst_path, src_path: path,
6414	start_slot: ins_start_slot, nr: ins_nr, inode_only,
6415	logged_isize, ctx);
6416	if (ret < `0`)
6417	return ret;
6418	ins_nr = `0`;
6419	}
6420	btrfs_release_path(p: path);
6421	next_key:
6422	if (min_key->offset < (u64)-`1`) {
6423	min_key->offset++;
6424	} else if (min_key->type < max_key->type) {
6425	min_key->type++;
6426	min_key->offset = `0`;
6427	} else {
6428	break;
6429	}
6430
6431	/*
6432	* We may process many leaves full of items for our inode, so
6433	* avoid monopolizing a cpu for too long by rescheduling while
6434	* not holding locks on any tree.
6435	*/
6436	cond_resched();
6437	}
6438	if (ins_nr) {
6439	ret = copy_items(trans, inode, dst_path, src_path: path, start_slot: ins_start_slot,
6440	nr: ins_nr, inode_only, logged_isize, ctx);
6441	if (ret)
6442	return ret;
6443	}
6444
6445	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
6446	/*
6447	* Release the path because otherwise we might attempt to double
6448	* lock the same leaf with btrfs_log_prealloc_extents() below.
6449	*/
6450	btrfs_release_path(p: path);
6451	ret = btrfs_log_prealloc_extents(trans, inode, path: dst_path, ctx);
6452	}
6453
6454	return ret;
6455	}
6456
6457	static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
6458	struct btrfs_root *log,
6459	struct btrfs_path *path,
6460	const struct btrfs_item_batch *batch,
6461	const struct btrfs_delayed_item *first_item)
6462	{
6463	const struct btrfs_delayed_item *curr = first_item;
6464	int ret;
6465
6466	ret = btrfs_insert_empty_items(trans, root: log, path, batch);
6467	if (ret)
6468	return ret;
6469
6470	for (int i = `0`; i < batch->nr; i++) {
6471	char *data_ptr;
6472
6473	data_ptr = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`], char);
6474	write_extent_buffer(eb: path->nodes[`0`], src: &curr->data,
6475	start: (unsigned long)data_ptr, len: curr->data_len);
6476	curr = list_next_entry(curr, log_list);
6477	path->slots[`0`]++;
6478	}
6479
6480	btrfs_release_path(p: path);
6481
6482	return `0`;
6483	}
6484
6485	static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6486	struct btrfs_inode *inode,
6487	struct btrfs_path *path,
6488	const struct list_head *delayed_ins_list,
6489	struct btrfs_log_ctx *ctx)
6490	{
6491	/ 195 (4095 bytes of keys and sizes) fits in a single 4K page. /
6492	const int max_batch_size = `195`;
6493	const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(info: trans->fs_info);
6494	const u64 ino = btrfs_ino(inode);
6495	struct btrfs_root *log = inode->root->log_root;
6496	struct btrfs_item_batch batch = {
6497	.nr = `0`,
6498	.total_data_size = `0`,
6499	};
6500	const struct btrfs_delayed_item *first = NULL;
6501	const struct btrfs_delayed_item *curr;
6502	char *ins_data;
6503	struct btrfs_key *ins_keys;
6504	u32 *ins_sizes;
6505	u64 curr_batch_size = `0`;
6506	int batch_idx = `0`;
6507	int ret;
6508
6509	/ We are adding dir index items to the log tree. /
6510	lockdep_assert_held(&inode->log_mutex);
6511
6512	/*
6513	* We collect delayed items before copying index keys from the subvolume
6514	* to the log tree. However just after we collected them, they may have
6515	* been flushed (all of them or just some of them), and therefore we
6516	* could have copied them from the subvolume tree to the log tree.
6517	* So find the first delayed item that was not yet logged (they are
6518	* sorted by index number).
6519	*/
6520	list_for_each_entry(curr, delayed_ins_list, log_list) {
6521	if (curr->index > inode->last_dir_index_offset) {
6522	first = curr;
6523	break;
6524	}
6525	}
6526
6527	/ Empty list or all delayed items were already logged. /
6528	if (!first)
6529	return `0`;
6530
6531	ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
6532	if (!ins_data)
6533	return -ENOMEM;
6534	ins_sizes = (u32 *)ins_data;
6535	batch.data_sizes = ins_sizes;
6536	ins_keys = (struct btrfs_key )(ins_data + max_batch_size sizeof(u32));
6537	batch.keys = ins_keys;
6538
6539	curr = first;
6540	while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6541	const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6542
6543	if (curr_batch_size + curr_size > leaf_data_size \|\|
6544	batch.nr == max_batch_size) {
6545	ret = insert_delayed_items_batch(trans, log, path,
6546	batch: &batch, first_item: first);
6547	if (ret)
6548	goto out;
6549	batch_idx = `0`;
6550	batch.nr = `0`;
6551	batch.total_data_size = `0`;
6552	curr_batch_size = `0`;
6553	first = curr;
6554	}
6555
6556	ins_sizes[batch_idx] = curr->data_len;
6557	ins_keys[batch_idx].objectid = ino;
6558	ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6559	ins_keys[batch_idx].offset = curr->index;
6560	curr_batch_size += curr_size;
6561	batch.total_data_size += curr->data_len;
6562	batch.nr++;
6563	batch_idx++;
6564	curr = list_next_entry(curr, log_list);
6565	}
6566
6567	ASSERT(batch.nr >= `1`, "batch.nr=%d", batch.nr);
6568	ret = insert_delayed_items_batch(trans, log, path, batch: &batch, first_item: first);
6569
6570	curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6571	log_list);
6572	inode->last_dir_index_offset = curr->index;
6573	out:
6574	kfree(objp: ins_data);
6575
6576	return ret;
6577	}
6578
6579	static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6580	struct btrfs_inode *inode,
6581	struct btrfs_path *path,
6582	const struct list_head *delayed_del_list,
6583	struct btrfs_log_ctx *ctx)
6584	{
6585	const u64 ino = btrfs_ino(inode);
6586	const struct btrfs_delayed_item *curr;
6587
6588	curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6589	log_list);
6590
6591	while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6592	u64 first_dir_index = curr->index;
6593	u64 last_dir_index;
6594	const struct btrfs_delayed_item *next;
6595	int ret;
6596
6597	/*
6598	* Find a range of consecutive dir index items to delete. Like
6599	* this we log a single dir range item spanning several contiguous
6600	* dir items instead of logging one range item per dir index item.
6601	*/
6602	next = list_next_entry(curr, log_list);
6603	while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6604	if (next->index != curr->index + `1`)
6605	break;
6606	curr = next;
6607	next = list_next_entry(next, log_list);
6608	}
6609
6610	last_dir_index = curr->index;
6611	ASSERT(last_dir_index >= first_dir_index,
6612	"last_dir_index=%llu first_dir_index=%llu",
6613	last_dir_index, first_dir_index);
6614
6615	ret = insert_dir_log_key(trans, log: inode->root->log_root, path,
6616	dirid: ino, first_offset: first_dir_index, last_offset: last_dir_index);
6617	if (ret)
6618	return ret;
6619	curr = list_next_entry(curr, log_list);
6620	}
6621
6622	return `0`;
6623	}
6624
6625	static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6626	struct btrfs_inode *inode,
6627	struct btrfs_path *path,
6628	const struct list_head *delayed_del_list,
6629	const struct btrfs_delayed_item *first,
6630	const struct btrfs_delayed_item **last_ret)
6631	{
6632	const struct btrfs_delayed_item *next;
6633	struct extent_buffer *leaf = path->nodes[`0`];
6634	const int last_slot = btrfs_header_nritems(eb: leaf) - `1`;
6635	int slot = path->slots[`0`] + `1`;
6636	const u64 ino = btrfs_ino(inode);
6637
6638	next = list_next_entry(first, log_list);
6639
6640	while (slot < last_slot &&
6641	!list_entry_is_head(next, delayed_del_list, log_list)) {
6642	struct btrfs_key key;
6643
6644	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
6645	if (key.objectid != ino \|\|
6646	key.type != BTRFS_DIR_INDEX_KEY \|\|
6647	key.offset != next->index)
6648	break;
6649
6650	slot++;
6651	*last_ret = next;
6652	next = list_next_entry(next, log_list);
6653	}
6654
6655	return btrfs_del_items(trans, root: inode->root->log_root, path,
6656	slot: path->slots[`0`], nr: slot - path->slots[`0`]);
6657	}
6658
6659	static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6660	struct btrfs_inode *inode,
6661	struct btrfs_path *path,
6662	const struct list_head *delayed_del_list,
6663	struct btrfs_log_ctx *ctx)
6664	{
6665	struct btrfs_root *log = inode->root->log_root;
6666	const struct btrfs_delayed_item *curr;
6667	u64 last_range_start = `0`;
6668	u64 last_range_end = `0`;
6669	struct btrfs_key key;
6670
6671	key.objectid = btrfs_ino(inode);
6672	key.type = BTRFS_DIR_INDEX_KEY;
6673	curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6674	log_list);
6675
6676	while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6677	const struct btrfs_delayed_item *last = curr;
6678	u64 first_dir_index = curr->index;
6679	u64 last_dir_index;
6680	bool deleted_items = false;
6681	int ret;
6682
6683	key.offset = curr->index;
6684	ret = btrfs_search_slot(trans, root: log, key: &key, p: path, ins_len: -`1`, cow: `1`);
6685	if (ret < `0`) {
6686	return ret;
6687	} else if (ret == `0`) {
6688	ret = batch_delete_dir_index_items(trans, inode, path,
6689	delayed_del_list, first: curr,
6690	last_ret: &last);
6691	if (ret)
6692	return ret;
6693	deleted_items = true;
6694	}
6695
6696	btrfs_release_path(p: path);
6697
6698	/*
6699	* If we deleted items from the leaf, it means we have a range
6700	* item logging their range, so no need to add one or update an
6701	* existing one. Otherwise we have to log a dir range item.
6702	*/
6703	if (deleted_items)
6704	goto next_batch;
6705
6706	last_dir_index = last->index;
6707	ASSERT(last_dir_index >= first_dir_index,
6708	"last_dir_index=%llu first_dir_index=%llu",
6709	last_dir_index, first_dir_index);
6710	/*
6711	* If this range starts right after where the previous one ends,
6712	* then we want to reuse the previous range item and change its
6713	* end offset to the end of this range. This is just to minimize
6714	* leaf space usage, by avoiding adding a new range item.
6715	*/
6716	if (last_range_end != `0` && first_dir_index == last_range_end + `1`)
6717	first_dir_index = last_range_start;
6718
6719	ret = insert_dir_log_key(trans, log, path, dirid: key.objectid,
6720	first_offset: first_dir_index, last_offset: last_dir_index);
6721	if (ret)
6722	return ret;
6723
6724	last_range_start = first_dir_index;
6725	last_range_end = last_dir_index;
6726	next_batch:
6727	curr = list_next_entry(last, log_list);
6728	}
6729
6730	return `0`;
6731	}
6732
6733	static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6734	struct btrfs_inode *inode,
6735	struct btrfs_path *path,
6736	const struct list_head *delayed_del_list,
6737	struct btrfs_log_ctx *ctx)
6738	{
6739	/*
6740	* We are deleting dir index items from the log tree or adding range
6741	* items to it.
6742	*/
6743	lockdep_assert_held(&inode->log_mutex);
6744
6745	if (list_empty(head: delayed_del_list))
6746	return `0`;
6747
6748	if (ctx->logged_before)
6749	return log_delayed_deletions_incremental(trans, inode, path,
6750	delayed_del_list, ctx);
6751
6752	return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6753	ctx);
6754	}
6755
6756	/*
6757	* Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
6758	* items instead of the subvolume tree.
6759	*/
6760	static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6761	struct btrfs_inode *inode,
6762	const struct list_head *delayed_ins_list,
6763	struct btrfs_log_ctx *ctx)
6764	{
6765	const bool orig_log_new_dentries = ctx->log_new_dentries;
6766	struct btrfs_delayed_item *item;
6767	int ret = `0`;
6768
6769	/*
6770	* No need for the log mutex, plus to avoid potential deadlocks or
6771	* lockdep annotations due to nesting of delayed inode mutexes and log
6772	* mutexes.
6773	*/
6774	lockdep_assert_not_held(&inode->log_mutex);
6775
6776	ASSERT(!ctx->logging_new_delayed_dentries,
6777	"ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries);
6778	ctx->logging_new_delayed_dentries = true;
6779
6780	list_for_each_entry(item, delayed_ins_list, log_list) {
6781	struct btrfs_dir_item *dir_item;
6782	struct btrfs_inode *di_inode;
6783	struct btrfs_key key;
6784	int log_mode = LOG_INODE_EXISTS;
6785
6786	dir_item = (struct btrfs_dir_item *)item->data;
6787	btrfs_disk_key_to_cpu(cpu_key: &key, disk_key: &dir_item->location);
6788
6789	if (key.type == BTRFS_ROOT_ITEM_KEY)
6790	continue;
6791
6792	di_inode = btrfs_iget_logging(objectid: key.objectid, root: inode->root);
6793	if (IS_ERR(ptr: di_inode)) {
6794	ret = PTR_ERR(ptr: di_inode);
6795	break;
6796	}
6797
6798	if (!need_log_inode(trans, inode: di_inode)) {
6799	btrfs_add_delayed_iput(inode: di_inode);
6800	continue;
6801	}
6802
6803	if (btrfs_stack_dir_ftype(item: dir_item) == BTRFS_FT_DIR)
6804	log_mode = LOG_INODE_ALL;
6805
6806	ctx->log_new_dentries = false;
6807	ret = btrfs_log_inode(trans, inode: di_inode, inode_only: log_mode, ctx);
6808
6809	if (!ret && ctx->log_new_dentries)
6810	ret = log_new_dir_dentries(trans, start_inode: di_inode, ctx);
6811
6812	btrfs_add_delayed_iput(inode: di_inode);
6813
6814	if (ret)
6815	break;
6816	}
6817
6818	ctx->log_new_dentries = orig_log_new_dentries;
6819	ctx->logging_new_delayed_dentries = false;
6820
6821	return ret;
6822	}
6823
6824	/ log a single inode in the tree log.*
6825	* At least one parent directory for this inode must exist in the tree
6826	* or be logged already.
6827	*
6828	* Any items from this inode changed by the current transaction are copied
6829	* to the log tree. An extra reference is taken on any extents in this
6830	* file, allowing us to avoid a whole pile of corner cases around logging
6831	* blocks that have been removed from the tree.
6832	*
6833	* See LOG_INODE_ALL and related defines for a description of what inode_only
6834	* does.
6835	*
6836	* This handles both files and directories.
6837	*/
6838	static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6839	struct btrfs_inode *inode,
6840	int inode_only,
6841	struct btrfs_log_ctx *ctx)
6842	{
6843	struct btrfs_path *path;
6844	struct btrfs_path *dst_path;
6845	struct btrfs_key min_key;
6846	struct btrfs_key max_key;
6847	struct btrfs_root *log = inode->root->log_root;
6848	int ret;
6849	bool fast_search = false;
6850	u64 ino = btrfs_ino(inode);
6851	struct extent_map_tree *em_tree = &inode->extent_tree;
6852	u64 logged_isize = `0`;
6853	bool need_log_inode_item = true;
6854	bool xattrs_logged = false;
6855	bool inode_item_dropped = true;
6856	bool full_dir_logging = false;
6857	LIST_HEAD(delayed_ins_list);
6858	LIST_HEAD(delayed_del_list);
6859
6860	path = btrfs_alloc_path();
6861	if (!path)
6862	return -ENOMEM;
6863	dst_path = btrfs_alloc_path();
6864	if (!dst_path) {
6865	btrfs_free_path(p: path);
6866	return -ENOMEM;
6867	}
6868
6869	min_key.objectid = ino;
6870	min_key.type = BTRFS_INODE_ITEM_KEY;
6871	min_key.offset = `0`;
6872
6873	max_key.objectid = ino;
6874
6875
6876	/ today the code can only do partial logging of directories /
6877	if (S_ISDIR(inode->vfs_inode.i_mode) \|\|
6878	(!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6879	&inode->runtime_flags) &&
6880	inode_only >= LOG_INODE_EXISTS))
6881	max_key.type = BTRFS_XATTR_ITEM_KEY;
6882	else
6883	max_key.type = (u8)-`1`;
6884	max_key.offset = (u64)-`1`;
6885
6886	if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6887	full_dir_logging = true;
6888
6889	/*
6890	* If we are logging a directory while we are logging dentries of the
6891	* delayed items of some other inode, then we need to flush the delayed
6892	* items of this directory and not log the delayed items directly. This
6893	* is to prevent more than one level of recursion into btrfs_log_inode()
6894	* by having something like this:
6895	*
6896	* $ mkdir -p a/b/c/d/e/f/g/h/...
6897	* $ xfs_io -c "fsync" a
6898	*
6899	* Where all directories in the path did not exist before and are
6900	* created in the current transaction.
6901	* So in such a case we directly log the delayed items of the main
6902	* directory ("a") without flushing them first, while for each of its
6903	* subdirectories we flush their delayed items before logging them.
6904	* This prevents a potential unbounded recursion like this:
6905	*
6906	* btrfs_log_inode()
6907	* log_new_delayed_dentries()
6908	* btrfs_log_inode()
6909	* log_new_delayed_dentries()
6910	* btrfs_log_inode()
6911	* log_new_delayed_dentries()
6912	* (...)
6913	*
6914	* We have thresholds for the maximum number of delayed items to have in
6915	* memory, and once they are hit, the items are flushed asynchronously.
6916	* However the limit is quite high, so lets prevent deep levels of
6917	* recursion to happen by limiting the maximum depth to be 1.
6918	*/
6919	if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6920	ret = btrfs_commit_inode_delayed_items(trans, inode);
6921	if (ret)
6922	goto out;
6923	}
6924
6925	mutex_lock(&inode->log_mutex);
6926
6927	/*
6928	* For symlinks, we must always log their content, which is stored in an
6929	* inline extent, otherwise we could end up with an empty symlink after
6930	* log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6931	* one attempts to create an empty symlink).
6932	* We don't need to worry about flushing delalloc, because when we create
6933	* the inline extent when the symlink is created (we never have delalloc
6934	* for symlinks).
6935	*/
6936	if (S_ISLNK(inode->vfs_inode.i_mode))
6937	inode_only = LOG_INODE_ALL;
6938
6939	/*
6940	* Before logging the inode item, cache the value returned by
6941	* inode_logged(), because after that we have the need to figure out if
6942	* the inode was previously logged in this transaction.
6943	*/
6944	ret = inode_logged(trans, inode, path_in: path);
6945	if (ret < `0`)
6946	goto out_unlock;
6947	ctx->logged_before = (ret == `1`);
6948	ret = `0`;
6949
6950	/*
6951	* This is for cases where logging a directory could result in losing a
6952	* a file after replaying the log. For example, if we move a file from a
6953	* directory A to a directory B, then fsync directory A, we have no way
6954	* to known the file was moved from A to B, so logging just A would
6955	* result in losing the file after a log replay.
6956	*/
6957	if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6958	ret = BTRFS_LOG_FORCE_COMMIT;
6959	goto out_unlock;
6960	}
6961
6962	/*
6963	* a brute force approach to making sure we get the most uptodate
6964	* copies of everything.
6965	*/
6966	if (S_ISDIR(inode->vfs_inode.i_mode)) {
6967	clear_bit(nr: BTRFS_INODE_COPY_EVERYTHING, addr: &inode->runtime_flags);
6968	if (ctx->logged_before)
6969	ret = drop_inode_items(trans, log, path, inode,
6970	BTRFS_XATTR_ITEM_KEY);
6971	} else {
6972	if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6973	/*
6974	* Make sure the new inode item we write to the log has
6975	* the same isize as the current one (if it exists).
6976	* This is necessary to prevent data loss after log
6977	* replay, and also to prevent doing a wrong expanding
6978	* truncate - for e.g. create file, write 4K into offset
6979	* 0, fsync, write 4K into offset 4096, add hard link,
6980	* fsync some other file (to sync log), power fail - if
6981	* we use the inode's current i_size, after log replay
6982	* we get a 8Kb file, with the last 4Kb extent as a hole
6983	* (zeroes), as if an expanding truncate happened,
6984	* instead of getting a file of 4Kb only.
6985	*/
6986	ret = logged_inode_size(log, inode, path, size_ret: &logged_isize);
6987	if (ret)
6988	goto out_unlock;
6989	}
6990	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6991	&inode->runtime_flags)) {
6992	if (inode_only == LOG_INODE_EXISTS) {
6993	max_key.type = BTRFS_XATTR_ITEM_KEY;
6994	if (ctx->logged_before)
6995	ret = drop_inode_items(trans, log, path,
6996	inode, max_key_type: max_key.type);
6997	} else {
6998	clear_bit(nr: BTRFS_INODE_NEEDS_FULL_SYNC,
6999	addr: &inode->runtime_flags);
7000	clear_bit(nr: BTRFS_INODE_COPY_EVERYTHING,
7001	addr: &inode->runtime_flags);
7002	if (ctx->logged_before)
7003	ret = truncate_inode_items(trans, log_root: log,
7004	inode, new_size: `0`, min_type: `0`);
7005	}
7006	} else if (test_and_clear_bit(nr: BTRFS_INODE_COPY_EVERYTHING,
7007	addr: &inode->runtime_flags) \|\|
7008	inode_only == LOG_INODE_EXISTS) {
7009	if (inode_only == LOG_INODE_ALL)
7010	fast_search = true;
7011	max_key.type = BTRFS_XATTR_ITEM_KEY;
7012	if (ctx->logged_before)
7013	ret = drop_inode_items(trans, log, path, inode,
7014	max_key_type: max_key.type);
7015	} else {
7016	if (inode_only == LOG_INODE_ALL)
7017	fast_search = true;
7018	inode_item_dropped = false;
7019	goto log_extents;
7020	}
7021
7022	}
7023	if (ret)
7024	goto out_unlock;
7025
7026	/*
7027	* If we are logging a directory in full mode, collect the delayed items
7028	* before iterating the subvolume tree, so that we don't miss any new
7029	* dir index items in case they get flushed while or right after we are
7030	* iterating the subvolume tree.
7031	*/
7032	if (full_dir_logging && !ctx->logging_new_delayed_dentries)
7033	btrfs_log_get_delayed_items(inode, ins_list: &delayed_ins_list,
7034	del_list: &delayed_del_list);
7035
7036	/*
7037	* If we are fsyncing a file with 0 hard links, then commit the delayed
7038	* inode because the last inode ref (or extref) item may still be in the
7039	* subvolume tree and if we log it the file will still exist after a log
7040	* replay. So commit the delayed inode to delete that last ref and we
7041	* skip logging it.
7042	*/
7043	if (inode->vfs_inode.i_nlink == `0`) {
7044	ret = btrfs_commit_inode_delayed_inode(inode);
7045	if (ret)
7046	goto out_unlock;
7047	}
7048
7049	ret = copy_inode_items_to_log(trans, inode, min_key: &min_key, max_key: &max_key,
7050	path, dst_path, logged_isize,
7051	inode_only, ctx,
7052	need_log_inode_item: &need_log_inode_item);
7053	if (ret)
7054	goto out_unlock;
7055
7056	btrfs_release_path(p: path);
7057	btrfs_release_path(p: dst_path);
7058	ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
7059	if (ret)
7060	goto out_unlock;
7061	xattrs_logged = true;
7062	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
7063	btrfs_release_path(p: path);
7064	btrfs_release_path(p: dst_path);
7065	ret = btrfs_log_holes(trans, inode, path);
7066	if (ret)
7067	goto out_unlock;
7068	}
7069	log_extents:
7070	btrfs_release_path(p: path);
7071	btrfs_release_path(p: dst_path);
7072	if (need_log_inode_item) {
7073	ret = log_inode_item(trans, log, path: dst_path, inode, inode_item_dropped);
7074	if (ret)
7075	goto out_unlock;
7076	/*
7077	* If we are doing a fast fsync and the inode was logged before
7078	* in this transaction, we don't need to log the xattrs because
7079	* they were logged before. If xattrs were added, changed or
7080	* deleted since the last time we logged the inode, then we have
7081	* already logged them because the inode had the runtime flag
7082	* BTRFS_INODE_COPY_EVERYTHING set.
7083	*/
7084	if (!xattrs_logged && inode->logged_trans < trans->transid) {
7085	ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
7086	if (ret)
7087	goto out_unlock;
7088	btrfs_release_path(p: path);
7089	}
7090	}
7091	if (fast_search) {
7092	ret = btrfs_log_changed_extents(trans, inode, path: dst_path, ctx);
7093	if (ret)
7094	goto out_unlock;
7095	} else if (inode_only == LOG_INODE_ALL) {
7096	struct extent_map em, n;
7097
7098	write_lock(&em_tree->lock);
7099	list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
7100	list_del_init(entry: &em->list);
7101	write_unlock(&em_tree->lock);
7102	}
7103
7104	if (full_dir_logging) {
7105	ret = log_directory_changes(trans, inode, path, dst_path, ctx);
7106	if (ret)
7107	goto out_unlock;
7108	ret = log_delayed_insertion_items(trans, inode, path,
7109	delayed_ins_list: &delayed_ins_list, ctx);
7110	if (ret)
7111	goto out_unlock;
7112	ret = log_delayed_deletion_items(trans, inode, path,
7113	delayed_del_list: &delayed_del_list, ctx);
7114	if (ret)
7115	goto out_unlock;
7116	}
7117
7118	spin_lock(lock: &inode->lock);
7119	inode->logged_trans = trans->transid;
7120	/*
7121	* Don't update last_log_commit if we logged that an inode exists.
7122	* We do this for three reasons:
7123	*
7124	* 1) We might have had buffered writes to this inode that were
7125	* flushed and had their ordered extents completed in this
7126	* transaction, but we did not previously log the inode with
7127	* LOG_INODE_ALL. Later the inode was evicted and after that
7128	* it was loaded again and this LOG_INODE_EXISTS log operation
7129	* happened. We must make sure that if an explicit fsync against
7130	* the inode is performed later, it logs the new extents, an
7131	* updated inode item, etc, and syncs the log. The same logic
7132	* applies to direct IO writes instead of buffered writes.
7133	*
7134	* 2) When we log the inode with LOG_INODE_EXISTS, its inode item
7135	* is logged with an i_size of 0 or whatever value was logged
7136	* before. If later the i_size of the inode is increased by a
7137	* truncate operation, the log is synced through an fsync of
7138	* some other inode and then finally an explicit fsync against
7139	* this inode is made, we must make sure this fsync logs the
7140	* inode with the new i_size, the hole between old i_size and
7141	* the new i_size, and syncs the log.
7142	*
7143	* 3) If we are logging that an ancestor inode exists as part of
7144	* logging a new name from a link or rename operation, don't update
7145	* its last_log_commit - otherwise if an explicit fsync is made
7146	* against an ancestor, the fsync considers the inode in the log
7147	* and doesn't sync the log, resulting in the ancestor missing after
7148	* a power failure unless the log was synced as part of an fsync
7149	* against any other unrelated inode.
7150	*/
7151	if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
7152	inode->last_log_commit = inode->last_sub_trans;
7153	spin_unlock(lock: &inode->lock);
7154
7155	/*
7156	* Reset the last_reflink_trans so that the next fsync does not need to
7157	* go through the slower path when logging extents and their checksums.
7158	*/
7159	if (inode_only == LOG_INODE_ALL)
7160	inode->last_reflink_trans = `0`;
7161
7162	out_unlock:
7163	mutex_unlock(lock: &inode->log_mutex);
7164	out:
7165	btrfs_free_path(p: path);
7166	btrfs_free_path(p: dst_path);
7167
7168	if (ret)
7169	free_conflicting_inodes(ctx);
7170	else
7171	ret = log_conflicting_inodes(trans, root: inode->root, ctx);
7172
7173	if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
7174	if (!ret)
7175	ret = log_new_delayed_dentries(trans, inode,
7176	delayed_ins_list: &delayed_ins_list, ctx);
7177
7178	btrfs_log_put_delayed_items(inode, ins_list: &delayed_ins_list,
7179	del_list: &delayed_del_list);
7180	}
7181
7182	return ret;
7183	}
7184
7185	static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
7186	struct btrfs_inode *inode,
7187	struct btrfs_log_ctx *ctx)
7188	{
7189	int ret;
7190	BTRFS_PATH_AUTO_FREE(path);
7191	struct btrfs_key key;
7192	struct btrfs_root *root = inode->root;
7193	const u64 ino = btrfs_ino(inode);
7194
7195	path = btrfs_alloc_path();
7196	if (!path)
7197	return -ENOMEM;
7198	path->skip_locking = true;
7199	path->search_commit_root = true;
7200
7201	key.objectid = ino;
7202	key.type = BTRFS_INODE_REF_KEY;
7203	key.offset = `0`;
7204	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
7205	if (ret < `0`)
7206	return ret;
7207
7208	while (true) {
7209	struct extent_buffer *leaf = path->nodes[`0`];
7210	int slot = path->slots[`0`];
7211	u32 cur_offset = `0`;
7212	u32 item_size;
7213	unsigned long ptr;
7214
7215	if (slot >= btrfs_header_nritems(eb: leaf)) {
7216	ret = btrfs_next_leaf(root, path);
7217	if (ret < `0`)
7218	return ret;
7219	if (ret > `0`)
7220	break;
7221	continue;
7222	}
7223
7224	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
7225	/ BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 /
7226	if (key.objectid != ino \|\| key.type > BTRFS_INODE_EXTREF_KEY)
7227	break;
7228
7229	item_size = btrfs_item_size(eb: leaf, slot);
7230	ptr = btrfs_item_ptr_offset(leaf, slot);
7231	while (cur_offset < item_size) {
7232	u64 dir_id;
7233	struct btrfs_inode *dir_inode;
7234
7235	if (key.type == BTRFS_INODE_EXTREF_KEY) {
7236	struct btrfs_inode_extref *extref;
7237
7238	extref = (struct btrfs_inode_extref *)
7239	(ptr + cur_offset);
7240	dir_id = btrfs_inode_extref_parent(eb: leaf, s: extref);
7241	cur_offset += sizeof(*extref);
7242	cur_offset += btrfs_inode_extref_name_len(eb: leaf,
7243	s: extref);
7244	} else {
7245	dir_id = key.offset;
7246	cur_offset = item_size;
7247	}
7248
7249	dir_inode = btrfs_iget_logging(objectid: dir_id, root);
7250	/*
7251	* If the parent inode was deleted, return an error to
7252	* fallback to a transaction commit. This is to prevent
7253	* getting an inode that was moved from one parent A to
7254	* a parent B, got its former parent A deleted and then
7255	* it got fsync'ed, from existing at both parents after
7256	* a log replay (and the old parent still existing).
7257	* Example:
7258	*
7259	* mkdir /mnt/A
7260	* mkdir /mnt/B
7261	* touch /mnt/B/bar
7262	* sync
7263	* mv /mnt/B/bar /mnt/A/bar
7264	* mv -T /mnt/A /mnt/B
7265	* fsync /mnt/B/bar
7266	* <power fail>
7267	*
7268	* If we ignore the old parent B which got deleted,
7269	* after a log replay we would have file bar linked
7270	* at both parents and the old parent B would still
7271	* exist.
7272	*/
7273	if (IS_ERR(ptr: dir_inode))
7274	return PTR_ERR(ptr: dir_inode);
7275
7276	if (!need_log_inode(trans, inode: dir_inode)) {
7277	btrfs_add_delayed_iput(inode: dir_inode);
7278	continue;
7279	}
7280
7281	ctx->log_new_dentries = false;
7282	ret = btrfs_log_inode(trans, inode: dir_inode, inode_only: LOG_INODE_ALL, ctx);
7283	if (!ret && ctx->log_new_dentries)
7284	ret = log_new_dir_dentries(trans, start_inode: dir_inode, ctx);
7285	btrfs_add_delayed_iput(inode: dir_inode);
7286	if (ret)
7287	return ret;
7288	}
7289	path->slots[`0`]++;
7290	}
7291	return `0`;
7292	}
7293
7294	static int log_new_ancestors(struct btrfs_trans_handle *trans,
7295	struct btrfs_root *root,
7296	struct btrfs_path *path,
7297	struct btrfs_log_ctx *ctx)
7298	{
7299	struct btrfs_key found_key;
7300
7301	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &found_key, nr: path->slots[`0`]);
7302
7303	while (true) {
7304	struct extent_buffer *leaf;
7305	int slot;
7306	struct btrfs_key search_key;
7307	struct btrfs_inode *inode;
7308	u64 ino;
7309	int ret = `0`;
7310
7311	btrfs_release_path(p: path);
7312
7313	ino = found_key.offset;
7314
7315	search_key.objectid = found_key.offset;
7316	search_key.type = BTRFS_INODE_ITEM_KEY;
7317	search_key.offset = `0`;
7318	inode = btrfs_iget_logging(objectid: ino, root);
7319	if (IS_ERR(ptr: inode))
7320	return PTR_ERR(ptr: inode);
7321
7322	if (inode->generation >= trans->transid &&
7323	need_log_inode(trans, inode))
7324	ret = btrfs_log_inode(trans, inode, inode_only: LOG_INODE_EXISTS, ctx);
7325	btrfs_add_delayed_iput(inode);
7326	if (ret)
7327	return ret;
7328
7329	if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
7330	break;
7331
7332	search_key.type = BTRFS_INODE_REF_KEY;
7333	ret = btrfs_search_slot(NULL, root, key: &search_key, p: path, ins_len: `0`, cow: `0`);
7334	if (ret < `0`)
7335	return ret;
7336
7337	leaf = path->nodes[`0`];
7338	slot = path->slots[`0`];
7339	if (slot >= btrfs_header_nritems(eb: leaf)) {
7340	ret = btrfs_next_leaf(root, path);
7341	if (ret < `0`)
7342	return ret;
7343	else if (ret > `0`)
7344	return -ENOENT;
7345	leaf = path->nodes[`0`];
7346	slot = path->slots[`0`];
7347	}
7348
7349	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: slot);
7350	if (found_key.objectid != search_key.objectid \|\|
7351	found_key.type != BTRFS_INODE_REF_KEY)
7352	return -ENOENT;
7353	}
7354	return `0`;
7355	}
7356
7357	static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
7358	struct btrfs_inode *inode,
7359	struct dentry *parent,
7360	struct btrfs_log_ctx *ctx)
7361	{
7362	struct btrfs_root *root = inode->root;
7363	struct dentry *old_parent = NULL;
7364	struct super_block *sb = inode->vfs_inode.i_sb;
7365	int ret = `0`;
7366
7367	while (true) {
7368	if (!parent \|\| d_really_is_negative(dentry: parent) \|\|
7369	sb != parent->d_sb)
7370	break;
7371
7372	inode = BTRFS_I(d_inode(parent));
7373	if (root != inode->root)
7374	break;
7375
7376	if (inode->generation >= trans->transid &&
7377	need_log_inode(trans, inode)) {
7378	ret = btrfs_log_inode(trans, inode,
7379	inode_only: LOG_INODE_EXISTS, ctx);
7380	if (ret)
7381	break;
7382	}
7383	if (IS_ROOT(parent))
7384	break;
7385
7386	parent = dget_parent(dentry: parent);
7387	dput(old_parent);
7388	old_parent = parent;
7389	}
7390	dput(old_parent);
7391
7392	return ret;
7393	}
7394
7395	static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
7396	struct btrfs_inode *inode,
7397	struct dentry *parent,
7398	struct btrfs_log_ctx *ctx)
7399	{
7400	struct btrfs_root *root = inode->root;
7401	const u64 ino = btrfs_ino(inode);
7402	BTRFS_PATH_AUTO_FREE(path);
7403	struct btrfs_key search_key;
7404	int ret;
7405
7406	/*
7407	* For a single hard link case, go through a fast path that does not
7408	* need to iterate the fs/subvolume tree.
7409	*/
7410	if (inode->vfs_inode.i_nlink < `2`)
7411	return log_new_ancestors_fast(trans, inode, parent, ctx);
7412
7413	path = btrfs_alloc_path();
7414	if (!path)
7415	return -ENOMEM;
7416
7417	search_key.objectid = ino;
7418	search_key.type = BTRFS_INODE_REF_KEY;
7419	search_key.offset = `0`;
7420	again:
7421	ret = btrfs_search_slot(NULL, root, key: &search_key, p: path, ins_len: `0`, cow: `0`);
7422	if (ret < `0`)
7423	return ret;
7424	if (ret == `0`)
7425	path->slots[`0`]++;
7426
7427	while (true) {
7428	struct extent_buffer *leaf = path->nodes[`0`];
7429	int slot = path->slots[`0`];
7430	struct btrfs_key found_key;
7431
7432	if (slot >= btrfs_header_nritems(eb: leaf)) {
7433	ret = btrfs_next_leaf(root, path);
7434	if (ret < `0`)
7435	return ret;
7436	if (ret > `0`)
7437	break;
7438	continue;
7439	}
7440
7441	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: slot);
7442	if (found_key.objectid != ino \|\|
7443	found_key.type > BTRFS_INODE_EXTREF_KEY)
7444	break;
7445
7446	/*
7447	* Don't deal with extended references because they are rare
7448	* cases and too complex to deal with (we would need to keep
7449	* track of which subitem we are processing for each item in
7450	* this loop, etc). So just return some error to fallback to
7451	* a transaction commit.
7452	*/
7453	if (found_key.type == BTRFS_INODE_EXTREF_KEY)
7454	return -EMLINK;
7455
7456	/*
7457	* Logging ancestors needs to do more searches on the fs/subvol
7458	* tree, so it releases the path as needed to avoid deadlocks.
7459	* Keep track of the last inode ref key and resume from that key
7460	* after logging all new ancestors for the current hard link.
7461	*/
7462	memcpy(&search_key, &found_key, sizeof(search_key));
7463
7464	ret = log_new_ancestors(trans, root, path, ctx);
7465	if (ret)
7466	return ret;
7467	btrfs_release_path(p: path);
7468	goto again;
7469	}
7470	return `0`;
7471	}
7472
7473	/*
7474	* helper function around btrfs_log_inode to make sure newly created
7475	* parent directories also end up in the log. A minimal inode and backref
7476	* only logging is done of any parent directories that are older than
7477	* the last committed transaction
7478	*/
7479	static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
7480	struct btrfs_inode *inode,
7481	struct dentry *parent,
7482	int inode_only,
7483	struct btrfs_log_ctx *ctx)
7484	{
7485	struct btrfs_root *root = inode->root;
7486	struct btrfs_fs_info *fs_info = root->fs_info;
7487	int ret = `0`;
7488	bool log_dentries;
7489
7490	if (btrfs_test_opt(fs_info, NOTREELOG))
7491	return BTRFS_LOG_FORCE_COMMIT;
7492
7493	if (btrfs_root_refs(s: &root->root_item) == `0`)
7494	return BTRFS_LOG_FORCE_COMMIT;
7495
7496	/*
7497	* If we're logging an inode from a subvolume created in the current
7498	* transaction we must force a commit since the root is not persisted.
7499	*/
7500	if (btrfs_root_generation(s: &root->root_item) == trans->transid)
7501	return BTRFS_LOG_FORCE_COMMIT;
7502
7503	/ Skip already logged inodes and without new extents. /
7504	if (btrfs_inode_in_log(inode, generation: trans->transid) &&
7505	list_empty(head: &ctx->ordered_extents))
7506	return BTRFS_NO_LOG_SYNC;
7507
7508	ret = start_log_trans(trans, root, ctx);
7509	if (ret)
7510	return ret;
7511
7512	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7513	if (ret)
7514	goto end_trans;
7515
7516	/*
7517	* for regular files, if its inode is already on disk, we don't
7518	* have to worry about the parents at all. This is because
7519	* we can use the last_unlink_trans field to record renames
7520	* and other fun in this file.
7521	*/
7522	if (S_ISREG(inode->vfs_inode.i_mode) &&
7523	inode->generation < trans->transid &&
7524	inode->last_unlink_trans < trans->transid) {
7525	ret = `0`;
7526	goto end_trans;
7527	}
7528
7529	/*
7530	* Track if we need to log dentries because ctx->log_new_dentries can
7531	* be modified in the call chains below.
7532	*/
7533	log_dentries = ctx->log_new_dentries;
7534
7535	/*
7536	* On unlink we must make sure all our current and old parent directory
7537	* inodes are fully logged. This is to prevent leaving dangling
7538	* directory index entries in directories that were our parents but are
7539	* not anymore. Not doing this results in old parent directory being
7540	* impossible to delete after log replay (rmdir will always fail with
7541	* error -ENOTEMPTY).
7542	*
7543	* Example 1:
7544	*
7545	* mkdir testdir
7546	* touch testdir/foo
7547	* ln testdir/foo testdir/bar
7548	* sync
7549	* unlink testdir/bar
7550	* xfs_io -c fsync testdir/foo
7551	* <power failure>
7552	* mount fs, triggers log replay
7553	*
7554	* If we don't log the parent directory (testdir), after log replay the
7555	* directory still has an entry pointing to the file inode using the bar
7556	* name, but a matching BTRFS_INODE_[REF\|EXTREF]_KEY does not exist and
7557	* the file inode has a link count of 1.
7558	*
7559	* Example 2:
7560	*
7561	* mkdir testdir
7562	* touch foo
7563	* ln foo testdir/foo2
7564	* ln foo testdir/foo3
7565	* sync
7566	* unlink testdir/foo3
7567	* xfs_io -c fsync foo
7568	* <power failure>
7569	* mount fs, triggers log replay
7570	*
7571	* Similar as the first example, after log replay the parent directory
7572	* testdir still has an entry pointing to the inode file with name foo3
7573	* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7574	* and has a link count of 2.
7575	*/
7576	if (inode->last_unlink_trans >= trans->transid) {
7577	ret = btrfs_log_all_parents(trans, inode, ctx);
7578	if (ret)
7579	goto end_trans;
7580	}
7581
7582	ret = log_all_new_ancestors(trans, inode, parent, ctx);
7583	if (ret)
7584	goto end_trans;
7585
7586	if (log_dentries)
7587	ret = log_new_dir_dentries(trans, start_inode: inode, ctx);
7588	end_trans:
7589	if (ret < `0`) {
7590	btrfs_set_log_full_commit(trans);
7591	ret = BTRFS_LOG_FORCE_COMMIT;
7592	}
7593
7594	if (ret)
7595	btrfs_remove_log_ctx(root, ctx);
7596	btrfs_end_log_trans(root);
7597
7598	return ret;
7599	}
7600
7601	/*
7602	* it is not safe to log dentry if the chunk root has added new
7603	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
7604	* If this returns 1, you must commit the transaction to safely get your
7605	* data on disk.
7606	*/
7607	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7608	struct dentry *dentry,
7609	struct btrfs_log_ctx *ctx)
7610	{
7611	struct dentry *parent = dget_parent(dentry);
7612	int ret;
7613
7614	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7615	inode_only: LOG_INODE_ALL, ctx);
7616	dput(parent);
7617
7618	return ret;
7619	}
7620
/*
 * Should be called during mount to recover and replay any log trees
 * from the filesystem.
 */
7625	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7626	{
7627	int ret;
7628	struct btrfs_path *path;
7629	struct btrfs_trans_handle *trans;
7630	struct btrfs_key key;
7631	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7632	struct walk_control wc = {
7633	.process_func = process_one_buffer,
7634	.stage = LOG_WALK_PIN_ONLY,
7635	};
7636
7637	path = btrfs_alloc_path();
7638	if (!path)
7639	return -ENOMEM;
7640
7641	set_bit(nr: BTRFS_FS_LOG_RECOVERING, addr: &fs_info->flags);
7642
7643	trans = btrfs_start_transaction(root: fs_info->tree_root, num_items: `0`);
7644	if (IS_ERR(ptr: trans)) {
7645	ret = PTR_ERR(ptr: trans);
7646	goto error;
7647	}
7648
7649	wc.trans = trans;
7650	wc.pin = true;
7651	wc.log = log_root_tree;
7652
7653	ret = walk_log_tree(wc: &wc);
7654	wc.log = NULL;
7655	if (unlikely(ret)) {
7656	btrfs_abort_transaction(trans, ret);
7657	goto error;
7658	}
7659
7660	again:
7661	key.objectid = BTRFS_TREE_LOG_OBJECTID;
7662	key.type = BTRFS_ROOT_ITEM_KEY;
7663	key.offset = (u64)-`1`;
7664
7665	while (`1`) {
7666	struct btrfs_key found_key;
7667
7668	ret = btrfs_search_slot(NULL, root: log_root_tree, key: &key, p: path, ins_len: `0`, cow: `0`);
7669
7670	if (unlikely(ret < `0`)) {
7671	btrfs_abort_transaction(trans, ret);
7672	goto error;
7673	}
7674	if (ret > `0`) {
7675	if (path->slots[`0`] == `0`)
7676	break;
7677	path->slots[`0`]--;
7678	}
7679	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &found_key,
7680	nr: path->slots[`0`]);
7681	btrfs_release_path(p: path);
7682	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7683	break;
7684
7685	wc.log = btrfs_read_tree_root(tree_root: log_root_tree, key: &found_key);
7686	if (IS_ERR(ptr: wc.log)) {
7687	ret = PTR_ERR(ptr: wc.log);
7688	wc.log = NULL;
7689	btrfs_abort_transaction(trans, ret);
7690	goto error;
7691	}
7692
7693	wc.root = btrfs_get_fs_root(fs_info, objectid: found_key.offset, check_ref: true);
7694	if (IS_ERR(ptr: wc.root)) {
7695	ret = PTR_ERR(ptr: wc.root);
7696	wc.root = NULL;
7697	if (unlikely(ret != -ENOENT)) {
7698	btrfs_abort_transaction(trans, ret);
7699	goto error;
7700	}
7701
7702	/*
7703	* We didn't find the subvol, likely because it was
7704	* deleted. This is ok, simply skip this log and go to
7705	* the next one.
7706	*
7707	* We need to exclude the root because we can't have
7708	* other log replays overwriting this log as we'll read
7709	* it back in a few more times. This will keep our
7710	* block from being modified, and we'll just bail for
7711	* each subsequent pass.
7712	*/
7713	ret = btrfs_pin_extent_for_log_replay(trans, eb: wc.log->node);
7714	if (unlikely(ret)) {
7715	btrfs_abort_transaction(trans, ret);
7716	goto error;
7717	}
7718	goto next;
7719	}
7720
7721	wc.root->log_root = wc.log;
7722	ret = btrfs_record_root_in_trans(trans, root: wc.root);
7723	if (unlikely(ret)) {
7724	btrfs_abort_transaction(trans, ret);
7725	goto next;
7726	}
7727
7728	ret = walk_log_tree(wc: &wc);
7729	if (unlikely(ret)) {
7730	btrfs_abort_transaction(trans, ret);
7731	goto next;
7732	}
7733
7734	if (wc.stage == LOG_WALK_REPLAY_ALL) {
7735	struct btrfs_root *root = wc.root;
7736
7737	wc.subvol_path = path;
7738	ret = fixup_inode_link_counts(wc: &wc);
7739	wc.subvol_path = NULL;
7740	if (unlikely(ret)) {
7741	btrfs_abort_transaction(trans, ret);
7742	goto next;
7743	}
7744	/*
7745	* We have just replayed everything, and the highest
7746	* objectid of fs roots probably has changed in case
7747	* some inode_item's got replayed.
7748	*
7749	* root->objectid_mutex is not acquired as log replay
7750	* could only happen during mount.
7751	*/
7752	ret = btrfs_init_root_free_objectid(root);
7753	if (unlikely(ret)) {
7754	btrfs_abort_transaction(trans, ret);
7755	goto next;
7756	}
7757	}
7758	next:
7759	if (wc.root) {
7760	wc.root->log_root = NULL;
7761	btrfs_put_root(root: wc.root);
7762	}
7763	btrfs_put_root(root: wc.log);
7764	wc.log = NULL;
7765
7766	if (ret)
7767	goto error;
7768	if (found_key.offset == `0`)
7769	break;
7770	key.offset = found_key.offset - `1`;
7771	}
7772	btrfs_release_path(p: path);
7773
7774	/ step one is to pin it all, step two is to replay just inodes /
7775	if (wc.pin) {
7776	wc.pin = false;
7777	wc.process_func = replay_one_buffer;
7778	wc.stage = LOG_WALK_REPLAY_INODES;
7779	goto again;
7780	}
7781	/ step three is to replay everything /
7782	if (wc.stage < LOG_WALK_REPLAY_ALL) {
7783	wc.stage++;
7784	goto again;
7785	}
7786
7787	btrfs_free_path(p: path);
7788
7789	/ step 4: commit the transaction, which also unpins the blocks /
7790	ret = btrfs_commit_transaction(trans);
7791	if (ret)
7792	return ret;
7793
7794	clear_bit(nr: BTRFS_FS_LOG_RECOVERING, addr: &fs_info->flags);
7795
7796	return `0`;
7797	error:
7798	if (wc.trans)
7799	btrfs_end_transaction(trans: wc.trans);
7800	btrfs_put_root(root: wc.log);
7801	clear_bit(nr: BTRFS_FS_LOG_RECOVERING, addr: &fs_info->flags);
7802	btrfs_free_path(p: path);
7803	return ret;
7804	}
7805
7806	/*
7807	* there are some corner cases where we want to force a full
7808	* commit instead of allowing a directory to be logged.
7809	*
7810	* They revolve around files there were unlinked from the directory, and
7811	* this function updates the parent directory so that a full commit is
7812	* properly done if it is fsync'd later after the unlinks are done.
7813	*
7814	* Must be called before the unlink operations (updates to the subvolume tree,
7815	* inodes, etc) are done.
7816	*/
7817	void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7818	struct btrfs_inode dir, struct* btrfs_inode *inode,
7819	bool for_rename)
7820	{
7821	/*
7822	* when we're logging a file, if it hasn't been renamed
7823	* or unlinked, and its inode is fully committed on disk,
7824	* we don't have to worry about walking up the directory chain
7825	* to log its parents.
7826	*
7827	* So, we use the last_unlink_trans field to put this transid
7828	* into the file. When the file is logged we check it and
7829	* don't log the parents if the file is fully on disk.
7830	*/
7831	mutex_lock(&inode->log_mutex);
7832	inode->last_unlink_trans = trans->transid;
7833	mutex_unlock(lock: &inode->log_mutex);
7834
7835	if (!for_rename)
7836	return;
7837
7838	/*
7839	* If this directory was already logged, any new names will be logged
7840	* with btrfs_log_new_name() and old names will be deleted from the log
7841	* tree with btrfs_del_dir_entries_in_log() or with
7842	* btrfs_del_inode_ref_in_log().
7843	*/
7844	if (inode_logged(trans, inode: dir, NULL) == `1`)
7845	return;
7846
7847	/*
7848	* If the inode we're about to unlink was logged before, the log will be
7849	* properly updated with the new name with btrfs_log_new_name() and the
7850	* old name removed with btrfs_del_dir_entries_in_log() or with
7851	* btrfs_del_inode_ref_in_log().
7852	*/
7853	if (inode_logged(trans, inode, NULL) == `1`)
7854	return;
7855
7856	/*
7857	* when renaming files across directories, if the directory
7858	* there we're unlinking from gets fsync'd later on, there's
7859	* no way to find the destination directory later and fsync it
7860	* properly. So, we have to be conservative and force commits
7861	* so the new name gets discovered.
7862	*/
7863	mutex_lock(&dir->log_mutex);
7864	dir->last_unlink_trans = trans->transid;
7865	mutex_unlock(lock: &dir->log_mutex);
7866	}
7867
7868	/*
7869	* Make sure that if someone attempts to fsync the parent directory of a deleted
7870	* snapshot, it ends up triggering a transaction commit. This is to guarantee
7871	* that after replaying the log tree of the parent directory's root we will not
7872	* see the snapshot anymore and at log replay time we will not see any log tree
7873	* corresponding to the deleted snapshot's root, which could lead to replaying
7874	* it after replaying the log tree of the parent directory (which would replay
7875	* the snapshot delete operation).
7876	*
7877	* Must be called before the actual snapshot destroy operation (updates to the
7878	* parent root and tree of tree roots trees, etc) are done.
7879	*/
7880	void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7881	struct btrfs_inode *dir)
7882	{
7883	mutex_lock(&dir->log_mutex);
7884	dir->last_unlink_trans = trans->transid;
7885	mutex_unlock(lock: &dir->log_mutex);
7886	}
7887
7888	/*
7889	* Call this when creating a subvolume in a directory.
7890	* Because we don't commit a transaction when creating a subvolume, we can't
7891	* allow the directory pointing to the subvolume to be logged with an entry that
7892	* points to an unpersisted root if we are still in the transaction used to
7893	* create the subvolume, so make any attempt to log the directory to result in a
7894	* full log sync.
7895	* Also we don't need to worry with renames, since btrfs_rename() marks the log
7896	* for full commit when renaming a subvolume.
7897	*
7898	* Must be called before creating the subvolume entry in its parent directory.
7899	*/
7900	void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
7901	struct btrfs_inode *dir)
7902	{
7903	mutex_lock(&dir->log_mutex);
7904	dir->last_unlink_trans = trans->transid;
7905	mutex_unlock(lock: &dir->log_mutex);
7906	}
7907
7908	/*
7909	* Update the log after adding a new name for an inode.
7910	*
7911	* @trans: Transaction handle.
7912	* @old_dentry: The dentry associated with the old name and the old
7913	* parent directory.
7914	* @old_dir: The inode of the previous parent directory for the case
7915	* of a rename. For a link operation, it must be NULL.
7916	* @old_dir_index: The index number associated with the old name, meaningful
7917	* only for rename operations (when @old_dir is not NULL).
7918	* Ignored for link operations.
7919	* @parent: The dentry associated with the directory under which the
7920	* new name is located.
7921	*
7922	* Call this after adding a new name for an inode, as a result of a link or
7923	* rename operation, and it will properly update the log to reflect the new name.
7924	*/
7925	void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7926	struct dentry old_dentry, struct* btrfs_inode *old_dir,
7927	u64 old_dir_index, struct dentry *parent)
7928	{
7929	struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7930	struct btrfs_root *root = inode->root;
7931	struct btrfs_log_ctx ctx;
7932	bool log_pinned = false;
7933	int ret;
7934
7935	/ The inode has a new name (ref/extref), so make sure we log it. /
7936	set_bit(nr: BTRFS_INODE_COPY_EVERYTHING, addr: &inode->runtime_flags);
7937
7938	btrfs_init_log_ctx(ctx: &ctx, inode);
7939	ctx.logging_new_name = true;
7940
7941	/*
7942	* this will force the logging code to walk the dentry chain
7943	* up for the file
7944	*/
7945	if (!S_ISDIR(inode->vfs_inode.i_mode))
7946	inode->last_unlink_trans = trans->transid;
7947
7948	/*
7949	* if this inode hasn't been logged and directory we're renaming it
7950	* from hasn't been logged, we don't need to log it
7951	*/
7952	ret = inode_logged(trans, inode, NULL);
7953	if (ret < `0`) {
7954	goto out;
7955	} else if (ret == `0`) {
7956	if (!old_dir)
7957	return;
7958	/*
7959	* If the inode was not logged and we are doing a rename (old_dir is not
7960	* NULL), check if old_dir was logged - if it was not we can return and
7961	* do nothing.
7962	*/
7963	ret = inode_logged(trans, inode: old_dir, NULL);
7964	if (ret < `0`)
7965	goto out;
7966	else if (ret == `0`)
7967	return;
7968	}
7969	ret = `0`;
7970
7971	/*
7972	* Now that we know we need to update the log, allocate the scratch eb
7973	* for the context before joining a log transaction below, as this can
7974	* take time and therefore we could delay log commits from other tasks.
7975	*/
7976	btrfs_init_log_ctx_scratch_eb(ctx: &ctx);
7977
7978	/*
7979	* If we are doing a rename (old_dir is not NULL) from a directory that
7980	* was previously logged, make sure that on log replay we get the old
7981	* dir entry deleted. This is needed because we will also log the new
7982	* name of the renamed inode, so we need to make sure that after log
7983	* replay we don't end up with both the new and old dir entries existing.
7984	*/
7985	if (old_dir && old_dir->logged_trans == trans->transid) {
7986	struct btrfs_root *log = old_dir->root->log_root;
7987	struct btrfs_path *path;
7988	struct fscrypt_name fname;
7989
7990	ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX,
7991	"old_dir_index=%llu", old_dir_index);
7992
7993	ret = fscrypt_setup_filename(inode: &old_dir->vfs_inode,
7994	iname: &old_dentry->d_name, lookup: `0`, fname: &fname);
7995	if (ret)
7996	goto out;
7997
7998	path = btrfs_alloc_path();
7999	if (!path) {
8000	ret = -ENOMEM;
8001	fscrypt_free_filename(fname: &fname);
8002	goto out;
8003	}
8004
8005	/*
8006	* We have two inodes to update in the log, the old directory and
8007	* the inode that got renamed, so we must pin the log to prevent
8008	* anyone from syncing the log until we have updated both inodes
8009	* in the log.
8010	*/
8011	ret = join_running_log_trans(root);
8012	/*
8013	* At least one of the inodes was logged before, so this should
8014	* not fail, but if it does, it's not serious, just bail out and
8015	* mark the log for a full commit.
8016	*/
8017	if (WARN_ON_ONCE(ret < `0`)) {
8018	btrfs_free_path(p: path);
8019	fscrypt_free_filename(fname: &fname);
8020	goto out;
8021	}
8022
8023	log_pinned = true;
8024
8025	/*
8026	* Other concurrent task might be logging the old directory,
8027	* as it can be triggered when logging other inode that had or
8028	* still has a dentry in the old directory. We lock the old
8029	* directory's log_mutex to ensure the deletion of the old
8030	* name is persisted, because during directory logging we
8031	* delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
8032	* the old name's dir index item is in the delayed items, so
8033	* it could be missed by an in progress directory logging.
8034	*/
8035	mutex_lock(&old_dir->log_mutex);
8036	ret = del_logged_dentry(trans, log, path, dir_ino: btrfs_ino(inode: old_dir),
8037	name: &fname.disk_name, index: old_dir_index);
8038	if (ret > `0`) {
8039	/*
8040	* The dentry does not exist in the log, so record its
8041	* deletion.
8042	*/
8043	btrfs_release_path(p: path);
8044	ret = insert_dir_log_key(trans, log, path,
8045	dirid: btrfs_ino(inode: old_dir),
8046	first_offset: old_dir_index, last_offset: old_dir_index);
8047	}
8048	mutex_unlock(lock: &old_dir->log_mutex);
8049
8050	btrfs_free_path(p: path);
8051	fscrypt_free_filename(fname: &fname);
8052	if (ret < `0`)
8053	goto out;
8054	}
8055
8056	/*
8057	* We don't care about the return value. If we fail to log the new name
8058	* then we know the next attempt to sync the log will fallback to a full
8059	* transaction commit (due to a call to btrfs_set_log_full_commit()), so
8060	* we don't need to worry about getting a log committed that has an
8061	* inconsistent state after a rename operation.
8062	*/
8063	btrfs_log_inode_parent(trans, inode, parent, inode_only: LOG_INODE_EXISTS, ctx: &ctx);
8064	ASSERT(list_empty(&ctx.conflict_inodes));
8065	out:
8066	/*
8067	* If an error happened mark the log for a full commit because it's not
8068	* consistent and up to date or we couldn't find out if one of the
8069	* inodes was logged before in this transaction. Do it before unpinning
8070	* the log, to avoid any races with someone else trying to commit it.
8071	*/
8072	if (ret < `0`)
8073	btrfs_set_log_full_commit(trans);
8074	if (log_pinned)
8075	btrfs_end_log_trans(root);
8076	free_extent_buffer(eb: ctx.scratch_eb);
8077	}
8078
8079

source code of linux/fs/btrfs/tree-log.c