blk-zoned.c source code [linux/block/blk-zoned.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Zoned block device handling
4	*
5	* Copyright (c) 2015, Hannes Reinecke
6	* Copyright (c) 2015, SUSE Linux GmbH
7	*
8	* Copyright (c) 2016, Damien Le Moal
9	* Copyright (c) 2016, Western Digital
10	* Copyright (c) 2024, Western Digital Corporation or its affiliates.
11	*/
12
13	#include <linux/kernel.h>
14	#include <linux/blkdev.h>
15	#include <linux/blk-mq.h>
16	#include <linux/spinlock.h>
17	#include <linux/refcount.h>
18	#include <linux/mempool.h>
19
20	#include <trace/events/block.h>
21
22	#include "blk.h"
23	#include "blk-mq-sched.h"
24	#include "blk-mq-debugfs.h"
25
26	#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
27	static const char *const zone_cond_name[] = {
28	ZONE_COND_NAME(NOT_WP),
29	ZONE_COND_NAME(EMPTY),
30	ZONE_COND_NAME(IMP_OPEN),
31	ZONE_COND_NAME(EXP_OPEN),
32	ZONE_COND_NAME(CLOSED),
33	ZONE_COND_NAME(READONLY),
34	ZONE_COND_NAME(FULL),
35	ZONE_COND_NAME(OFFLINE),
36	ZONE_COND_NAME(ACTIVE),
37	};
38	#undef ZONE_COND_NAME
39
40	/*
41	* Per-zone write plug.
42	* @node: hlist_node structure for managing the plug using a hash table.
43	* @bio_list: The list of BIOs that are currently plugged.
44	* @bio_work: Work struct to handle issuing of plugged BIOs
45	* @rcu_head: RCU head to free zone write plugs with an RCU grace period.
46	* @disk: The gendisk the plug belongs to.
47	* @lock: Spinlock to atomically manipulate the plug.
48	* @ref: Zone write plug reference counter. A zone write plug reference is
49	* always at least 1 when the plug is hashed in the disk plug hash table.
50	* The reference is incremented whenever a new BIO needing plugging is
51	* submitted and when a function needs to manipulate a plug. The
52	* reference count is decremented whenever a plugged BIO completes and
53	* when a function that referenced the plug returns. The initial
54	* reference is dropped whenever the zone of the zone write plug is reset,
55	* finished and when the zone becomes full (last write BIO to the zone
56	* completes).
57	* @flags: Flags indicating the plug state.
58	* @zone_no: The number of the zone the plug is managing.
59	* @wp_offset: The zone write pointer location relative to the start of the zone
60	* as a number of 512B sectors.
61	* @cond: Condition of the zone
62	*/
63	struct blk_zone_wplug {
64	struct hlist_node node;
65	struct bio_list bio_list;
66	struct work_struct bio_work;
67	struct rcu_head rcu_head;
68	struct gendisk *disk;
69	spinlock_t lock;
70	refcount_t ref;
71	unsigned int flags;
72	unsigned int zone_no;
73	unsigned int wp_offset;
74	enum blk_zone_cond cond;
75	};
76
77	static inline bool disk_need_zone_resources(struct gendisk *disk)
78	{
79	/*
80	* All request-based zoned devices need zone resources so that the
81	* block layer can automatically handle write BIO plugging. BIO-based
82	* device drivers (e.g. DM devices) are normally responsible for
83	* handling zone write ordering and do not need zone resources, unless
84	* the driver requires zone append emulation.
85	*/
86	return queue_is_mq(q: disk->queue) \|\|
87	queue_emulates_zone_append(q: disk->queue);
88	}
89
90	static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
91	{
92	return `1U` << disk->zone_wplugs_hash_bits;
93	}
94
/*
 * Zone write plug flags bits.
 */

/*
 * The zone write plug is plugged, that is, write BIOs are being throttled
 * because a write BIO is already being executed or because the zone write
 * plug BIO list is not empty.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)

/*
 * We lost track of the zone write pointer offset and need to update it.
 */
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)

/*
 * The zone write plug was removed from the disk hash table and the initial
 * reference to the zone write plug, set when the plug was first added to
 * the hash table, has been dropped. This flag is set when a zone is reset,
 * finished or becomes full, to prevent new references to the zone write
 * plug from being taken for newly incoming BIOs. A zone write plug flagged
 * with this flag will be freed once all remaining references from BIOs or
 * functions are dropped.
 */
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)
113
114	/**
115	* blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
116	* @zone_cond: BLK_ZONE_COND_XXX.
117	*
118	* Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
119	* into string format. Useful in the debugging and tracing zone conditions. For
120	* invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN".
121	*/
122	const char blk_zone_cond_str(enum* blk_zone_cond zone_cond)
123	{
124	static const char *zone_cond_str = "UNKNOWN";
125
126	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
127	zone_cond_str = zone_cond_name[zone_cond];
128
129	return zone_cond_str;
130	}
131	EXPORT_SYMBOL_GPL(blk_zone_cond_str);
132
133	static void blk_zone_set_cond(u8 zones_cond, unsigned* int zno,
134	enum blk_zone_cond cond)
135	{
136	if (!zones_cond)
137	return;
138
139	switch (cond) {
140	case BLK_ZONE_COND_IMP_OPEN:
141	case BLK_ZONE_COND_EXP_OPEN:
142	case BLK_ZONE_COND_CLOSED:
143	zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
144	return;
145	case BLK_ZONE_COND_NOT_WP:
146	case BLK_ZONE_COND_EMPTY:
147	case BLK_ZONE_COND_FULL:
148	case BLK_ZONE_COND_OFFLINE:
149	case BLK_ZONE_COND_READONLY:
150	default:
151	zones_cond[zno] = cond;
152	return;
153	}
154	}
155
156	static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
157	enum blk_zone_cond cond)
158	{
159	u8 *zones_cond;
160
161	rcu_read_lock();
162	zones_cond = rcu_dereference(disk->zones_cond);
163	if (zones_cond) {
164	unsigned int zno = disk_zone_no(disk, sector);
165
166	/*
167	* The condition of a conventional, readonly and offline zones
168	* never changes, so do nothing if the target zone is in one of
169	* these conditions.
170	*/
171	switch (zones_cond[zno]) {
172	case BLK_ZONE_COND_NOT_WP:
173	case BLK_ZONE_COND_READONLY:
174	case BLK_ZONE_COND_OFFLINE:
175	break;
176	default:
177	blk_zone_set_cond(zones_cond, zno, cond);
178	break;
179	}
180	}
181	rcu_read_unlock();
182	}
183
184	/**
185	* bdev_zone_is_seq - check if a sector belongs to a sequential write zone
186	* @bdev: block device to check
187	* @sector: sector number
188	*
189	* Check if @sector on @bdev is contained in a sequential write required zone.
190	*/
191	bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
192	{
193	struct gendisk *disk = bdev->bd_disk;
194	unsigned int zno = disk_zone_no(disk, sector);
195	bool is_seq = false;
196	u8 *zones_cond;
197
198	if (!bdev_is_zoned(bdev))
199	return false;
200
201	rcu_read_lock();
202	zones_cond = rcu_dereference(disk->zones_cond);
203	if (zones_cond && zno < disk->nr_zones)
204	is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
205	rcu_read_unlock();
206
207	return is_seq;
208	}
209	EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
210
211	/*
212	* Zone report arguments for block device drivers report_zones operation.
213	* @cb: report_zones_cb callback for each reported zone.
214	* @data: Private data passed to report_zones_cb.
215	*/
216	struct blk_report_zones_args {
217	report_zones_cb cb;
218	void *data;
219	bool report_active;
220	};
221
222	static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
223	unsigned int nr_zones,
224	struct blk_report_zones_args *args)
225	{
226	struct gendisk *disk = bdev->bd_disk;
227
228	if (!bdev_is_zoned(bdev) \|\| WARN_ON_ONCE(!disk->fops->report_zones))
229	return -EOPNOTSUPP;
230
231	if (!nr_zones \|\| sector >= get_capacity(disk))
232	return `0`;
233
234	return disk->fops->report_zones(disk, sector, nr_zones, args);
235	}
236
237	/**
238	* blkdev_report_zones - Get zones information
239	* @bdev: Target block device
240	* @sector: Sector from which to report zones
241	* @nr_zones: Maximum number of zones to report
242	* @cb: Callback function called for each reported zone
243	* @data: Private data for the callback
244	*
245	* Description:
246	* Get zone information starting from the zone containing @sector for at most
247	* @nr_zones, and call @cb for each zone reported by the device.
248	* To report all zones in a device starting from @sector, the BLK_ALL_ZONES
249	* constant can be passed to @nr_zones.
250	* Returns the number of zones reported by the device, or a negative errno
251	* value in case of failure.
252	*
253	* Note: The caller must use memalloc_noXX_save/restore() calls to control
254	* memory allocations done within this function.
255	*/
256	int blkdev_report_zones(struct block_device *bdev, sector_t sector,
257	unsigned int nr_zones, report_zones_cb cb, void *data)
258	{
259	struct blk_report_zones_args args = {
260	.cb = cb,
261	.data = data,
262	};
263
264	return blkdev_do_report_zones(bdev, sector, nr_zones, args: &args);
265	}
266	EXPORT_SYMBOL_GPL(blkdev_report_zones);
267
268	static int blkdev_zone_reset_all(struct block_device *bdev)
269	{
270	struct bio bio;
271
272	bio_init(bio: &bio, bdev, NULL, max_vecs: `0`, opf: REQ_OP_ZONE_RESET_ALL \| REQ_SYNC);
273	trace_blkdev_zone_mgmt(bio: &bio, nr_sectors: `0`);
274	return submit_bio_wait(bio: &bio);
275	}
276
277	/**
278	* blkdev_zone_mgmt - Execute a zone management operation on a range of zones
279	* @bdev: Target block device
280	* @op: Operation to be performed on the zones
281	* @sector: Start sector of the first zone to operate on
282	* @nr_sectors: Number of sectors, should be at least the length of one zone and
283	* must be zone size aligned.
284	*
285	* Description:
286	* Perform the specified operation on the range of zones specified by
287	* @sector..@sector+@nr_sectors. Specifying the entire disk sector range
288	* is valid, but the specified range should not contain conventional zones.
289	* The operation to execute on each zone can be a zone reset, open, close
290	* or finish request.
291	*/
292	int blkdev_zone_mgmt(struct block_device bdev, enum* req_op op,
293	sector_t sector, sector_t nr_sectors)
294	{
295	sector_t zone_sectors = bdev_zone_sectors(bdev);
296	sector_t capacity = bdev_nr_sectors(bdev);
297	sector_t end_sector = sector + nr_sectors;
298	struct bio *bio = NULL;
299	int ret = `0`;
300
301	if (!bdev_is_zoned(bdev))
302	return -EOPNOTSUPP;
303
304	if (bdev_read_only(bdev))
305	return -EPERM;
306
307	if (!op_is_zone_mgmt(op))
308	return -EOPNOTSUPP;
309
310	if (end_sector <= sector \|\| end_sector > capacity)
311	/ Out of range /
312	return -EINVAL;
313
314	/ Check alignment (handle eventual smaller last zone) /
315	if (!bdev_is_zone_start(bdev, sector))
316	return -EINVAL;
317
318	if (!bdev_is_zone_start(bdev, sector: nr_sectors) && end_sector != capacity)
319	return -EINVAL;
320
321	/*
322	* In the case of a zone reset operation over all zones, use
323	* REQ_OP_ZONE_RESET_ALL.
324	*/
325	if (op == REQ_OP_ZONE_RESET && sector == `0` && nr_sectors == capacity)
326	return blkdev_zone_reset_all(bdev);
327
328	while (sector < end_sector) {
329	bio = blk_next_bio(bio, bdev, nr_pages: `0`, opf: op \| REQ_SYNC, GFP_KERNEL);
330	bio->bi_iter.bi_sector = sector;
331	sector += zone_sectors;
332
333	/ This may take a while, so be nice to others /
334	cond_resched();
335	}
336
337	trace_blkdev_zone_mgmt(bio, nr_sectors);
338	ret = submit_bio_wait(bio);
339	bio_put(bio);
340
341	return ret;
342	}
343	EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
344
345	struct zone_report_args {
346	struct blk_zone __user *zones;
347	};
348
349	static int blkdev_copy_zone_to_user(struct blk_zone zone, unsigned* int idx,
350	void *data)
351	{
352	struct zone_report_args *args = data;
353
354	if (copy_to_user(to: &args->zones[idx], from: zone, n: sizeof(struct blk_zone)))
355	return -EFAULT;
356	return `0`;
357	}
358
359	/*
360	* Mask of valid input flags for BLKREPORTZONEV2 ioctl.
361	*/
362	#define BLK_ZONE_REPV2_INPUT_FLAGS BLK_ZONE_REP_CACHED
363
364	/*
365	* BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
366	* Called from blkdev_ioctl.
367	*/
368	int blkdev_report_zones_ioctl(struct block_device bdev, unsigned* int cmd,
369	unsigned long arg)
370	{
371	void __user argp = (void* __user *)arg;
372	struct zone_report_args args;
373	struct blk_zone_report rep;
374	int ret;
375
376	if (!argp)
377	return -EINVAL;
378
379	if (!bdev_is_zoned(bdev))
380	return -ENOTTY;
381
382	if (copy_from_user(to: &rep, from: argp, n: sizeof(struct blk_zone_report)))
383	return -EFAULT;
384
385	if (!rep.nr_zones)
386	return -EINVAL;
387
388	args.zones = argp + sizeof(struct blk_zone_report);
389
390	switch (cmd) {
391	case BLKREPORTZONE:
392	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
393	blkdev_copy_zone_to_user, &args);
394	break;
395	case BLKREPORTZONEV2:
396	if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
397	return -EINVAL;
398	ret = blkdev_report_zones_cached(bdev, sector: rep.sector, nr_zones: rep.nr_zones,
399	cb: blkdev_copy_zone_to_user, data: &args);
400	break;
401	default:
402	return -EINVAL;
403	}
404
405	if (ret < `0`)
406	return ret;
407
408	rep.nr_zones = ret;
409	rep.flags = BLK_ZONE_REP_CAPACITY;
410	if (copy_to_user(to: argp, from: &rep, n: sizeof(struct blk_zone_report)))
411	return -EFAULT;
412	return `0`;
413	}
414
415	static int blkdev_truncate_zone_range(struct block_device *bdev,
416	blk_mode_t mode, const struct blk_zone_range *zrange)
417	{
418	loff_t start, end;
419
420	if (zrange->sector + zrange->nr_sectors <= zrange->sector \|\|
421	zrange->sector + zrange->nr_sectors > get_capacity(disk: bdev->bd_disk))
422	/ Out of range /
423	return -EINVAL;
424
425	start = zrange->sector << SECTOR_SHIFT;
426	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - `1`;
427
428	return truncate_bdev_range(bdev, mode, lstart: start, lend: end);
429	}
430
431	/*
432	* BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
433	* Called from blkdev_ioctl.
434	*/
435	int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
436	unsigned int cmd, unsigned long arg)
437	{
438	void __user argp = (void* __user *)arg;
439	struct blk_zone_range zrange;
440	enum req_op op;
441	int ret;
442
443	if (!argp)
444	return -EINVAL;
445
446	if (!bdev_is_zoned(bdev))
447	return -ENOTTY;
448
449	if (!(mode & BLK_OPEN_WRITE))
450	return -EBADF;
451
452	if (copy_from_user(to: &zrange, from: argp, n: sizeof(struct blk_zone_range)))
453	return -EFAULT;
454
455	switch (cmd) {
456	case BLKRESETZONE:
457	op = REQ_OP_ZONE_RESET;
458
459	/ Invalidate the page cache, including dirty pages. /
460	inode_lock(inode: bdev->bd_mapping->host);
461	filemap_invalidate_lock(mapping: bdev->bd_mapping);
462	ret = blkdev_truncate_zone_range(bdev, mode, zrange: &zrange);
463	if (ret)
464	goto fail;
465	break;
466	case BLKOPENZONE:
467	op = REQ_OP_ZONE_OPEN;
468	break;
469	case BLKCLOSEZONE:
470	op = REQ_OP_ZONE_CLOSE;
471	break;
472	case BLKFINISHZONE:
473	op = REQ_OP_ZONE_FINISH;
474	break;
475	default:
476	return -ENOTTY;
477	}
478
479	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
480
481	fail:
482	if (cmd == BLKRESETZONE) {
483	filemap_invalidate_unlock(mapping: bdev->bd_mapping);
484	inode_unlock(inode: bdev->bd_mapping->host);
485	}
486
487	return ret;
488	}
489
490	static bool disk_zone_is_last(struct gendisk disk, struct* blk_zone *zone)
491	{
492	return zone->start + zone->len >= get_capacity(disk);
493	}
494
495	static bool disk_zone_is_full(struct gendisk *disk,
496	unsigned int zno, unsigned int offset_in_zone)
497	{
498	if (zno < disk->nr_zones - `1`)
499	return offset_in_zone >= disk->zone_capacity;
500	return offset_in_zone >= disk->last_zone_capacity;
501	}
502
503	static bool disk_zone_wplug_is_full(struct gendisk *disk,
504	struct blk_zone_wplug *zwplug)
505	{
506	return disk_zone_is_full(disk, zno: zwplug->zone_no, offset_in_zone: zwplug->wp_offset);
507	}
508
509	static bool disk_insert_zone_wplug(struct gendisk *disk,
510	struct blk_zone_wplug *zwplug)
511	{
512	struct blk_zone_wplug *zwplg;
513	unsigned long flags;
514	u8 *zones_cond;
515	unsigned int idx =
516	hash_32(val: zwplug->zone_no, bits: disk->zone_wplugs_hash_bits);
517
518	/*
519	* Add the new zone write plug to the hash table, but carefully as we
520	* are racing with other submission context, so we may already have a
521	* zone write plug for the same zone.
522	*/
523	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
524	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
525	if (zwplg->zone_no == zwplug->zone_no) {
526	spin_unlock_irqrestore(lock: &disk->zone_wplugs_lock, flags);
527	return false;
528	}
529	}
530
531	/*
532	* Set the zone condition: if we do not yet have a zones_cond array
533	* attached to the disk, then this is a zone write plug insert from the
534	* first call to blk_revalidate_disk_zones(), in which case the zone is
535	* necessarilly in the active condition.
536	*/
537	zones_cond = rcu_dereference_check(disk->zones_cond,
538	lockdep_is_held(&disk->zone_wplugs_lock));
539	if (zones_cond)
540	zwplug->cond = zones_cond[zwplug->zone_no];
541	else
542	zwplug->cond = BLK_ZONE_COND_ACTIVE;
543
544	hlist_add_head_rcu(n: &zwplug->node, h: &disk->zone_wplugs_hash[idx]);
545	atomic_inc(v: &disk->nr_zone_wplugs);
546	spin_unlock_irqrestore(lock: &disk->zone_wplugs_lock, flags);
547
548	return true;
549	}
550
551	static struct blk_zone_wplug disk_get_hashed_zone_wplug(struct* gendisk *disk,
552	sector_t sector)
553	{
554	unsigned int zno = disk_zone_no(disk, sector);
555	unsigned int idx = hash_32(val: zno, bits: disk->zone_wplugs_hash_bits);
556	struct blk_zone_wplug *zwplug;
557
558	rcu_read_lock();
559
560	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
561	if (zwplug->zone_no == zno &&
562	refcount_inc_not_zero(r: &zwplug->ref)) {
563	rcu_read_unlock();
564	return zwplug;
565	}
566	}
567
568	rcu_read_unlock();
569
570	return NULL;
571	}
572
573	static inline struct blk_zone_wplug disk_get_zone_wplug(struct* gendisk *disk,
574	sector_t sector)
575	{
576	if (!atomic_read(v: &disk->nr_zone_wplugs))
577	return NULL;
578
579	return disk_get_hashed_zone_wplug(disk, sector);
580	}
581
582	static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
583	{
584	struct blk_zone_wplug *zwplug =
585	container_of(rcu_head, struct blk_zone_wplug, rcu_head);
586
587	mempool_free(element: zwplug, pool: zwplug->disk->zone_wplugs_pool);
588	}
589
590	static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
591	{
592	if (refcount_dec_and_test(r: &zwplug->ref)) {
593	WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
594	WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
595	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
596
597	call_rcu(head: &zwplug->rcu_head, func: disk_free_zone_wplug_rcu);
598	}
599	}
600
601	static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
602	struct blk_zone_wplug *zwplug)
603	{
604	lockdep_assert_held(&zwplug->lock);
605
606	/ If the zone write plug was already removed, we are done. /
607	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
608	return false;
609
610	/ If the zone write plug is still plugged, it cannot be removed. /
611	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
612	return false;
613
614	/*
615	* Completions of BIOs with blk_zone_write_plug_bio_endio() may
616	* happen after handling a request completion with
617	* blk_zone_write_plug_finish_request() (e.g. with split BIOs
618	* that are chained). In such case, disk_zone_wplug_unplug_bio()
619	* should not attempt to remove the zone write plug until all BIO
620	* completions are seen. Check by looking at the zone write plug
621	* reference count, which is 2 when the plug is unused (one reference
622	* taken when the plug was allocated and another reference taken by the
623	* caller context).
624	*/
625	if (refcount_read(r: &zwplug->ref) > `2`)
626	return false;
627
628	/ We can remove zone write plugs for zones that are empty or full. /
629	return !zwplug->wp_offset \|\| disk_zone_wplug_is_full(disk, zwplug);
630	}
631
632	static void disk_remove_zone_wplug(struct gendisk *disk,
633	struct blk_zone_wplug *zwplug)
634	{
635	unsigned long flags;
636
637	/ If the zone write plug was already removed, we have nothing to do. /
638	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
639	return;
640
641	/*
642	* Mark the zone write plug as unhashed and drop the extra reference we
643	* took when the plug was inserted in the hash table. Also update the
644	* disk zone condition array with the current condition of the zone
645	* write plug.
646	*/
647	zwplug->flags \|= BLK_ZONE_WPLUG_UNHASHED;
648	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
649	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
650	lockdep_is_held(&disk->zone_wplugs_lock)),
651	zno: zwplug->zone_no, cond: zwplug->cond);
652	hlist_del_init_rcu(n: &zwplug->node);
653	atomic_dec(v: &disk->nr_zone_wplugs);
654	spin_unlock_irqrestore(lock: &disk->zone_wplugs_lock, flags);
655	disk_put_zone_wplug(zwplug);
656	}
657
658	static void blk_zone_wplug_bio_work(struct work_struct *work);
659
660	/*
661	* Get a reference on the write plug for the zone containing @sector.
662	* If the plug does not exist, it is allocated and hashed.
663	* Return a pointer to the zone write plug with the plug spinlock held.
664	*/
665	static struct blk_zone_wplug disk_get_and_lock_zone_wplug(struct* gendisk *disk,
666	sector_t sector, gfp_t gfp_mask,
667	unsigned long *flags)
668	{
669	unsigned int zno = disk_zone_no(disk, sector);
670	struct blk_zone_wplug *zwplug;
671
672	again:
673	zwplug = disk_get_zone_wplug(disk, sector);
674	if (zwplug) {
675	/*
676	* Check that a BIO completion or a zone reset or finish
677	* operation has not already removed the zone write plug from
678	* the hash table and dropped its reference count. In such case,
679	* we need to get a new plug so start over from the beginning.
680	*/
681	spin_lock_irqsave(&zwplug->lock, *flags);
682	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
683	spin_unlock_irqrestore(lock: &zwplug->lock, flags: *flags);
684	disk_put_zone_wplug(zwplug);
685	goto again;
686	}
687	return zwplug;
688	}
689
690	/*
691	* Allocate and initialize a zone write plug with an extra reference
692	* so that it is not freed when the zone write plug becomes idle without
693	* the zone being full.
694	*/
695	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
696	if (!zwplug)
697	return NULL;
698
699	INIT_HLIST_NODE(h: &zwplug->node);
700	refcount_set(r: &zwplug->ref, n: `2`);
701	spin_lock_init(&zwplug->lock);
702	zwplug->flags = `0`;
703	zwplug->zone_no = zno;
704	zwplug->wp_offset = bdev_offset_from_zone_start(bdev: disk->part0, sector);
705	bio_list_init(bl: &zwplug->bio_list);
706	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
707	zwplug->disk = disk;
708
709	spin_lock_irqsave(&zwplug->lock, *flags);
710
711	/*
712	* Insert the new zone write plug in the hash table. This can fail only
713	* if another context already inserted a plug. Retry from the beginning
714	* in such case.
715	*/
716	if (!disk_insert_zone_wplug(disk, zwplug)) {
717	spin_unlock_irqrestore(lock: &zwplug->lock, flags: *flags);
718	mempool_free(element: zwplug, pool: disk->zone_wplugs_pool);
719	goto again;
720	}
721
722	return zwplug;
723	}
724
725	static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
726	struct bio *bio)
727	{
728	struct request_queue *q = zwplug->disk->queue;
729
730	bio_clear_flag(bio, bit: BIO_ZONE_WRITE_PLUGGING);
731	bio_io_error(bio);
732	disk_put_zone_wplug(zwplug);
733	/ Drop the reference taken by disk_zone_wplug_add_bio(). /
734	blk_queue_exit(q);
735	}
736
737	/*
738	* Abort (fail) all plugged BIOs of a zone write plug.
739	*/
740	static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
741	{
742	struct bio *bio;
743
744	lockdep_assert_held(&zwplug->lock);
745
746	if (bio_list_empty(bl: &zwplug->bio_list))
747	return;
748
749	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
750	zwplug->disk->disk_name, zwplug->zone_no);
751	while ((bio = bio_list_pop(bl: &zwplug->bio_list)))
752	blk_zone_wplug_bio_io_error(zwplug, bio);
753
754	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
755	}
756
757	/*
758	* Update a zone write plug condition based on the write pointer offset.
759	*/
760	static void disk_zone_wplug_update_cond(struct gendisk *disk,
761	struct blk_zone_wplug *zwplug)
762	{
763	lockdep_assert_held(&zwplug->lock);
764
765	if (disk_zone_wplug_is_full(disk, zwplug))
766	zwplug->cond = BLK_ZONE_COND_FULL;
767	else if (!zwplug->wp_offset)
768	zwplug->cond = BLK_ZONE_COND_EMPTY;
769	else
770	zwplug->cond = BLK_ZONE_COND_ACTIVE;
771	}
772
773	/*
774	* Set a zone write plug write pointer offset to the specified value.
775	* This aborts all plugged BIOs, which is fine as this function is called for
776	* a zone reset operation, a zone finish operation or if the zone needs a wp
777	* update from a report zone after a write error.
778	*/
779	static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
780	struct blk_zone_wplug *zwplug,
781	unsigned int wp_offset)
782	{
783	lockdep_assert_held(&zwplug->lock);
784
785	/ Update the zone write pointer and abort all plugged BIOs. /
786	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
787	zwplug->wp_offset = wp_offset;
788	disk_zone_wplug_update_cond(disk, zwplug);
789
790	disk_zone_wplug_abort(zwplug);
791
792	/*
793	* The zone write plug now has no BIO plugged: remove it from the
794	* hash table so that it cannot be seen. The plug will be freed
795	* when the last reference is dropped.
796	*/
797	if (disk_should_remove_zone_wplug(disk, zwplug))
798	disk_remove_zone_wplug(disk, zwplug);
799	}
800
801	static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
802	{
803	switch (zone->cond) {
804	case BLK_ZONE_COND_IMP_OPEN:
805	case BLK_ZONE_COND_EXP_OPEN:
806	case BLK_ZONE_COND_CLOSED:
807	case BLK_ZONE_COND_ACTIVE:
808	return zone->wp - zone->start;
809	case BLK_ZONE_COND_EMPTY:
810	return `0`;
811	case BLK_ZONE_COND_FULL:
812	case BLK_ZONE_COND_NOT_WP:
813	case BLK_ZONE_COND_OFFLINE:
814	case BLK_ZONE_COND_READONLY:
815	default:
816	/*
817	* Conventional, full, offline and read-only zones do not have
818	* a valid write pointer.
819	*/
820	return UINT_MAX;
821	}
822	}
823
824	static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
825	struct blk_zone *zone)
826	{
827	struct blk_zone_wplug *zwplug;
828	unsigned int wp_offset = blk_zone_wp_offset(zone);
829
830	zwplug = disk_get_zone_wplug(disk, sector: zone->start);
831	if (zwplug) {
832	unsigned long flags;
833
834	spin_lock_irqsave(&zwplug->lock, flags);
835	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
836	disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
837	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
838	disk_put_zone_wplug(zwplug);
839	}
840
841	return wp_offset;
842	}
843
844	/**
845	* disk_report_zone - Report one zone
846	* @disk: Target disk
847	* @zone: The zone to report
848	* @idx: The index of the zone in the overall zone report
849	* @args: report zones callback and data
850	*
851	* Description:
852	* Helper function for block device drivers to report one zone of a zone
853	* report initiated with blkdev_report_zones(). The zone being reported is
854	* specified by @zone and used to update, if necessary, the zone write plug
855	* information for the zone. If @args specifies a user callback function,
856	* this callback is executed.
857	*/
858	int disk_report_zone(struct gendisk disk, struct* blk_zone *zone,
859	unsigned int idx, struct blk_report_zones_args *args)
860	{
861	if (args && args->report_active) {
862	/*
863	* If we come here, then this is a report zones as a fallback
864	* for a cached report. So collapse the implicit open, explicit
865	* open and closed conditions into the active zone condition.
866	*/
867	switch (zone->cond) {
868	case BLK_ZONE_COND_IMP_OPEN:
869	case BLK_ZONE_COND_EXP_OPEN:
870	case BLK_ZONE_COND_CLOSED:
871	zone->cond = BLK_ZONE_COND_ACTIVE;
872	break;
873	default:
874	break;
875	}
876	}
877
878	if (disk->zone_wplugs_hash)
879	disk_zone_wplug_sync_wp_offset(disk, zone);
880
881	if (args && args->cb)
882	return args->cb(zone, idx, args->data);
883
884	return `0`;
885	}
886	EXPORT_SYMBOL_GPL(disk_report_zone);
887
888	static int blkdev_report_zone_cb(struct blk_zone zone, unsigned* int idx,
889	void *data)
890	{
891	memcpy(data, zone, sizeof(struct blk_zone));
892	return `0`;
893	}
894
895	static int blkdev_report_zone_fallback(struct block_device *bdev,
896	sector_t sector, struct blk_zone *zone)
897	{
898	struct blk_report_zones_args args = {
899	.cb = blkdev_report_zone_cb,
900	.data = zone,
901	.report_active = true,
902	};
903	int error;
904
905	error = blkdev_do_report_zones(bdev, sector, nr_zones: `1`, args: &args);
906	if (error < `0`)
907	return error;
908	if (error == `0`)
909	return -EIO;
910	return `0`;
911	}
912
913	/*
914	* For devices that natively support zone append operations, we do not use zone
915	* write plugging for zone append writes, which makes the zone condition
916	* tracking invalid once zone append was used. In that case fall back to a
917	* regular report zones to get correct information.
918	*/
919	static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
920	{
921	return disk_need_zone_resources(disk: bdev->bd_disk) &&
922	(bdev_emulates_zone_append(bdev) \|\|
923	!test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
924	}
925
926	/**
927	* blkdev_get_zone_info - Get a single zone information from cached data
928	* @bdev: Target block device
929	* @sector: Sector contained by the target zone
930	* @zone: zone structure to return the zone information
931	*
932	* Description:
933	* Get the zone information for the zone containing @sector using the zone
934	* write plug of the target zone, if one exist, or the disk zone condition
935	* array otherwise. The zone condition may be reported as being
936	* the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
937	* open, explicit open or closed condition.
938	*
939	* Returns 0 on success and a negative error code on failure.
940	*/
941	int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
942	struct blk_zone *zone)
943	{
944	struct gendisk *disk = bdev->bd_disk;
945	sector_t zone_sectors = bdev_zone_sectors(bdev);
946	struct blk_zone_wplug *zwplug;
947	unsigned long flags;
948	u8 *zones_cond;
949
950	if (!bdev_is_zoned(bdev))
951	return -EOPNOTSUPP;
952
953	if (sector >= get_capacity(disk))
954	return -EINVAL;
955
956	memset(zone, `0`, sizeof(*zone));
957	sector = bdev_zone_start(bdev, sector);
958
959	if (!blkdev_has_cached_report_zones(bdev))
960	return blkdev_report_zone_fallback(bdev, sector, zone);
961
962	rcu_read_lock();
963	zones_cond = rcu_dereference(disk->zones_cond);
964	if (!disk->zone_wplugs_hash \|\| !zones_cond) {
965	rcu_read_unlock();
966	return blkdev_report_zone_fallback(bdev, sector, zone);
967	}
968	zone->cond = zones_cond[disk_zone_no(disk, sector)];
969	rcu_read_unlock();
970
971	zone->start = sector;
972	zone->len = zone_sectors;
973
974	/*
975	* If this is a conventional zone, we do not have a zone write plug and
976	* can report the zone immediately.
977	*/
978	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
979	zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
980	zone->capacity = zone_sectors;
981	zone->wp = ULLONG_MAX;
982	return `0`;
983	}
984
985	/*
986	* This is a sequential write required zone. If the zone is read-only or
987	* offline, only set the zone write pointer to an invalid value and
988	* report the zone.
989	*/
990	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
991	if (disk_zone_is_last(disk, zone))
992	zone->capacity = disk->last_zone_capacity;
993	else
994	zone->capacity = disk->zone_capacity;
995
996	if (zone->cond == BLK_ZONE_COND_READONLY \|\|
997	zone->cond == BLK_ZONE_COND_OFFLINE) {
998	zone->wp = ULLONG_MAX;
999	return `0`;
1000	}
1001
1002	/*
1003	* If the zone does not have a zone write plug, it is either full or
1004	* empty, as we otherwise would have a zone write plug for it. In this
1005	* case, set the write pointer accordingly and report the zone.
1006	* Otherwise, if we have a zone write plug, use it.
1007	*/
1008	zwplug = disk_get_zone_wplug(disk, sector);
1009	if (!zwplug) {
1010	if (zone->cond == BLK_ZONE_COND_FULL)
1011	zone->wp = ULLONG_MAX;
1012	else
1013	zone->wp = sector;
1014	return `0`;
1015	}
1016
1017	spin_lock_irqsave(&zwplug->lock, flags);
1018	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
1019	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1020	disk_put_zone_wplug(zwplug);
1021	return blkdev_report_zone_fallback(bdev, sector, zone);
1022	}
1023	zone->cond = zwplug->cond;
1024	zone->wp = sector + zwplug->wp_offset;
1025	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1026
1027	disk_put_zone_wplug(zwplug);
1028
1029	return `0`;
1030	}
1031	EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
1032
1033	/**
1034	* blkdev_report_zones_cached - Get cached zones information
1035	* @bdev: Target block device
1036	* @sector: Sector from which to report zones
1037	* @nr_zones: Maximum number of zones to report
1038	* @cb: Callback function called for each reported zone
1039	* @data: Private data for the callback function
1040	*
1041	* Description:
1042	* Similar to blkdev_report_zones() but instead of calling into the low level
1043	* device driver to get the zone report from the device, use
1044	* blkdev_get_zone_info() to generate the report from the disk zone write
1045	* plugs and zones condition array. Since calling this function without a
1046	* callback does not make sense, @cb must be specified.
1047	*/
1048	int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
1049	unsigned int nr_zones, report_zones_cb cb, void *data)
1050	{
1051	struct gendisk *disk = bdev->bd_disk;
1052	sector_t capacity = get_capacity(disk);
1053	sector_t zone_sectors = bdev_zone_sectors(bdev);
1054	unsigned int idx = `0`;
1055	struct blk_zone zone;
1056	int ret;
1057
1058	if (!cb \|\| !bdev_is_zoned(bdev) \|\|
1059	WARN_ON_ONCE(!disk->fops->report_zones))
1060	return -EOPNOTSUPP;
1061
1062	if (!nr_zones \|\| sector >= capacity)
1063	return `0`;
1064
1065	if (!blkdev_has_cached_report_zones(bdev)) {
1066	struct blk_report_zones_args args = {
1067	.cb = cb,
1068	.data = data,
1069	.report_active = true,
1070	};
1071
1072	return blkdev_do_report_zones(bdev, sector, nr_zones, args: &args);
1073	}
1074
1075	for (sector = bdev_zone_start(bdev, sector);
1076	sector < capacity && idx < nr_zones;
1077	sector += zone_sectors, idx++) {
1078	ret = blkdev_get_zone_info(bdev, sector, &zone);
1079	if (ret)
1080	return ret;
1081
1082	ret = cb(&zone, idx, data);
1083	if (ret)
1084	return ret;
1085	}
1086
1087	return idx;
1088	}
1089	EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);
1090
1091	static void blk_zone_reset_bio_endio(struct bio *bio)
1092	{
1093	struct gendisk *disk = bio->bi_bdev->bd_disk;
1094	sector_t sector = bio->bi_iter.bi_sector;
1095	struct blk_zone_wplug *zwplug;
1096
1097	/*
1098	* If we have a zone write plug, set its write pointer offset to 0.
1099	* This will abort all BIOs plugged for the target zone. It is fine as
1100	* resetting zones while writes are still in-flight will result in the
1101	* writes failing anyway.
1102	*/
1103	zwplug = disk_get_zone_wplug(disk, sector);
1104	if (zwplug) {
1105	unsigned long flags;
1106
1107	spin_lock_irqsave(&zwplug->lock, flags);
1108	disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset: `0`);
1109	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1110	disk_put_zone_wplug(zwplug);
1111	} else {
1112	disk_zone_set_cond(disk, sector, cond: BLK_ZONE_COND_EMPTY);
1113	}
1114	}
1115
1116	static void blk_zone_reset_all_bio_endio(struct bio *bio)
1117	{
1118	struct gendisk *disk = bio->bi_bdev->bd_disk;
1119	sector_t capacity = get_capacity(disk);
1120	struct blk_zone_wplug *zwplug;
1121	unsigned long flags;
1122	sector_t sector;
1123	unsigned int i;
1124
1125	if (atomic_read(v: &disk->nr_zone_wplugs)) {
1126	/ Update the condition of all zone write plugs. /
1127	rcu_read_lock();
1128	for (i = `0`; i < disk_zone_wplugs_hash_size(disk); i++) {
1129	hlist_for_each_entry_rcu(zwplug,
1130	&disk->zone_wplugs_hash[i],
1131	node) {
1132	spin_lock_irqsave(&zwplug->lock, flags);
1133	disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset: `0`);
1134	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1135	}
1136	}
1137	rcu_read_unlock();
1138	}
1139
1140	/ Update the cached zone conditions. /
1141	for (sector = `0`; sector < capacity;
1142	sector += bdev_zone_sectors(bdev: bio->bi_bdev))
1143	disk_zone_set_cond(disk, sector, cond: BLK_ZONE_COND_EMPTY);
1144	clear_bit(GD_ZONE_APPEND_USED, addr: &disk->state);
1145	}
1146
1147	static void blk_zone_finish_bio_endio(struct bio *bio)
1148	{
1149	struct block_device *bdev = bio->bi_bdev;
1150	struct gendisk *disk = bdev->bd_disk;
1151	sector_t sector = bio->bi_iter.bi_sector;
1152	struct blk_zone_wplug *zwplug;
1153
1154	/*
1155	* If we have a zone write plug, set its write pointer offset to the
1156	* zone size. This will abort all BIOs plugged for the target zone. It
1157	* is fine as resetting zones while writes are still in-flight will
1158	* result in the writes failing anyway.
1159	*/
1160	zwplug = disk_get_zone_wplug(disk, sector);
1161	if (zwplug) {
1162	unsigned long flags;
1163
1164	spin_lock_irqsave(&zwplug->lock, flags);
1165	disk_zone_wplug_set_wp_offset(disk, zwplug,
1166	wp_offset: bdev_zone_sectors(bdev));
1167	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1168	disk_put_zone_wplug(zwplug);
1169	} else {
1170	disk_zone_set_cond(disk, sector, cond: BLK_ZONE_COND_FULL);
1171	}
1172	}
1173
1174	void blk_zone_mgmt_bio_endio(struct bio *bio)
1175	{
1176	/ If the BIO failed, we have nothing to do. /
1177	if (bio->bi_status != BLK_STS_OK)
1178	return;
1179
1180	switch (bio_op(bio)) {
1181	case REQ_OP_ZONE_RESET:
1182	blk_zone_reset_bio_endio(bio);
1183	return;
1184	case REQ_OP_ZONE_RESET_ALL:
1185	blk_zone_reset_all_bio_endio(bio);
1186	return;
1187	case REQ_OP_ZONE_FINISH:
1188	blk_zone_finish_bio_endio(bio);
1189	return;
1190	default:
1191	return;
1192	}
1193	}
1194
1195	static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
1196	struct blk_zone_wplug *zwplug)
1197	{
1198	lockdep_assert_held(&zwplug->lock);
1199
1200	/*
1201	* Take a reference on the zone write plug and schedule the submission
1202	* of the next plugged BIO. blk_zone_wplug_bio_work() will release the
1203	* reference we take here.
1204	*/
1205	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
1206	refcount_inc(r: &zwplug->ref);
1207	queue_work(wq: disk->zone_wplugs_wq, work: &zwplug->bio_work);
1208	}
1209
1210	static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
1211	struct blk_zone_wplug *zwplug,
1212	struct bio bio, unsigned* int nr_segs)
1213	{
1214	/*
1215	* Grab an extra reference on the BIO request queue usage counter.
1216	* This reference will be reused to submit a request for the BIO for
1217	* blk-mq devices and dropped when the BIO is failed and after
1218	* it is issued in the case of BIO-based devices.
1219	*/
1220	percpu_ref_get(ref: &bio->bi_bdev->bd_disk->queue->q_usage_counter);
1221
1222	/*
1223	* The BIO is being plugged and thus will have to wait for the on-going
1224	* write and for all other writes already plugged. So polling makes
1225	* no sense.
1226	*/
1227	bio_clear_polled(bio);
1228
1229	/*
1230	* Reuse the poll cookie field to store the number of segments when
1231	* split to the hardware limits.
1232	*/
1233	bio->__bi_nr_segments = nr_segs;
1234
1235	/*
1236	* We always receive BIOs after they are split and ready to be issued.
1237	* The block layer passes the parts of a split BIO in order, and the
1238	* user must also issue write sequentially. So simply add the new BIO
1239	* at the tail of the list to preserve the sequential write order.
1240	*/
1241	bio_list_add(bl: &zwplug->bio_list, bio);
1242	trace_disk_zone_wplug_add_bio(q: zwplug->disk->queue, zno: zwplug->zone_no,
1243	sector: bio->bi_iter.bi_sector, bio_sectors(bio));
1244	}
1245
1246	/*
1247	* Called from bio_attempt_back_merge() when a BIO was merged with a request.
1248	*/
1249	void blk_zone_write_plug_bio_merged(struct bio *bio)
1250	{
1251	struct gendisk *disk = bio->bi_bdev->bd_disk;
1252	struct blk_zone_wplug *zwplug;
1253	unsigned long flags;
1254
1255	/*
1256	* If the BIO was already plugged, then we were called through
1257	* blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
1258	* For this case, we already hold a reference on the zone write plug for
1259	* the BIO and blk_zone_write_plug_init_request() will handle the
1260	* zone write pointer offset update.
1261	*/
1262	if (bio_flagged(bio, bit: BIO_ZONE_WRITE_PLUGGING))
1263	return;
1264
1265	bio_set_flag(bio, bit: BIO_ZONE_WRITE_PLUGGING);
1266
1267	/*
1268	* Get a reference on the zone write plug of the target zone and advance
1269	* the zone write pointer offset. Given that this is a merge, we already
1270	* have at least one request and one BIO referencing the zone write
1271	* plug. So this should not fail.
1272	*/
1273	zwplug = disk_get_zone_wplug(disk, sector: bio->bi_iter.bi_sector);
1274	if (WARN_ON_ONCE(!zwplug))
1275	return;
1276
1277	spin_lock_irqsave(&zwplug->lock, flags);
1278	zwplug->wp_offset += bio_sectors(bio);
1279	disk_zone_wplug_update_cond(disk, zwplug);
1280	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1281	}
1282
1283	/*
1284	* Attempt to merge plugged BIOs with a newly prepared request for a BIO that
1285	* already went through zone write plugging (either a new BIO or one that was
1286	* unplugged).
1287	*/
1288	void blk_zone_write_plug_init_request(struct request *req)
1289	{
1290	sector_t req_back_sector = blk_rq_pos(rq: req) + blk_rq_sectors(rq: req);
1291	struct request_queue *q = req->q;
1292	struct gendisk *disk = q->disk;
1293	struct blk_zone_wplug *zwplug =
1294	disk_get_zone_wplug(disk, sector: blk_rq_pos(rq: req));
1295	unsigned long flags;
1296	struct bio *bio;
1297
1298	if (WARN_ON_ONCE(!zwplug))
1299	return;
1300
1301	/*
1302	* Indicate that completion of this request needs to be handled with
1303	* blk_zone_write_plug_finish_request(), which will drop the reference
1304	* on the zone write plug we took above on entry to this function.
1305	*/
1306	req->rq_flags \|= RQF_ZONE_WRITE_PLUGGING;
1307
1308	if (blk_queue_nomerges(q))
1309	return;
1310
1311	/*
1312	* Walk through the list of plugged BIOs to check if they can be merged
1313	* into the back of the request.
1314	*/
1315	spin_lock_irqsave(&zwplug->lock, flags);
1316	while (!disk_zone_wplug_is_full(disk, zwplug)) {
1317	bio = bio_list_peek(bl: &zwplug->bio_list);
1318	if (!bio)
1319	break;
1320
1321	if (bio->bi_iter.bi_sector != req_back_sector \|\|
1322	!blk_rq_merge_ok(rq: req, bio))
1323	break;
1324
1325	WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
1326	!bio->__bi_nr_segments);
1327
1328	bio_list_pop(bl: &zwplug->bio_list);
1329	if (bio_attempt_back_merge(req, bio, nr_segs: bio->__bi_nr_segments) !=
1330	BIO_MERGE_OK) {
1331	bio_list_add_head(bl: &zwplug->bio_list, bio);
1332	break;
1333	}
1334
1335	/ Drop the reference taken by disk_zone_wplug_add_bio(). /
1336	blk_queue_exit(q);
1337	zwplug->wp_offset += bio_sectors(bio);
1338	disk_zone_wplug_update_cond(disk, zwplug);
1339
1340	req_back_sector += bio_sectors(bio);
1341	}
1342	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1343	}
1344
1345	/*
1346	* Check and prepare a BIO for submission by incrementing the write pointer
1347	* offset of its zone write plug and changing zone append operations into
1348	* regular write when zone append emulation is needed.
1349	*/
1350	static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
1351	struct bio *bio)
1352	{
1353	struct gendisk *disk = bio->bi_bdev->bd_disk;
1354
1355	lockdep_assert_held(&zwplug->lock);
1356
1357	/*
1358	* If we lost track of the zone write pointer due to a write error,
1359	* the user must either execute a report zones, reset the zone or finish
1360	* the to recover a reliable write pointer position. Fail BIOs if the
1361	* user did not do that as we cannot handle emulated zone append
1362	* otherwise.
1363	*/
1364	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
1365	return false;
1366
1367	/*
1368	* Check that the user is not attempting to write to a full zone.
1369	* We know such BIO will fail, and that would potentially overflow our
1370	* write pointer offset beyond the end of the zone.
1371	*/
1372	if (disk_zone_wplug_is_full(disk, zwplug))
1373	return false;
1374
1375	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1376	/*
1377	* Use a regular write starting at the current write pointer.
1378	* Similarly to native zone append operations, do not allow
1379	* merging.
1380	*/
1381	bio->bi_opf &= ~REQ_OP_MASK;
1382	bio->bi_opf \|= REQ_OP_WRITE \| REQ_NOMERGE;
1383	bio->bi_iter.bi_sector += zwplug->wp_offset;
1384
1385	/*
1386	* Remember that this BIO is in fact a zone append operation
1387	* so that we can restore its operation code on completion.
1388	*/
1389	bio_set_flag(bio, bit: BIO_EMULATES_ZONE_APPEND);
1390	} else {
1391	/*
1392	* Check for non-sequential writes early as we know that BIOs
1393	* with a start sector not unaligned to the zone write pointer
1394	* will fail.
1395	*/
1396	if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
1397	return false;
1398	}
1399
1400	/ Advance the zone write pointer offset. /
1401	zwplug->wp_offset += bio_sectors(bio);
1402	disk_zone_wplug_update_cond(disk, zwplug);
1403
1404	return true;
1405	}
1406
1407	static bool blk_zone_wplug_handle_write(struct bio bio, unsigned* int nr_segs)
1408	{
1409	struct gendisk *disk = bio->bi_bdev->bd_disk;
1410	sector_t sector = bio->bi_iter.bi_sector;
1411	struct blk_zone_wplug *zwplug;
1412	gfp_t gfp_mask = GFP_NOIO;
1413	unsigned long flags;
1414
1415	/*
1416	* BIOs must be fully contained within a zone so that we use the correct
1417	* zone write plug for the entire BIO. For blk-mq devices, the block
1418	* layer should already have done any splitting required to ensure this
1419	* and this BIO should thus not be straddling zone boundaries. For
1420	* BIO-based devices, it is the responsibility of the driver to split
1421	* the bio before submitting it.
1422	*/
1423	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
1424	bio_io_error(bio);
1425	return true;
1426	}
1427
1428	/ Conventional zones do not need write plugging. /
1429	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
1430	/ Zone append to conventional zones is not allowed. /
1431	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1432	bio_io_error(bio);
1433	return true;
1434	}
1435	return false;
1436	}
1437
1438	if (bio->bi_opf & REQ_NOWAIT)
1439	gfp_mask = GFP_NOWAIT;
1440
1441	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, flags: &flags);
1442	if (!zwplug) {
1443	if (bio->bi_opf & REQ_NOWAIT)
1444	bio_wouldblock_error(bio);
1445	else
1446	bio_io_error(bio);
1447	return true;
1448	}
1449
1450	/ Indicate that this BIO is being handled using zone write plugging. /
1451	bio_set_flag(bio, bit: BIO_ZONE_WRITE_PLUGGING);
1452
1453	/*
1454	* Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a
1455	* BLK_STS_AGAIN failure if we let the caller submit the BIO.
1456	*/
1457	if (bio->bi_opf & REQ_NOWAIT) {
1458	bio->bi_opf &= ~REQ_NOWAIT;
1459	goto queue_bio;
1460	}
1461
1462	/ If the zone is already plugged, add the BIO to the BIO plug list. /
1463	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
1464	goto queue_bio;
1465
1466	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
1467	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1468	bio_io_error(bio);
1469	return true;
1470	}
1471
1472	/ Otherwise, plug and let the caller submit the BIO. /
1473	zwplug->flags \|= BLK_ZONE_WPLUG_PLUGGED;
1474
1475	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1476
1477	return false;
1478
1479	queue_bio:
1480	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);
1481
1482	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
1483	zwplug->flags \|= BLK_ZONE_WPLUG_PLUGGED;
1484	disk_zone_wplug_schedule_bio_work(disk, zwplug);
1485	}
1486
1487	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1488
1489	return true;
1490	}
1491
1492	static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
1493	{
1494	struct gendisk *disk = bio->bi_bdev->bd_disk;
1495	struct blk_zone_wplug *zwplug;
1496	unsigned long flags;
1497
1498	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
1499	set_bit(GD_ZONE_APPEND_USED, addr: &disk->state);
1500
1501	/*
1502	* We have native support for zone append operations, so we are not
1503	* going to handle @bio through plugging. However, we may already have a
1504	* zone write plug for the target zone if that zone was previously
1505	* partially written using regular writes. In such case, we risk leaving
1506	* the plug in the disk hash table if the zone is fully written using
1507	* zone append operations. Avoid this by removing the zone write plug.
1508	*/
1509	zwplug = disk_get_zone_wplug(disk, sector: bio->bi_iter.bi_sector);
1510	if (likely(!zwplug))
1511	return;
1512
1513	spin_lock_irqsave(&zwplug->lock, flags);
1514
1515	/*
1516	* We are about to remove the zone write plug. But if the user
1517	* (mistakenly) has issued regular writes together with native zone
1518	* append, we must aborts the writes as otherwise the plugged BIOs would
1519	* not be executed by the plug BIO work as disk_get_zone_wplug() will
1520	* return NULL after the plug is removed. Aborting the plugged write
1521	* BIOs is consistent with the fact that these writes will most likely
1522	* fail anyway as there is no ordering guarantees between zone append
1523	* operations and regular write operations.
1524	*/
1525	if (!bio_list_empty(bl: &zwplug->bio_list)) {
1526	pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
1527	disk->disk_name, zwplug->zone_no);
1528	disk_zone_wplug_abort(zwplug);
1529	}
1530	disk_remove_zone_wplug(disk, zwplug);
1531	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1532
1533	disk_put_zone_wplug(zwplug);
1534	}
1535
1536	static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
1537	{
1538	if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
1539	!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
1540	/*
1541	* Zone reset and zone finish operations do not apply to
1542	* conventional zones.
1543	*/
1544	bio_io_error(bio);
1545	return true;
1546	}
1547
1548	/*
1549	* No-wait zone management BIOs do not make much sense as the callers
1550	* issue these as blocking operations in most cases. To avoid issues
1551	* with the BIO execution potentially failing with BLK_STS_AGAIN, warn
1552	* about REQ_NOWAIT being set and ignore that flag.
1553	*/
1554	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
1555	bio->bi_opf &= ~REQ_NOWAIT;
1556
1557	return false;
1558	}
1559
1560	/**
1561	* blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1562	* @bio: The BIO being submitted
1563	* @nr_segs: The number of physical segments of @bio
1564	*
1565	* Handle write, write zeroes and zone append operations requiring emulation
1566	* using zone write plugging.
1567	*
1568	* Return true whenever @bio execution needs to be delayed through the zone
1569	* write plug. Otherwise, return false to let the submission path process
1570	* @bio normally.
1571	*/
1572	bool blk_zone_plug_bio(struct bio bio, unsigned* int nr_segs)
1573	{
1574	struct block_device *bdev = bio->bi_bdev;
1575
1576	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
1577	return false;
1578
1579	/*
1580	* Regular writes and write zeroes need to be handled through the target
1581	* zone write plug. This includes writes with REQ_FUA \| REQ_PREFLUSH
1582	* which may need to go through the flush machinery depending on the
1583	* target device capabilities. Plugging such writes is fine as the flush
1584	* machinery operates at the request level, below the plug, and
1585	* completion of the flush sequence will go through the regular BIO
1586	* completion, which will handle zone write plugging.
1587	* Zone append operations for devices that requested emulation must
1588	* also be plugged so that these BIOs can be changed into regular
1589	* write BIOs.
1590	* Zone reset, reset all and finish commands need special treatment
1591	* to correctly track the write pointer offset of zones. These commands
1592	* are not plugged as we do not need serialization with write
1593	* operations. It is the responsibility of the user to not issue reset
1594	* and finish commands when write operations are in flight.
1595	*/
1596	switch (bio_op(bio)) {
1597	case REQ_OP_ZONE_APPEND:
1598	if (!bdev_emulates_zone_append(bdev)) {
1599	blk_zone_wplug_handle_native_zone_append(bio);
1600	return false;
1601	}
1602	fallthrough;
1603	case REQ_OP_WRITE:
1604	case REQ_OP_WRITE_ZEROES:
1605	return blk_zone_wplug_handle_write(bio, nr_segs);
1606	case REQ_OP_ZONE_RESET:
1607	case REQ_OP_ZONE_FINISH:
1608	case REQ_OP_ZONE_RESET_ALL:
1609	return blk_zone_wplug_handle_zone_mgmt(bio);
1610	default:
1611	return false;
1612	}
1613
1614	return false;
1615	}
1616	EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
1617
1618	static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
1619	struct blk_zone_wplug *zwplug)
1620	{
1621	unsigned long flags;
1622
1623	spin_lock_irqsave(&zwplug->lock, flags);
1624
1625	/ Schedule submission of the next plugged BIO if we have one. /
1626	if (!bio_list_empty(bl: &zwplug->bio_list)) {
1627	disk_zone_wplug_schedule_bio_work(disk, zwplug);
1628	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1629	return;
1630	}
1631
1632	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1633
1634	/*
1635	* If the zone is full (it was fully written or finished, or empty
1636	* (it was reset), remove its zone write plug from the hash table.
1637	*/
1638	if (disk_should_remove_zone_wplug(disk, zwplug))
1639	disk_remove_zone_wplug(disk, zwplug);
1640
1641	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1642	}
1643
1644	void blk_zone_append_update_request_bio(struct request rq, struct* bio *bio)
1645	{
1646	/*
1647	* For zone append requests, the request sector indicates the location
1648	* at which the BIO data was written. Return this value to the BIO
1649	* issuer through the BIO iter sector.
1650	* For plugged zone writes, which include emulated zone append, we need
1651	* the original BIO sector so that blk_zone_write_plug_bio_endio() can
1652	* lookup the zone write plug.
1653	*/
1654	bio->bi_iter.bi_sector = rq->__sector;
1655	trace_blk_zone_append_update_request_bio(rq);
1656	}
1657
1658	void blk_zone_write_plug_bio_endio(struct bio *bio)
1659	{
1660	struct gendisk *disk = bio->bi_bdev->bd_disk;
1661	struct blk_zone_wplug *zwplug =
1662	disk_get_zone_wplug(disk, sector: bio->bi_iter.bi_sector);
1663	unsigned long flags;
1664
1665	if (WARN_ON_ONCE(!zwplug))
1666	return;
1667
1668	/ Make sure we do not see this BIO again by clearing the plug flag. /
1669	bio_clear_flag(bio, bit: BIO_ZONE_WRITE_PLUGGING);
1670
1671	/*
1672	* If this is a regular write emulating a zone append operation,
1673	* restore the original operation code.
1674	*/
1675	if (bio_flagged(bio, bit: BIO_EMULATES_ZONE_APPEND)) {
1676	bio->bi_opf &= ~REQ_OP_MASK;
1677	bio->bi_opf \|= REQ_OP_ZONE_APPEND;
1678	bio_clear_flag(bio, bit: BIO_EMULATES_ZONE_APPEND);
1679	}
1680
1681	/*
1682	* If the BIO failed, abort all plugged BIOs and mark the plug as
1683	* needing a write pointer update.
1684	*/
1685	if (bio->bi_status != BLK_STS_OK) {
1686	spin_lock_irqsave(&zwplug->lock, flags);
1687	disk_zone_wplug_abort(zwplug);
1688	zwplug->flags \|= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
1689	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1690	}
1691
1692	/ Drop the reference we took when the BIO was issued. /
1693	disk_put_zone_wplug(zwplug);
1694
1695	/*
1696	* For BIO-based devices, blk_zone_write_plug_finish_request()
1697	* is not called. So we need to schedule execution of the next
1698	* plugged BIO here.
1699	*/
1700	if (bdev_test_flag(bdev: bio->bi_bdev, BD_HAS_SUBMIT_BIO))
1701	disk_zone_wplug_unplug_bio(disk, zwplug);
1702
1703	/ Drop the reference we took when entering this function. /
1704	disk_put_zone_wplug(zwplug);
1705	}
1706
1707	void blk_zone_write_plug_finish_request(struct request *req)
1708	{
1709	struct gendisk *disk = req->q->disk;
1710	struct blk_zone_wplug *zwplug;
1711
1712	zwplug = disk_get_zone_wplug(disk, sector: req->__sector);
1713	if (WARN_ON_ONCE(!zwplug))
1714	return;
1715
1716	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
1717
1718	/*
1719	* Drop the reference we took when the request was initialized in
1720	* blk_zone_write_plug_init_request().
1721	*/
1722	disk_put_zone_wplug(zwplug);
1723
1724	disk_zone_wplug_unplug_bio(disk, zwplug);
1725
1726	/ Drop the reference we took when entering this function. /
1727	disk_put_zone_wplug(zwplug);
1728	}
1729
1730	static void blk_zone_wplug_bio_work(struct work_struct *work)
1731	{
1732	struct blk_zone_wplug *zwplug =
1733	container_of(work, struct blk_zone_wplug, bio_work);
1734	struct block_device *bdev;
1735	unsigned long flags;
1736	struct bio *bio;
1737	bool prepared;
1738
1739	/*
1740	* Submit the next plugged BIO. If we do not have any, clear
1741	* the plugged flag.
1742	*/
1743	again:
1744	spin_lock_irqsave(&zwplug->lock, flags);
1745	bio = bio_list_pop(bl: &zwplug->bio_list);
1746	if (!bio) {
1747	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1748	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1749	goto put_zwplug;
1750	}
1751
1752	trace_blk_zone_wplug_bio(q: zwplug->disk->queue, zno: zwplug->zone_no,
1753	sector: bio->bi_iter.bi_sector, bio_sectors(bio));
1754
1755	prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
1756	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
1757
1758	if (!prepared) {
1759	blk_zone_wplug_bio_io_error(zwplug, bio);
1760	goto again;
1761	}
1762
1763	bdev = bio->bi_bdev;
1764
1765	/*
1766	* blk-mq devices will reuse the extra reference on the request queue
1767	* usage counter we took when the BIO was plugged, but the submission
1768	* path for BIO-based devices will not do that. So drop this extra
1769	* reference here.
1770	*/
1771	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
1772	bdev->bd_disk->fops->submit_bio(bio);
1773	blk_queue_exit(q: bdev->bd_disk->queue);
1774	} else {
1775	blk_mq_submit_bio(bio);
1776	}
1777
1778	put_zwplug:
1779	/ Drop the reference we took in disk_zone_wplug_schedule_bio_work(). /
1780	disk_put_zone_wplug(zwplug);
1781	}
1782
1783	void disk_init_zone_resources(struct gendisk *disk)
1784	{
1785	spin_lock_init(&disk->zone_wplugs_lock);
1786	}
1787
/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
1796
1797	static int disk_alloc_zone_resources(struct gendisk *disk,
1798	unsigned int pool_size)
1799	{
1800	unsigned int i;
1801
1802	atomic_set(v: &disk->nr_zone_wplugs, i: `0`);
1803	disk->zone_wplugs_hash_bits =
1804	min(ilog2(pool_size) + `1`, BLK_ZONE_WPLUG_MAX_HASH_BITS);
1805
1806	disk->zone_wplugs_hash =
1807	kcalloc(disk_zone_wplugs_hash_size(disk),
1808	sizeof(struct hlist_head), GFP_KERNEL);
1809	if (!disk->zone_wplugs_hash)
1810	return -ENOMEM;
1811
1812	for (i = `0`; i < disk_zone_wplugs_hash_size(disk); i++)
1813	INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
1814
1815	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
1816	sizeof(struct blk_zone_wplug));
1817	if (!disk->zone_wplugs_pool)
1818	goto free_hash;
1819
1820	disk->zone_wplugs_wq =
1821	alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM \| WQ_HIGHPRI,
1822	pool_size, disk->disk_name);
1823	if (!disk->zone_wplugs_wq)
1824	goto destroy_pool;
1825
1826	return `0`;
1827
1828	destroy_pool:
1829	mempool_destroy(pool: disk->zone_wplugs_pool);
1830	disk->zone_wplugs_pool = NULL;
1831	free_hash:
1832	kfree(objp: disk->zone_wplugs_hash);
1833	disk->zone_wplugs_hash = NULL;
1834	disk->zone_wplugs_hash_bits = `0`;
1835	return -ENOMEM;
1836	}
1837
1838	static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
1839	{
1840	struct blk_zone_wplug *zwplug;
1841	unsigned int i;
1842
1843	if (!disk->zone_wplugs_hash)
1844	return;
1845
1846	/ Free all the zone write plugs we have. /
1847	for (i = `0`; i < disk_zone_wplugs_hash_size(disk); i++) {
1848	while (!hlist_empty(h: &disk->zone_wplugs_hash[i])) {
1849	zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
1850	struct blk_zone_wplug, node);
1851	refcount_inc(r: &zwplug->ref);
1852	disk_remove_zone_wplug(disk, zwplug);
1853	disk_put_zone_wplug(zwplug);
1854	}
1855	}
1856
1857	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
1858	kfree(objp: disk->zone_wplugs_hash);
1859	disk->zone_wplugs_hash = NULL;
1860	disk->zone_wplugs_hash_bits = `0`;
1861
1862	/*
1863	* Wait for the zone write plugs to be RCU-freed before destroying the
1864	* mempool.
1865	*/
1866	rcu_barrier();
1867	mempool_destroy(pool: disk->zone_wplugs_pool);
1868	disk->zone_wplugs_pool = NULL;
1869	}
1870
1871	static void disk_set_zones_cond_array(struct gendisk disk, u8 zones_cond)
1872	{
1873	unsigned long flags;
1874
1875	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
1876	zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
1877	lockdep_is_held(&disk->zone_wplugs_lock));
1878	spin_unlock_irqrestore(lock: &disk->zone_wplugs_lock, flags);
1879
1880	kfree_rcu_mightsleep(zones_cond);
1881	}
1882
1883	void disk_free_zone_resources(struct gendisk *disk)
1884	{
1885	if (disk->zone_wplugs_wq) {
1886	destroy_workqueue(wq: disk->zone_wplugs_wq);
1887	disk->zone_wplugs_wq = NULL;
1888	}
1889
1890	disk_destroy_zone_wplugs_hash_table(disk);
1891
1892	disk_set_zones_cond_array(disk, NULL);
1893	disk->zone_capacity = `0`;
1894	disk->last_zone_capacity = `0`;
1895	disk->nr_zones = `0`;
1896	}
1897
1898	struct blk_revalidate_zone_args {
1899	struct gendisk *disk;
1900	u8 *zones_cond;
1901	unsigned int nr_zones;
1902	unsigned int nr_conv_zones;
1903	unsigned int zone_capacity;
1904	unsigned int last_zone_capacity;
1905	sector_t sector;
1906	};
1907
1908	static int disk_revalidate_zone_resources(struct gendisk *disk,
1909	struct blk_revalidate_zone_args *args)
1910	{
1911	struct queue_limits *lim = &disk->queue->limits;
1912	unsigned int pool_size;
1913
1914	args->disk = disk;
1915	args->nr_zones =
1916	DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
1917
1918	/ Cached zone conditions: 1 byte per zone /
1919	args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
1920	if (!args->zones_cond)
1921	return -ENOMEM;
1922
1923	if (!disk_need_zone_resources(disk))
1924	return `0`;
1925
1926	/*
1927	* If the device has no limit on the maximum number of open and active
1928	* zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
1929	*/
1930	pool_size = max(lim->max_open_zones, lim->max_active_zones);
1931	if (!pool_size)
1932	pool_size =
1933	min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
1934
1935	if (!disk->zone_wplugs_hash)
1936	return disk_alloc_zone_resources(disk, pool_size);
1937
1938	return `0`;
1939	}
1940
1941	/*
1942	* Update the disk zone resources information and device queue limits.
1943	* The disk queue is frozen when this is executed.
1944	*/
1945	static int disk_update_zone_resources(struct gendisk *disk,
1946	struct blk_revalidate_zone_args *args)
1947	{
1948	struct request_queue *q = disk->queue;
1949	unsigned int nr_seq_zones;
1950	unsigned int pool_size, memflags;
1951	struct queue_limits lim;
1952	int ret = `0`;
1953
1954	lim = queue_limits_start_update(q);
1955
1956	memflags = blk_mq_freeze_queue(q);
1957
1958	disk->nr_zones = args->nr_zones;
1959	if (args->nr_conv_zones >= disk->nr_zones) {
1960	queue_limits_cancel_update(q);
1961	pr_warn("%s: Invalid number of conventional zones %u / %u\n",
1962	disk->disk_name, args->nr_conv_zones, disk->nr_zones);
1963	ret = -ENODEV;
1964	goto unfreeze;
1965	}
1966
1967	disk->zone_capacity = args->zone_capacity;
1968	disk->last_zone_capacity = args->last_zone_capacity;
1969	disk_set_zones_cond_array(disk, zones_cond: args->zones_cond);
1970
1971	/*
1972	* Some devices can advertise zone resource limits that are larger than
1973	* the number of sequential zones of the zoned block device, e.g. a
1974	* small ZNS namespace. For such case, assume that the zoned device has
1975	* no zone resource limits.
1976	*/
1977	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
1978	if (lim.max_open_zones >= nr_seq_zones)
1979	lim.max_open_zones = `0`;
1980	if (lim.max_active_zones >= nr_seq_zones)
1981	lim.max_active_zones = `0`;
1982
1983	if (!disk->zone_wplugs_pool)
1984	goto commit;
1985
1986	/*
1987	* If the device has no limit on the maximum number of open and active
1988	* zones, set its max open zone limit to the mempool size to indicate
1989	* to the user that there is a potential performance impact due to
1990	* dynamic zone write plug allocation when simultaneously writing to
1991	* more zones than the size of the mempool.
1992	*/
1993	pool_size = max(lim.max_open_zones, lim.max_active_zones);
1994	if (!pool_size)
1995	pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
1996
1997	mempool_resize(pool: disk->zone_wplugs_pool, new_min_nr: pool_size);
1998
1999	if (!lim.max_open_zones && !lim.max_active_zones) {
2000	if (pool_size < nr_seq_zones)
2001	lim.max_open_zones = pool_size;
2002	else
2003	lim.max_open_zones = `0`;
2004	}
2005
2006	commit:
2007	ret = queue_limits_commit_update(q, lim: &lim);
2008
2009	unfreeze:
2010	if (ret)
2011	disk_free_zone_resources(disk);
2012
2013	blk_mq_unfreeze_queue(q, memflags);
2014
2015	return ret;
2016	}
2017
2018	static int blk_revalidate_zone_cond(struct blk_zone zone, unsigned* int idx,
2019	struct blk_revalidate_zone_args *args)
2020	{
2021	enum blk_zone_cond cond = zone->cond;
2022
2023	/ Check that the zone condition is consistent with the zone type. /
2024	switch (cond) {
2025	case BLK_ZONE_COND_NOT_WP:
2026	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
2027	goto invalid_condition;
2028	break;
2029	case BLK_ZONE_COND_IMP_OPEN:
2030	case BLK_ZONE_COND_EXP_OPEN:
2031	case BLK_ZONE_COND_CLOSED:
2032	case BLK_ZONE_COND_EMPTY:
2033	case BLK_ZONE_COND_FULL:
2034	case BLK_ZONE_COND_OFFLINE:
2035	case BLK_ZONE_COND_READONLY:
2036	if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
2037	goto invalid_condition;
2038	break;
2039	default:
2040	pr_warn("%s: Invalid zone condition 0x%X\n",
2041	args->disk->disk_name, cond);
2042	return -ENODEV;
2043	}
2044
2045	blk_zone_set_cond(zones_cond: args->zones_cond, zno: idx, cond);
2046
2047	return `0`;
2048
2049	invalid_condition:
2050	pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
2051	args->disk->disk_name, cond, zone->type);
2052
2053	return -ENODEV;
2054	}
2055
2056	static int blk_revalidate_conv_zone(struct blk_zone zone, unsigned* int idx,
2057	struct blk_revalidate_zone_args *args)
2058	{
2059	struct gendisk *disk = args->disk;
2060
2061	if (zone->capacity != zone->len) {
2062	pr_warn("%s: Invalid conventional zone capacity\n",
2063	disk->disk_name);
2064	return -ENODEV;
2065	}
2066
2067	if (disk_zone_is_last(disk, zone))
2068	args->last_zone_capacity = zone->capacity;
2069
2070	args->nr_conv_zones++;
2071
2072	return `0`;
2073	}
2074
2075	static int blk_revalidate_seq_zone(struct blk_zone zone, unsigned* int idx,
2076	struct blk_revalidate_zone_args *args)
2077	{
2078	struct gendisk *disk = args->disk;
2079	struct blk_zone_wplug *zwplug;
2080	unsigned int wp_offset;
2081	unsigned long flags;
2082
2083	/*
2084	* Remember the capacity of the first sequential zone and check
2085	* if it is constant for all zones, ignoring the last zone as it can be
2086	* smaller.
2087	*/
2088	if (!args->zone_capacity)
2089	args->zone_capacity = zone->capacity;
2090	if (disk_zone_is_last(disk, zone)) {
2091	args->last_zone_capacity = zone->capacity;
2092	} else if (zone->capacity != args->zone_capacity) {
2093	pr_warn("%s: Invalid variable zone capacity\n",
2094	disk->disk_name);
2095	return -ENODEV;
2096	}
2097
2098	/*
2099	* If the device needs zone append emulation, we need to track the
2100	* write pointer of all zones that are not empty nor full. So make sure
2101	* we have a zone write plug for such zone if the device has a zone
2102	* write plug hash table.
2103	*/
2104	if (!disk->zone_wplugs_hash)
2105	return `0`;
2106
2107	wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);
2108	if (!wp_offset \|\| wp_offset >= zone->capacity)
2109	return `0`;
2110
2111	zwplug = disk_get_and_lock_zone_wplug(disk, sector: zone->wp, GFP_NOIO, flags: &flags);
2112	if (!zwplug)
2113	return -ENOMEM;
2114	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
2115	disk_put_zone_wplug(zwplug);
2116
2117	return `0`;
2118	}
2119
2120	/*
2121	* Helper function to check the validity of zones of a zoned block device.
2122	*/
2123	static int blk_revalidate_zone_cb(struct blk_zone zone, unsigned* int idx,
2124	void *data)
2125	{
2126	struct blk_revalidate_zone_args *args = data;
2127	struct gendisk *disk = args->disk;
2128	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
2129	int ret;
2130
2131	/ Check for bad zones and holes in the zone report /
2132	if (zone->start != args->sector) {
2133	pr_warn("%s: Zone gap at sectors %llu..%llu\n",
2134	disk->disk_name, args->sector, zone->start);
2135	return -ENODEV;
2136	}
2137
2138	if (zone->start >= get_capacity(disk) \|\| !zone->len) {
2139	pr_warn("%s: Invalid zone start %llu, length %llu\n",
2140	disk->disk_name, zone->start, zone->len);
2141	return -ENODEV;
2142	}
2143
2144	/*
2145	* All zones must have the same size, with the exception on an eventual
2146	* smaller last zone.
2147	*/
2148	if (!disk_zone_is_last(disk, zone)) {
2149	if (zone->len != zone_sectors) {
2150	pr_warn("%s: Invalid zoned device with non constant zone size\n",
2151	disk->disk_name);
2152	return -ENODEV;
2153	}
2154	} else if (zone->len > zone_sectors) {
2155	pr_warn("%s: Invalid zoned device with larger last zone size\n",
2156	disk->disk_name);
2157	return -ENODEV;
2158	}
2159
2160	if (!zone->capacity \|\| zone->capacity > zone->len) {
2161	pr_warn("%s: Invalid zone capacity\n",
2162	disk->disk_name);
2163	return -ENODEV;
2164	}
2165
2166	/ Check zone condition /
2167	ret = blk_revalidate_zone_cond(zone, idx, args);
2168	if (ret)
2169	return ret;
2170
2171	/ Check zone type /
2172	switch (zone->type) {
2173	case BLK_ZONE_TYPE_CONVENTIONAL:
2174	ret = blk_revalidate_conv_zone(zone, idx, args);
2175	break;
2176	case BLK_ZONE_TYPE_SEQWRITE_REQ:
2177	ret = blk_revalidate_seq_zone(zone, idx, args);
2178	break;
2179	case BLK_ZONE_TYPE_SEQWRITE_PREF:
2180	default:
2181	pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
2182	disk->disk_name, (int)zone->type, zone->start);
2183	ret = -ENODEV;
2184	}
2185
2186	if (!ret)
2187	args->sector += zone->len;
2188
2189	return ret;
2190	}
2191
2192	/**
2193	* blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
2194	* @disk: Target disk
2195	*
2196	* Helper function for low-level device drivers to check, (re) allocate and
2197	* initialize resources used for managing zoned disks. This function should
2198	* normally be called by blk-mq based drivers when a zoned gendisk is probed
2199	* and when the zone configuration of the gendisk changes (e.g. after a format).
2200	* Before calling this function, the device driver must already have set the
2201	* device zone size (chunk_sector limit) and the max zone append limit.
2202	* BIO based drivers can also use this function as long as the device queue
2203	* can be safely frozen.
2204	*/
2205	int blk_revalidate_disk_zones(struct gendisk *disk)
2206	{
2207	struct request_queue *q = disk->queue;
2208	sector_t zone_sectors = q->limits.chunk_sectors;
2209	sector_t capacity = get_capacity(disk);
2210	struct blk_revalidate_zone_args args = { };
2211	unsigned int memflags, noio_flag;
2212	struct blk_report_zones_args rep_args = {
2213	.cb = blk_revalidate_zone_cb,
2214	.data = &args,
2215	};
2216	int ret = -ENOMEM;
2217
2218	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
2219	return -EIO;
2220
2221	if (!capacity)
2222	return -ENODEV;
2223
2224	/*
2225	* Checks that the device driver indicated a valid zone size and that
2226	* the max zone append limit is set.
2227	*/
2228	if (!zone_sectors \|\| !is_power_of_2(n: zone_sectors)) {
2229	pr_warn("%s: Invalid non power of two zone size (%llu)\n",
2230	disk->disk_name, zone_sectors);
2231	return -ENODEV;
2232	}
2233
2234	/*
2235	* Ensure that all memory allocations in this context are done as if
2236	* GFP_NOIO was specified.
2237	*/
2238	noio_flag = memalloc_noio_save();
2239	ret = disk_revalidate_zone_resources(disk, args: &args);
2240	if (ret) {
2241	memalloc_noio_restore(flags: noio_flag);
2242	return ret;
2243	}
2244
2245	ret = disk->fops->report_zones(disk, `0`, UINT_MAX, &rep_args);
2246	if (!ret) {
2247	pr_warn("%s: No zones reported\n", disk->disk_name);
2248	ret = -ENODEV;
2249	}
2250	memalloc_noio_restore(flags: noio_flag);
2251
2252	/*
2253	* If zones where reported, make sure that the entire disk capacity
2254	* has been checked.
2255	*/
2256	if (ret > `0` && args.sector != capacity) {
2257	pr_warn("%s: Missing zones from sector %llu\n",
2258	disk->disk_name, args.sector);
2259	ret = -ENODEV;
2260	}
2261
2262	if (ret > `0`)
2263	return disk_update_zone_resources(disk, args: &args);
2264
2265	pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
2266
2267	memflags = blk_mq_freeze_queue(q);
2268	disk_free_zone_resources(disk);
2269	blk_mq_unfreeze_queue(q, memflags);
2270
2271	return ret;
2272	}
2273	EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
2274
2275	/**
2276	* blk_zone_issue_zeroout - zero-fill a block range in a zone
2277	* @bdev: blockdev to write
2278	* @sector: start sector
2279	* @nr_sects: number of sectors to write
2280	* @gfp_mask: memory allocation flags (for bio_alloc)
2281	*
2282	* Description:
2283	* Zero-fill a block range in a zone (@sector must be equal to the zone write
2284	* pointer), handling potential errors due to the (initially unknown) lack of
2285	* hardware offload (See blkdev_issue_zeroout()).
2286	*/
2287	int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
2288	sector_t nr_sects, gfp_t gfp_mask)
2289	{
2290	struct gendisk *disk = bdev->bd_disk;
2291	int ret;
2292
2293	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
2294	return -EIO;
2295
2296	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
2297	BLKDEV_ZERO_NOFALLBACK);
2298	if (ret != -EOPNOTSUPP)
2299	return ret;
2300
2301	/*
2302	* The failed call to blkdev_issue_zeroout() advanced the zone write
2303	* pointer. Undo this using a report zone to update the zone write
2304	* pointer to the correct current value.
2305	*/
2306	ret = disk->fops->report_zones(disk, sector, `1`, NULL);
2307	if (ret != `1`)
2308	return ret < `0` ? ret : -EIO;
2309
2310	/*
2311	* Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
2312	* regular write with zero-pages.
2313	*/
2314	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, flags: `0`);
2315	}
2316	EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
2317
2318	#ifdef CONFIG_BLK_DEBUG_FS
2319	static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
2320	struct seq_file *m)
2321	{
2322	unsigned int zwp_wp_offset, zwp_flags;
2323	unsigned int zwp_zone_no, zwp_ref;
2324	unsigned int zwp_bio_list_size;
2325	enum blk_zone_cond zwp_cond;
2326	unsigned long flags;
2327
2328	spin_lock_irqsave(&zwplug->lock, flags);
2329	zwp_zone_no = zwplug->zone_no;
2330	zwp_flags = zwplug->flags;
2331	zwp_ref = refcount_read(r: &zwplug->ref);
2332	zwp_cond = zwplug->cond;
2333	zwp_wp_offset = zwplug->wp_offset;
2334	zwp_bio_list_size = bio_list_size(bl: &zwplug->bio_list);
2335	spin_unlock_irqrestore(lock: &zwplug->lock, flags);
2336
2337	seq_printf(m,
2338	fmt: "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
2339	zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
2340	zwp_wp_offset, zwp_bio_list_size);
2341	}
2342
2343	int queue_zone_wplugs_show(void data, struct* seq_file *m)
2344	{
2345	struct request_queue *q = data;
2346	struct gendisk *disk = q->disk;
2347	struct blk_zone_wplug *zwplug;
2348	unsigned int i;
2349
2350	if (!disk->zone_wplugs_hash)
2351	return `0`;
2352
2353	rcu_read_lock();
2354	for (i = `0`; i < disk_zone_wplugs_hash_size(disk); i++)
2355	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
2356	node)
2357	queue_zone_wplug_show(zwplug, m);
2358	rcu_read_unlock();
2359
2360	return `0`;
2361	}
2362
2363	#endif
2364

source code of linux/block/blk-zoned.c