vmscan.c source code [linux/mm/vmscan.c]

// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */
12
13	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15	#include <linux/mm.h>
16	#include <linux/sched/mm.h>
17	#include <linux/module.h>
18	#include <linux/gfp.h>
19	#include <linux/kernel_stat.h>
20	#include <linux/swap.h>
21	#include <linux/pagemap.h>
22	#include <linux/init.h>
23	#include <linux/highmem.h>
24	#include <linux/vmpressure.h>
25	#include <linux/vmstat.h>
26	#include <linux/file.h>
27	#include <linux/writeback.h>
28	#include <linux/blkdev.h>
29	#include <linux/buffer_head.h> /* for buffer_heads_over_limit */
30	#include <linux/mm_inline.h>
31	#include <linux/backing-dev.h>
32	#include <linux/rmap.h>
33	#include <linux/topology.h>
34	#include <linux/cpu.h>
35	#include <linux/cpuset.h>
36	#include <linux/compaction.h>
37	#include <linux/notifier.h>
38	#include <linux/delay.h>
39	#include <linux/kthread.h>
40	#include <linux/freezer.h>
41	#include <linux/memcontrol.h>
42	#include <linux/migrate.h>
43	#include <linux/delayacct.h>
44	#include <linux/sysctl.h>
45	#include <linux/memory-tiers.h>
46	#include <linux/oom.h>
47	#include <linux/pagevec.h>
48	#include <linux/prefetch.h>
49	#include <linux/printk.h>
50	#include <linux/dax.h>
51	#include <linux/psi.h>
52	#include <linux/pagewalk.h>
53	#include <linux/shmem_fs.h>
54	#include <linux/ctype.h>
55	#include <linux/debugfs.h>
56	#include <linux/khugepaged.h>
57	#include <linux/rculist_nulls.h>
58	#include <linux/random.h>
59	#include <linux/mmu_notifier.h>
60	#include <linux/parser.h>
61
62	#include <asm/tlbflush.h>
63	#include <asm/div64.h>
64
65	#include <linux/swapops.h>
66	#include <linux/balloon_compaction.h>
67	#include <linux/sched/sysctl.h>
68
69	#include "internal.h"
70	#include "swap.h"
71
72	#define CREATE_TRACE_POINTS
73	#include <trace/events/vmscan.h>
74
75	struct scan_control {
76	/ How many pages shrink_list() should reclaim /
77	unsigned long nr_to_reclaim;
78
79	/*
80	* Nodemask of nodes allowed by the caller. If NULL, all nodes
81	* are scanned.
82	*/
83	nodemask_t *nodemask;
84
85	/*
86	* The memory cgroup that hit its limit and as a result is the
87	* primary target of this reclaim invocation.
88	*/
89	struct mem_cgroup *target_mem_cgroup;
90
91	/*
92	* Scan pressure balancing between anon and file LRUs
93	*/
94	unsigned long anon_cost;
95	unsigned long file_cost;
96
97	/ Swappiness value for proactive reclaim. Always use sc_swappiness()! /
98	int *proactive_swappiness;
99
100	/ Can active folios be deactivated as part of reclaim? /
101	#define DEACTIVATE_ANON 1
102	#define DEACTIVATE_FILE 2
103	unsigned int may_deactivate:`2`;
104	unsigned int force_deactivate:`1`;
105	unsigned int skipped_deactivate:`1`;
106
107	/ Writepage batching in laptop mode; RECLAIM_WRITE /
108	unsigned int may_writepage:`1`;
109
110	/ Can mapped folios be reclaimed? /
111	unsigned int may_unmap:`1`;
112
113	/ Can folios be swapped as part of reclaim? /
114	unsigned int may_swap:`1`;
115
116	/ Not allow cache_trim_mode to be turned on as part of reclaim? /
117	unsigned int no_cache_trim_mode:`1`;
118
119	/ Has cache_trim_mode failed at least once? /
120	unsigned int cache_trim_mode_failed:`1`;
121
122	/ Proactive reclaim invoked by userspace /
123	unsigned int proactive:`1`;
124
125	/*
126	* Cgroup memory below memory.low is protected as long as we
127	* don't threaten to OOM. If any cgroup is reclaimed at
128	* reduced force or passed over entirely due to its memory.low
129	* setting (memcg_low_skipped), and nothing is reclaimed as a
130	* result, then go back for one more cycle that reclaims the protected
131	* memory (memcg_low_reclaim) to avert OOM.
132	*/
133	unsigned int memcg_low_reclaim:`1`;
134	unsigned int memcg_low_skipped:`1`;
135
136	/ Shared cgroup tree walk failed, rescan the whole tree /
137	unsigned int memcg_full_walk:`1`;
138
139	unsigned int hibernation_mode:`1`;
140
141	/ One of the zones is ready for compaction /
142	unsigned int compaction_ready:`1`;
143
144	/ There is easily reclaimable cold cache in the current node /
145	unsigned int cache_trim_mode:`1`;
146
147	/ The file folios on the current node are dangerously low /
148	unsigned int file_is_tiny:`1`;
149
150	/ Always discard instead of demoting to lower tier memory /
151	unsigned int no_demotion:`1`;
152
153	/ Allocation order /
154	s8 order;
155
156	/ Scan (total_size >> priority) pages at once /
157	s8 priority;
158
159	/ The highest zone to isolate folios for reclaim from /
160	s8 reclaim_idx;
161
162	/ This context's GFP mask /
163	gfp_t gfp_mask;
164
165	/ Incremented by the number of inactive pages that were scanned /
166	unsigned long nr_scanned;
167
168	/ Number of pages freed so far during a call to shrink_zones() /
169	unsigned long nr_reclaimed;
170
171	struct {
172	unsigned int dirty;
173	unsigned int unqueued_dirty;
174	unsigned int congested;
175	unsigned int writeback;
176	unsigned int immediate;
177	unsigned int file_taken;
178	unsigned int taken;
179	} nr;
180
181	/ for recording the reclaimed slab by now /
182	struct reclaim_state reclaim_state;
183	};
184
/*
 * prefetchw_prev_lru_folio - prefetch, for writing, @_field of the folio that
 * lru_to_folio() yields for @_folio's lru list head, unless @_folio's lru.prev
 * is @_base itself. Compiles to nothing when the architecture does not define
 * ARCH_HAS_PREFETCHW.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_folio(_folio, _base, _field)			\
	do {								\
		if ((_folio)->lru.prev != (_base)) {			\
			struct folio *prev;				\
									\
			prev = lru_to_folio(&((_folio)->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif
198
/*
 * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
 */
int vm_swappiness = 60;
203
204	#ifdef CONFIG_MEMCG
205
206	/ Returns true for reclaim through cgroup limits or cgroup interfaces. /
207	static bool cgroup_reclaim(struct scan_control *sc)
208	{
209	return sc->target_mem_cgroup;
210	}
211
212	/*
213	* Returns true for reclaim on the root cgroup. This is true for direct
214	* allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
215	*/
216	static bool root_reclaim(struct scan_control *sc)
217	{
218	return !sc->target_mem_cgroup \|\| mem_cgroup_is_root(memcg: sc->target_mem_cgroup);
219	}
220
221	/**
222	* writeback_throttling_sane - is the usual dirty throttling mechanism available?
223	* @sc: scan_control in question
224	*
225	* The normal page dirty throttling mechanism in balance_dirty_pages() is
226	* completely broken with the legacy memcg and direct stalling in
227	* shrink_folio_list() is used for throttling instead, which lacks all the
228	* niceties such as fairness, adaptive pausing, bandwidth proportional
229	* allocation and configurability.
230	*
231	* This function tests whether the vmscan currently in progress can assume
232	* that the normal dirty throttling mechanism is operational.
233	*/
234	static bool writeback_throttling_sane(struct scan_control *sc)
235	{
236	if (!cgroup_reclaim(sc))
237	return true;
238	#ifdef CONFIG_CGROUP_WRITEBACK
239	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
240	return true;
241	#endif
242	return false;
243	}
244
245	static int sc_swappiness(struct scan_control sc, struct* mem_cgroup *memcg)
246	{
247	if (sc->proactive && sc->proactive_swappiness)
248	return *sc->proactive_swappiness;
249	return mem_cgroup_swappiness(memcg);
250	}
251	#else
252	static bool cgroup_reclaim(struct scan_control *sc)
253	{
254	return false;
255	}
256
257	static bool root_reclaim(struct scan_control *sc)
258	{
259	return true;
260	}
261
262	static bool writeback_throttling_sane(struct scan_control *sc)
263	{
264	return true;
265	}
266
267	static int sc_swappiness(struct scan_control sc, struct* mem_cgroup *memcg)
268	{
269	return READ_ONCE(vm_swappiness);
270	}
271	#endif
272
/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
 * and including the specified highidx
 * @zone: The current zone in the iterator
 * @pgdat: The pgdat which node_zones are being iterated
 * @idx: The index variable
 * @highidx: The index of the highest zone to return
 *
 * This macro iterates through all managed zones up to and including the specified highidx.
 * The zone iterator enters an invalid state after macro call and must be reinitialized
 * before it can be used again.
 */
#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
	    (idx) <= (highidx);					\
	    (idx)++, (zone)++)					\
		if (!managed_zone(zone))			\
			continue;				\
		else
291
292	static void set_task_reclaim_state(struct task_struct *task,
293	struct reclaim_state *rs)
294	{
295	/ Check for an overwrite /
296	WARN_ON_ONCE(rs && task->reclaim_state);
297
298	/ Check for the nulling of an already-nulled member /
299	WARN_ON_ONCE(!rs && !task->reclaim_state);
300
301	task->reclaim_state = rs;
302	}
303
304	/*
305	* flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
306	* scan_control->nr_reclaimed.
307	*/
308	static void flush_reclaim_state(struct scan_control *sc)
309	{
310	/*
311	* Currently, reclaim_state->reclaimed includes three types of pages
312	* freed outside of vmscan:
313	* (1) Slab pages.
314	* (2) Clean file pages from pruned inodes (on highmem systems).
315	* (3) XFS freed buffer pages.
316	*
317	* For all of these cases, we cannot universally link the pages to a
318	* single memcg. For example, a memcg-aware shrinker can free one object
319	* charged to the target memcg, causing an entire page to be freed.
320	* If we count the entire page as reclaimed from the memcg, we end up
321	* overestimating the reclaimed amount (potentially under-reclaiming).
322	*
323	* Only count such pages for global reclaim to prevent under-reclaiming
324	* from the target memcg; preventing unnecessary retries during memcg
325	* charging and false positives from proactive reclaim.
326	*
327	* For uncommon cases where the freed pages were actually mostly
328	* charged to the target memcg, we end up underestimating the reclaimed
329	* amount. This should be fine. The freed pages will be uncharged
330	* anyway, even if they are not counted here properly, and we will be
331	* able to make forward progress in charging (which is usually in a
332	* retry loop).
333	*
334	* We can go one step further, and report the uncharged objcg pages in
335	* memcg reclaim, to make reporting more accurate and reduce
336	* underestimation, but it's probably not worth the complexity for now.
337	*/
338	if (current->reclaim_state && root_reclaim(sc)) {
339	sc->nr_reclaimed += current->reclaim_state->reclaimed;
340	current->reclaim_state->reclaimed = `0`;
341	}
342	}
343
344	static bool can_demote(int nid, struct scan_control *sc,
345	struct mem_cgroup *memcg)
346	{
347	int demotion_nid;
348
349	if (!numa_demotion_enabled)
350	return false;
351	if (sc && sc->no_demotion)
352	return false;
353
354	demotion_nid = next_demotion_node(node: nid);
355	if (demotion_nid == NUMA_NO_NODE)
356	return false;
357
358	/ If demotion node isn't in the cgroup's mems_allowed, fall back /
359	return mem_cgroup_node_allowed(memcg, nid: demotion_nid);
360	}
361
362	static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
363	int nid,
364	struct scan_control *sc)
365	{
366	if (memcg == NULL) {
367	/*
368	* For non-memcg reclaim, is there
369	* space in any swap device?
370	*/
371	if (get_nr_swap_pages() > `0`)
372	return true;
373	} else {
374	/ Is the memcg below its swap limit? /
375	if (mem_cgroup_get_nr_swap_pages(memcg) > `0`)
376	return true;
377	}
378
379	/*
380	* The page can not be swapped.
381	*
382	* Can it be reclaimed from this node via demotion?
383	*/
384	return can_demote(nid, sc, memcg);
385	}
386
387	/*
388	* This misses isolated folios which are not accounted for to save counters.
389	* As the data only determines if reclaim or compaction continues, it is
390	* not expected that isolated folios will be a dominating factor.
391	*/
392	unsigned long zone_reclaimable_pages(struct zone *zone)
393	{
394	unsigned long nr;
395
396	nr = zone_page_state_snapshot(zone, item: NR_ZONE_INACTIVE_FILE) +
397	zone_page_state_snapshot(zone, item: NR_ZONE_ACTIVE_FILE);
398	if (can_reclaim_anon_pages(NULL, nid: zone_to_nid(zone), NULL))
399	nr += zone_page_state_snapshot(zone, item: NR_ZONE_INACTIVE_ANON) +
400	zone_page_state_snapshot(zone, item: NR_ZONE_ACTIVE_ANON);
401
402	return nr;
403	}
404
405	/**
406	* lruvec_lru_size - Returns the number of pages on the given LRU list.
407	* @lruvec: lru vector
408	* @lru: lru to use
409	* @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
410	*/
411	static unsigned long lruvec_lru_size(struct lruvec lruvec, enum* lru_list lru,
412	int zone_idx)
413	{
414	unsigned long size = `0`;
415	int zid;
416	struct zone *zone;
417
418	for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
419	if (!mem_cgroup_disabled())
420	size += mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx: zid);
421	else
422	size += zone_page_state(zone, item: NR_ZONE_LRU_BASE + lru);
423	}
424	return size;
425	}
426
427	static unsigned long drop_slab_node(int nid)
428	{
429	unsigned long freed = `0`;
430	struct mem_cgroup *memcg = NULL;
431
432	memcg = mem_cgroup_iter(NULL, NULL, NULL);
433	do {
434	freed += shrink_slab(GFP_KERNEL, nid, memcg, priority: `0`);
435	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
436
437	return freed;
438	}
439
440	void drop_slab(void)
441	{
442	int nid;
443	int shift = `0`;
444	unsigned long freed;
445
446	do {
447	freed = `0`;
448	for_each_online_node(nid) {
449	if (fatal_signal_pending(current))
450	return;
451
452	freed += drop_slab_node(nid);
453	}
454	} while ((freed >> shift++) > `1`);
455	}
456
/*
 * Build-time check that the PGSTEAL_, PGDEMOTE_ and PGSCAN_ event groups
 * place <type> at the same distance from their respective _KSWAPD entry.
 */
#define CHECK_RECLAIMER_OFFSET(type)					\
	do {								\
		BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD !=		\
			     PGDEMOTE_##type - PGDEMOTE_KSWAPD);	\
		BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD !=		\
			     PGSCAN_##type - PGSCAN_KSWAPD);		\
	} while (0)
464
465	static int reclaimer_offset(struct scan_control *sc)
466	{
467	CHECK_RECLAIMER_OFFSET(DIRECT);
468	CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
469	CHECK_RECLAIMER_OFFSET(PROACTIVE);
470
471	if (current_is_kswapd())
472	return `0`;
473	if (current_is_khugepaged())
474	return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
475	if (sc->proactive)
476	return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
477	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
478	}
479
/*
 * We detected a synchronous write error writing a folio out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the folio and once
 * that folio is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping folio_lock() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct folio *folio, int error)
{
	folio_lock(folio);
	/* Only record the error if the folio still belongs to @mapping. */
	if (folio_mapping(folio) == mapping)
		mapping_set_error(mapping, error);
	folio_unlock(folio);
}
500
501	static bool skip_throttle_noprogress(pg_data_t *pgdat)
502	{
503	int reclaimable = `0`, write_pending = `0`;
504	int i;
505	struct zone *zone;
506	/*
507	* If kswapd is disabled, reschedule if necessary but do not
508	* throttle as the system is likely near OOM.
509	*/
510	if (atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
511	return true;
512
513	/*
514	* If there are a lot of dirty/writeback folios then do not
515	* throttle as throttling will occur when the folios cycle
516	* towards the end of the LRU if still under writeback.
517	*/
518	for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - `1`) {
519	reclaimable += zone_reclaimable_pages(zone);
520	write_pending += zone_page_state_snapshot(zone,
521	item: NR_ZONE_WRITE_PENDING);
522	}
523	if (`2` * write_pending <= reclaimable)
524	return true;
525
526	return false;
527	}
528
529	void reclaim_throttle(pg_data_t pgdat, enum* vmscan_throttle_state reason)
530	{
531	wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
532	long timeout, ret;
533	DEFINE_WAIT(wait);
534
535	/*
536	* Do not throttle user workers, kthreads other than kswapd or
537	* workqueues. They may be required for reclaim to make
538	* forward progress (e.g. journalling workqueues or kthreads).
539	*/
540	if (!current_is_kswapd() &&
541	current->flags & (PF_USER_WORKER\|PF_KTHREAD)) {
542	cond_resched();
543	return;
544	}
545
546	/*
547	* These figures are pulled out of thin air.
548	* VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
549	* parallel reclaimers which is a short-lived event so the timeout is
550	* short. Failing to make progress or waiting on writeback are
551	* potentially long-lived events so use a longer timeout. This is shaky
552	* logic as a failure to make progress could be due to anything from
553	* writeback to a slow device to excessive referenced folios at the tail
554	* of the inactive LRU.
555	*/
556	switch(reason) {
557	case VMSCAN_THROTTLE_WRITEBACK:
558	timeout = HZ/`10`;
559
560	if (atomic_inc_return(v: &pgdat->nr_writeback_throttled) == `1`) {
561	WRITE_ONCE(pgdat->nr_reclaim_start,
562	node_page_state(pgdat, NR_THROTTLED_WRITTEN));
563	}
564
565	break;
566	case VMSCAN_THROTTLE_CONGESTED:
567	fallthrough;
568	case VMSCAN_THROTTLE_NOPROGRESS:
569	if (skip_throttle_noprogress(pgdat)) {
570	cond_resched();
571	return;
572	}
573
574	timeout = `1`;
575
576	break;
577	case VMSCAN_THROTTLE_ISOLATED:
578	timeout = HZ/`50`;
579	break;
580	default:
581	WARN_ON_ONCE(`1`);
582	timeout = HZ;
583	break;
584	}
585
586	prepare_to_wait(wq_head: wqh, wq_entry: &wait, TASK_UNINTERRUPTIBLE);
587	ret = schedule_timeout(timeout);
588	finish_wait(wq_head: wqh, wq_entry: &wait);
589
590	if (reason == VMSCAN_THROTTLE_WRITEBACK)
591	atomic_dec(v: &pgdat->nr_writeback_throttled);
592
593	trace_mm_vmscan_throttled(nid: pgdat->node_id, usec_timeout: jiffies_to_usecs(j: timeout),
594	usec_delayed: jiffies_to_usecs(j: timeout - ret),
595	reason);
596	}
597
598	/*
599	* Account for folios written if tasks are throttled waiting on dirty
600	* folios to clean. If enough folios have been cleaned since throttling
601	* started then wakeup the throttled tasks.
602	*/
603	void __acct_reclaim_writeback(pg_data_t pgdat, struct* folio *folio,
604	int nr_throttled)
605	{
606	unsigned long nr_written;
607
608	node_stat_add_folio(folio, item: NR_THROTTLED_WRITTEN);
609
610	/*
611	* This is an inaccurate read as the per-cpu deltas may not
612	* be synchronised. However, given that the system is
613	* writeback throttled, it is not worth taking the penalty
614	* of getting an accurate count. At worst, the throttle
615	* timeout guarantees forward progress.
616	*/
617	nr_written = node_page_state(pgdat, item: NR_THROTTLED_WRITTEN) -
618	READ_ONCE(pgdat->nr_reclaim_start);
619
620	if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
621	wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
622	}
623
/* possible outcome of pageout() */
typedef enum {
	/* failed to write folio out, folio is locked */
	PAGE_KEEP,
	/* move folio to the active list, folio is locked */
	PAGE_ACTIVATE,
	/* folio has been sent to the disk successfully, folio is unlocked */
	PAGE_SUCCESS,
	/* folio is clean and locked */
	PAGE_CLEAN,
} pageout_t;
635
636	static pageout_t writeout(struct folio folio, struct* address_space *mapping,
637	struct swap_iocb plug, struct** list_head *folio_list)
638	{
639	int res;
640
641	folio_set_reclaim(folio);
642
643	/*
644	* The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
645	* or we failed to allocate contiguous swap entries, in which case
646	* the split out folios get added back to folio_list.
647	*/
648	if (shmem_mapping(mapping))
649	res = shmem_writeout(folio, plug, folio_list);
650	else
651	res = swap_writeout(folio, swap_plug: plug);
652
653	if (res < `0`)
654	handle_write_error(mapping, folio, error: res);
655	if (res == AOP_WRITEPAGE_ACTIVATE) {
656	folio_clear_reclaim(folio);
657	return PAGE_ACTIVATE;
658	}
659
660	/ synchronous write? /
661	if (!folio_test_writeback(folio))
662	folio_clear_reclaim(folio);
663
664	trace_mm_vmscan_write_folio(folio);
665	node_stat_add_folio(folio, item: NR_VMSCAN_WRITE);
666	return PAGE_SUCCESS;
667	}
668
669	/*
670	* pageout is called by shrink_folio_list() for each dirty folio.
671	*/
672	static pageout_t pageout(struct folio folio, struct* address_space *mapping,
673	struct swap_iocb plug, struct** list_head *folio_list)
674	{
675	/*
676	* We no longer attempt to writeback filesystem folios here, other
677	* than tmpfs/shmem. That's taken care of in page-writeback.
678	* If we find a dirty filesystem folio at the end of the LRU list,
679	* typically that means the filesystem is saturating the storage
680	* with contiguous writes and telling it to write a folio here
681	* would only make the situation worse by injecting an element
682	* of random access.
683	*
684	* If the folio is swapcache, write it back even if that would
685	* block, for some throttling. This happens by accident, because
686	* swap_backing_dev_info is bust: it doesn't reflect the
687	* congestion state of the swapdevs. Easy to fix, if needed.
688	*
689	* A freeable shmem or swapcache folio is referenced only by the
690	* caller that isolated the folio and the page cache.
691	*/
692	if (folio_ref_count(folio) != `1` + folio_nr_pages(folio) \|\| !mapping)
693	return PAGE_KEEP;
694	if (!shmem_mapping(mapping) && !folio_test_anon(folio))
695	return PAGE_ACTIVATE;
696	if (!folio_clear_dirty_for_io(folio))
697	return PAGE_CLEAN;
698	return writeout(folio, mapping, plug, folio_list);
699	}
700
701	/*
702	* Same as remove_mapping, but if the folio is removed from the mapping, it
703	* gets returned with a refcount of 0.
704	*/
705	static int __remove_mapping(struct address_space mapping, struct* folio *folio,
706	bool reclaimed, struct mem_cgroup *target_memcg)
707	{
708	int refcount;
709	void *shadow = NULL;
710	struct swap_cluster_info *ci;
711
712	BUG_ON(!folio_test_locked(folio));
713	BUG_ON(mapping != folio_mapping(folio));
714
715	if (folio_test_swapcache(folio)) {
716	ci = swap_cluster_get_and_lock_irq(folio);
717	} else {
718	spin_lock(lock: &mapping->host->i_lock);
719	xa_lock_irq(&mapping->i_pages);
720	}
721
722	/*
723	* The non racy check for a busy folio.
724	*
725	* Must be careful with the order of the tests. When someone has
726	* a ref to the folio, it may be possible that they dirty it then
727	* drop the reference. So if the dirty flag is tested before the
728	* refcount here, then the following race may occur:
729	*
730	* get_user_pages(&page);
731	* [user mapping goes away]
732	* write_to(page);
733	* !folio_test_dirty(folio) [good]
734	* folio_set_dirty(folio);
735	* folio_put(folio);
736	* !refcount(folio) [good, discard it]
737	*
738	* [oops, our write_to data is lost]
739	*
740	* Reversing the order of the tests ensures such a situation cannot
741	* escape unnoticed. The smp_rmb is needed to ensure the folio->flags
742	* load is not satisfied before that of folio->_refcount.
743	*
744	* Note that if the dirty flag is always set via folio_mark_dirty,
745	* and thus under the i_pages lock, then this ordering is not required.
746	*/
747	refcount = `1` + folio_nr_pages(folio);
748	if (!folio_ref_freeze(folio, count: refcount))
749	goto cannot_free;
750	/ note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb /
751	if (unlikely(folio_test_dirty(folio))) {
752	folio_ref_unfreeze(folio, count: refcount);
753	goto cannot_free;
754	}
755
756	if (folio_test_swapcache(folio)) {
757	swp_entry_t swap = folio->swap;
758
759	if (reclaimed && !mapping_exiting(mapping))
760	shadow = workingset_eviction(folio, target_memcg);
761	__swap_cache_del_folio(ci, folio, entry: swap, shadow);
762	memcg1_swapout(folio, entry: swap);
763	swap_cluster_unlock_irq(ci);
764	put_swap_folio(folio, entry: swap);
765	} else {
766	void (free_folio)(struct* folio *);
767
768	free_folio = mapping->a_ops->free_folio;
769	/*
770	* Remember a shadow entry for reclaimed file cache in
771	* order to detect refaults, thus thrashing, later on.
772	*
773	* But don't store shadows in an address space that is
774	* already exiting. This is not just an optimization,
775	* inode reclaim needs to empty out the radix tree or
776	* the nodes are lost. Don't plant shadows behind its
777	* back.
778	*
779	* We also don't store shadows for DAX mappings because the
780	* only page cache folios found in these are zero pages
781	* covering holes, and because we don't want to mix DAX
782	* exceptional entries and shadow exceptional entries in the
783	* same address_space.
784	*/
785	if (reclaimed && folio_is_file_lru(folio) &&
786	!mapping_exiting(mapping) && !dax_mapping(mapping))
787	shadow = workingset_eviction(folio, target_memcg);
788	__filemap_remove_folio(folio, shadow);
789	xa_unlock_irq(&mapping->i_pages);
790	if (mapping_shrinkable(mapping))
791	inode_lru_list_add(inode: mapping->host);
792	spin_unlock(lock: &mapping->host->i_lock);
793
794	if (free_folio)
795	free_folio(folio);
796	}
797
798	return `1`;
799
800	cannot_free:
801	if (folio_test_swapcache(folio)) {
802	swap_cluster_unlock_irq(ci);
803	} else {
804	xa_unlock_irq(&mapping->i_pages);
805	spin_unlock(lock: &mapping->host->i_lock);
806	}
807	return `0`;
808	}
809
810	/**
811	* remove_mapping() - Attempt to remove a folio from its mapping.
812	* @mapping: The address space.
813	* @folio: The folio to remove.
814	*
815	* If the folio is dirty, under writeback or if someone else has a ref
816	* on it, removal will fail.
817	* Return: The number of pages removed from the mapping. 0 if the folio
818	* could not be removed.
819	* Context: The caller should have a single refcount on the folio and
820	* hold its lock.
821	*/
822	long remove_mapping(struct address_space mapping, struct* folio *folio)
823	{
824	if (__remove_mapping(mapping, folio, reclaimed: false, NULL)) {
825	/*
826	* Unfreezing the refcount with 1 effectively
827	* drops the pagecache ref for us without requiring another
828	* atomic operation.
829	*/
830	folio_ref_unfreeze(folio, count: `1`);
831	return folio_nr_pages(folio);
832	}
833	return `0`;
834	}
835
/**
 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
 * @folio: Folio to be returned to an LRU list.
 *
 * Add previously isolated @folio to appropriate LRU list.
 * The folio may still be unevictable for other reasons.
 *
 * Context: lru_lock must not be held, interrupts must be enabled.
 */
void folio_putback_lru(struct folio *folio)
{
	folio_add_lru(folio);
	folio_put(folio);		/* drop ref from isolate */
}
850
/* Verdicts returned by folio_check_references(). */
enum folio_references {
	FOLIOREF_RECLAIM,
	FOLIOREF_RECLAIM_CLEAN,
	FOLIOREF_KEEP,
	FOLIOREF_ACTIVATE,
};
857
#ifdef CONFIG_LRU_GEN
/*
 * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
 * needs to be done by taking the folio off the LRU list and then adding it back
 * with PG_active set. In contrast, the aging (page table walk) path uses
 * folio_update_gen().
 *
 * Returns false when the folio had neither PG_referenced nor PG_workingset
 * set (PG_referenced is set in that case), true otherwise.
 */
static bool lru_gen_set_refs(struct folio *folio)
{
	/* see the comment on LRU_REFS_FLAGS */
	if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
		set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
		return false;
	}

	set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
	return true;
}
#else
static bool lru_gen_set_refs(struct folio *folio)
{
	return false;
}
#endif /* CONFIG_LRU_GEN */
882
883	static enum folio_references folio_check_references(struct folio *folio,
884	struct scan_control *sc)
885	{
886	int referenced_ptes, referenced_folio;
887	vm_flags_t vm_flags;
888
889	referenced_ptes = folio_referenced(folio, is_locked: `1`, memcg: sc->target_mem_cgroup,
890	vm_flags: &vm_flags);
891
892	/*
893	* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
894	* Let the folio, now marked Mlocked, be moved to the unevictable list.
895	*/
896	if (vm_flags & VM_LOCKED)
897	return FOLIOREF_ACTIVATE;
898
899	/*
900	* There are two cases to consider.
901	* 1) Rmap lock contention: rotate.
902	* 2) Skip the non-shared swapbacked folio mapped solely by
903	* the exiting or OOM-reaped process.
904	*/
905	if (referenced_ptes == -`1`)
906	return FOLIOREF_KEEP;
907
908	if (lru_gen_enabled()) {
909	if (!referenced_ptes)
910	return FOLIOREF_RECLAIM;
911
912	return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
913	}
914
915	referenced_folio = folio_test_clear_referenced(folio);
916
917	if (referenced_ptes) {
918	/*
919	* All mapped folios start out with page table
920	* references from the instantiating fault, so we need
921	* to look twice if a mapped file/anon folio is used more
922	* than once.
923	*
924	* Mark it and spare it for another trip around the
925	* inactive list. Another page table reference will
926	* lead to its activation.
927	*
928	* Note: the mark is set for activated folios as well
929	* so that recently deactivated but used folios are
930	* quickly recovered.
931	*/
932	folio_set_referenced(folio);
933
934	if (referenced_folio \|\| referenced_ptes > `1`)
935	return FOLIOREF_ACTIVATE;
936
937	/*
938	* Activate file-backed executable folios after first usage.
939	*/
940	if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
941	return FOLIOREF_ACTIVATE;
942
943	return FOLIOREF_KEEP;
944	}
945
946	/ Reclaim if clean, defer dirty folios to writeback /
947	if (referenced_folio && folio_is_file_lru(folio))
948	return FOLIOREF_RECLAIM_CLEAN;
949
950	return FOLIOREF_RECLAIM;
951	}
952
953	/ Check if a folio is dirty or under writeback /
954	static void folio_check_dirty_writeback(struct folio *folio,
955	bool dirty, bool writeback)
956	{
957	struct address_space *mapping;
958
959	/*
960	* Anonymous folios are not handled by flushers and must be written
961	* from reclaim context. Do not stall reclaim based on them.
962	* MADV_FREE anonymous folios are put into inactive file list too.
963	* They could be mistakenly treated as file lru. So further anon
964	* test is needed.
965	*/
966	if (!folio_is_file_lru(folio) \|\|
967	(folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
968	*dirty = false;
969	*writeback = false;
970	return;
971	}
972
973	/ By default assume that the folio flags are accurate /
974	*dirty = folio_test_dirty(folio);
975	*writeback = folio_test_writeback(folio);
976
977	/ Verify dirty/writeback state if the filesystem supports it /
978	if (!folio_test_private(folio))
979	return;
980
981	mapping = folio_mapping(folio);
982	if (mapping && mapping->a_ops->is_dirty_writeback)
983	mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
984	}
985
986	static struct folio alloc_demote_folio(struct* folio *src,
987	unsigned long private)
988	{
989	struct folio *dst;
990	nodemask_t *allowed_mask;
991	struct migration_target_control *mtc;
992
993	mtc = (struct migration_target_control *)private;
994
995	allowed_mask = mtc->nmask;
996	/*
997	* make sure we allocate from the target node first also trying to
998	* demote or reclaim pages from the target node via kswapd if we are
999	* low on free memory on target node. If we don't do this and if
1000	* we have free memory on the slower(lower) memtier, we would start
1001	* allocating pages from slower(lower) memory tiers without even forcing
1002	* a demotion of cold pages from the target memtier. This can result
1003	* in the kernel placing hot pages in slower(lower) memory tiers.
1004	*/
1005	mtc->nmask = NULL;
1006	mtc->gfp_mask \|= __GFP_THISNODE;
1007	dst = alloc_migration_target(src, private: (unsigned long)mtc);
1008	if (dst)
1009	return dst;
1010
1011	mtc->gfp_mask &= ~__GFP_THISNODE;
1012	mtc->nmask = allowed_mask;
1013
1014	return alloc_migration_target(src, private: (unsigned long)mtc);
1015	}
1016
1017	/*
1018	* Take folios on @demote_folios and attempt to demote them to another node.
1019	* Folios which are not demoted are left on @demote_folios.
1020	*/
1021	static unsigned int demote_folio_list(struct list_head *demote_folios,
1022	struct pglist_data *pgdat)
1023	{
1024	int target_nid = next_demotion_node(node: pgdat->node_id);
1025	unsigned int nr_succeeded;
1026	nodemask_t allowed_mask;
1027
1028	struct migration_target_control mtc = {
1029	/*
1030	* Allocate from 'node', or fail quickly and quietly.
1031	* When this happens, 'page' will likely just be discarded
1032	* instead of migrated.
1033	*/
1034	.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) \|
1035	__GFP_NOMEMALLOC \| GFP_NOWAIT,
1036	.nid = target_nid,
1037	.nmask = &allowed_mask,
1038	.reason = MR_DEMOTION,
1039	};
1040
1041	if (list_empty(head: demote_folios))
1042	return `0`;
1043
1044	if (target_nid == NUMA_NO_NODE)
1045	return `0`;
1046
1047	node_get_allowed_targets(pgdat, targets: &allowed_mask);
1048
1049	/ Demotion ignores all cpuset and mempolicy settings /
1050	migrate_pages(l: demote_folios, new: alloc_demote_folio, NULL,
1051	private: (unsigned long)&mtc, mode: MIGRATE_ASYNC, reason: MR_DEMOTION,
1052	ret_succeeded: &nr_succeeded);
1053
1054	return nr_succeeded;
1055	}
1056
1057	static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
1058	{
1059	if (gfp_mask & __GFP_FS)
1060	return true;
1061	if (!folio_test_swapcache(folio) \|\| !(gfp_mask & __GFP_IO))
1062	return false;
1063	/*
1064	* We can "enter_fs" for swap-cache with only __GFP_IO
1065	* providing this isn't SWP_FS_OPS.
1066	* ->flags can be updated non-atomicially (scan_swap_map_slots),
1067	* but that will never affect SWP_FS_OPS, so the data_race
1068	* is safe.
1069	*/
1070	return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
1071	}
1072
1073	/*
1074	* shrink_folio_list() returns the number of reclaimed pages
1075	*/
1076	static unsigned int shrink_folio_list(struct list_head *folio_list,
1077	struct pglist_data pgdat, struct* scan_control *sc,
1078	struct reclaim_stat *stat, bool ignore_references,
1079	struct mem_cgroup *memcg)
1080	{
1081	struct folio_batch free_folios;
1082	LIST_HEAD(ret_folios);
1083	LIST_HEAD(demote_folios);
1084	unsigned int nr_reclaimed = `0`, nr_demoted = `0`;
1085	unsigned int pgactivate = `0`;
1086	bool do_demote_pass;
1087	struct swap_iocb *plug = NULL;
1088
1089	folio_batch_init(fbatch: &free_folios);
1090	memset(stat, `0`, sizeof(*stat));
1091	cond_resched();
1092	do_demote_pass = can_demote(nid: pgdat->node_id, sc, memcg);
1093
1094	retry:
1095	while (!list_empty(head: folio_list)) {
1096	struct address_space *mapping;
1097	struct folio *folio;
1098	enum folio_references references = FOLIOREF_RECLAIM;
1099	bool dirty, writeback;
1100	unsigned int nr_pages;
1101
1102	cond_resched();
1103
1104	folio = lru_to_folio(head: folio_list);
1105	list_del(entry: &folio->lru);
1106
1107	if (!folio_trylock(folio))
1108	goto keep;
1109
1110	if (folio_contain_hwpoisoned_page(folio)) {
1111	/*
1112	* unmap_poisoned_folio() can't handle large
1113	* folio, just skip it. memory_failure() will
1114	* handle it if the UCE is triggered again.
1115	*/
1116	if (folio_test_large(folio))
1117	goto keep_locked;
1118
1119	unmap_poisoned_folio(folio, pfn: folio_pfn(folio), must_kill: false);
1120	folio_unlock(folio);
1121	folio_put(folio);
1122	continue;
1123	}
1124
1125	VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1126
1127	nr_pages = folio_nr_pages(folio);
1128
1129	/ Account the number of base pages /
1130	sc->nr_scanned += nr_pages;
1131
1132	if (unlikely(!folio_evictable(folio)))
1133	goto activate_locked;
1134
1135	if (!sc->may_unmap && folio_mapped(folio))
1136	goto keep_locked;
1137
1138	/*
1139	* The number of dirty pages determines if a node is marked
1140	* reclaim_congested. kswapd will stall and start writing
1141	* folios if the tail of the LRU is all dirty unqueued folios.
1142	*/
1143	folio_check_dirty_writeback(folio, dirty: &dirty, writeback: &writeback);
1144	if (dirty \|\| writeback)
1145	stat->nr_dirty += nr_pages;
1146
1147	if (dirty && !writeback)
1148	stat->nr_unqueued_dirty += nr_pages;
1149
1150	/*
1151	* Treat this folio as congested if folios are cycling
1152	* through the LRU so quickly that the folios marked
1153	* for immediate reclaim are making it to the end of
1154	* the LRU a second time.
1155	*/
1156	if (writeback && folio_test_reclaim(folio))
1157	stat->nr_congested += nr_pages;
1158
1159	/*
1160	* If a folio at the tail of the LRU is under writeback, there
1161	* are three cases to consider.
1162	*
1163	* 1) If reclaim is encountering an excessive number
1164	* of folios under writeback and this folio has both
1165	* the writeback and reclaim flags set, then it
1166	* indicates that folios are being queued for I/O but
1167	* are being recycled through the LRU before the I/O
1168	* can complete. Waiting on the folio itself risks an
1169	* indefinite stall if it is impossible to writeback
1170	* the folio due to I/O error or disconnected storage
1171	* so instead note that the LRU is being scanned too
1172	* quickly and the caller can stall after the folio
1173	* list has been processed.
1174	*
1175	* 2) Global or new memcg reclaim encounters a folio that is
1176	* not marked for immediate reclaim, or the caller does not
1177	* have __GFP_FS (or __GFP_IO if it's simply going to swap,
1178	* not to fs), or the folio belongs to a mapping where
1179	* waiting on writeback during reclaim may lead to a deadlock.
1180	* In this case mark the folio for immediate reclaim and
1181	* continue scanning.
1182	*
1183	* Require may_enter_fs() because we would wait on fs, which
1184	* may not have submitted I/O yet. And the loop driver might
1185	* enter reclaim, and deadlock if it waits on a folio for
1186	* which it is needed to do the write (loop masks off
1187	* __GFP_IO\|__GFP_FS for this reason); but more thought
1188	* would probably show more reasons.
1189	*
1190	* 3) Legacy memcg encounters a folio that already has the
1191	* reclaim flag set. memcg does not have any dirty folio
1192	* throttling so we could easily OOM just because too many
1193	* folios are in writeback and there is nothing else to
1194	* reclaim. Wait for the writeback to complete.
1195	*
1196	* In cases 1) and 2) we activate the folios to get them out of
1197	* the way while we continue scanning for clean folios on the
1198	* inactive list and refilling from the active list. The
1199	* observation here is that waiting for disk writes is more
1200	* expensive than potentially causing reloads down the line.
1201	* Since they're marked for immediate reclaim, they won't put
1202	* memory pressure on the cache working set any longer than it
1203	* takes to write them to disk.
1204	*/
1205	if (folio_test_writeback(folio)) {
1206	mapping = folio_mapping(folio);
1207
1208	/ Case 1 above /
1209	if (current_is_kswapd() &&
1210	folio_test_reclaim(folio) &&
1211	test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1212	stat->nr_immediate += nr_pages;
1213	goto activate_locked;
1214
1215	/ Case 2 above /
1216	} else if (writeback_throttling_sane(sc) \|\|
1217	!folio_test_reclaim(folio) \|\|
1218	!may_enter_fs(folio, gfp_mask: sc->gfp_mask) \|\|
1219	(mapping &&
1220	mapping_writeback_may_deadlock_on_reclaim(mapping))) {
1221	/*
1222	* This is slightly racy -
1223	* folio_end_writeback() might have
1224	* just cleared the reclaim flag, then
1225	* setting the reclaim flag here ends up
1226	* interpreted as the readahead flag - but
1227	* that does not matter enough to care.
1228	* What we do want is for this folio to
1229	* have the reclaim flag set next time
1230	* memcg reclaim reaches the tests above,
1231	* so it will then wait for writeback to
1232	* avoid OOM; and it's also appropriate
1233	* in global reclaim.
1234	*/
1235	folio_set_reclaim(folio);
1236	stat->nr_writeback += nr_pages;
1237	goto activate_locked;
1238
1239	/ Case 3 above /
1240	} else {
1241	folio_unlock(folio);
1242	folio_wait_writeback(folio);
1243	/ then go back and try same folio again /
1244	list_add_tail(new: &folio->lru, head: folio_list);
1245	continue;
1246	}
1247	}
1248
1249	if (!ignore_references)
1250	references = folio_check_references(folio, sc);
1251
1252	switch (references) {
1253	case FOLIOREF_ACTIVATE:
1254	goto activate_locked;
1255	case FOLIOREF_KEEP:
1256	stat->nr_ref_keep += nr_pages;
1257	goto keep_locked;
1258	case FOLIOREF_RECLAIM:
1259	case FOLIOREF_RECLAIM_CLEAN:
1260	; / try to reclaim the folio below /
1261	}
1262
1263	/*
1264	* Before reclaiming the folio, try to relocate
1265	* its contents to another node.
1266	*/
1267	if (do_demote_pass &&
1268	(thp_migration_supported() \|\| !folio_test_large(folio))) {
1269	list_add(new: &folio->lru, head: &demote_folios);
1270	folio_unlock(folio);
1271	continue;
1272	}
1273
1274	/*
1275	* Anonymous process memory has backing store?
1276	* Try to allocate it some swap space here.
1277	* Lazyfree folio could be freed directly
1278	*/
1279	if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
1280	if (!folio_test_swapcache(folio)) {
1281	if (!(sc->gfp_mask & __GFP_IO))
1282	goto keep_locked;
1283	if (folio_maybe_dma_pinned(folio))
1284	goto keep_locked;
1285	if (folio_test_large(folio)) {
1286	/ cannot split folio, skip it /
1287	if (folio_expected_ref_count(folio) !=
1288	folio_ref_count(folio) - `1`)
1289	goto activate_locked;
1290	/*
1291	* Split partially mapped folios right away.
1292	* We can free the unmapped pages without IO.
1293	*/
1294	if (data_race(!list_empty(&folio->_deferred_list) &&
1295	folio_test_partially_mapped(folio)) &&
1296	split_folio_to_list(folio, list: folio_list))
1297	goto activate_locked;
1298	}
1299	if (folio_alloc_swap(folio)) {
1300	int __maybe_unused order = folio_order(folio);
1301
1302	if (!folio_test_large(folio))
1303	goto activate_locked_split;
1304	/ Fallback to swap normal pages /
1305	if (split_folio_to_list(folio, list: folio_list))
1306	goto activate_locked;
1307	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1308	if (nr_pages >= HPAGE_PMD_NR) {
1309	count_memcg_folio_events(folio,
1310	idx: THP_SWPOUT_FALLBACK, nr: `1`);
1311	count_vm_event(item: THP_SWPOUT_FALLBACK);
1312	}
1313	#endif
1314	count_mthp_stat(order, item: MTHP_STAT_SWPOUT_FALLBACK);
1315	if (folio_alloc_swap(folio))
1316	goto activate_locked_split;
1317	}
1318	/*
1319	* Normally the folio will be dirtied in unmap because its
1320	* pte should be dirty. A special case is MADV_FREE page. The
1321	* page's pte could have dirty bit cleared but the folio's
1322	* SwapBacked flag is still set because clearing the dirty bit
1323	* and SwapBacked flag has no lock protected. For such folio,
1324	* unmap will not set dirty bit for it, so folio reclaim will
1325	* not write the folio out. This can cause data corruption when
1326	* the folio is swapped in later. Always setting the dirty flag
1327	* for the folio solves the problem.
1328	*/
1329	folio_mark_dirty(folio);
1330	}
1331	}
1332
1333	/*
1334	* If the folio was split above, the tail pages will make
1335	* their own pass through this function and be accounted
1336	* then.
1337	*/
1338	if ((nr_pages > `1`) && !folio_test_large(folio)) {
1339	sc->nr_scanned -= (nr_pages - `1`);
1340	nr_pages = `1`;
1341	}
1342
1343	/*
1344	* The folio is mapped into the page tables of one or more
1345	* processes. Try to unmap it here.
1346	*/
1347	if (folio_mapped(folio)) {
1348	enum ttu_flags flags = TTU_BATCH_FLUSH;
1349	bool was_swapbacked = folio_test_swapbacked(folio);
1350
1351	if (folio_test_pmd_mappable(folio))
1352	flags \|= TTU_SPLIT_HUGE_PMD;
1353	/*
1354	* Without TTU_SYNC, try_to_unmap will only begin to
1355	* hold PTL from the first present PTE within a large
1356	* folio. Some initial PTEs might be skipped due to
1357	* races with parallel PTE writes in which PTEs can be
1358	* cleared temporarily before being written new present
1359	* values. This will lead to a large folio is still
1360	* mapped while some subpages have been partially
1361	* unmapped after try_to_unmap; TTU_SYNC helps
1362	* try_to_unmap acquire PTL from the first PTE,
1363	* eliminating the influence of temporary PTE values.
1364	*/
1365	if (folio_test_large(folio))
1366	flags \|= TTU_SYNC;
1367
1368	try_to_unmap(folio, flags);
1369	if (folio_mapped(folio)) {
1370	stat->nr_unmap_fail += nr_pages;
1371	if (!was_swapbacked &&
1372	folio_test_swapbacked(folio))
1373	stat->nr_lazyfree_fail += nr_pages;
1374	goto activate_locked;
1375	}
1376	}
1377
1378	/*
1379	* Folio is unmapped now so it cannot be newly pinned anymore.
1380	* No point in trying to reclaim folio if it is pinned.
1381	* Furthermore we don't want to reclaim underlying fs metadata
1382	* if the folio is pinned and thus potentially modified by the
1383	* pinning process as that may upset the filesystem.
1384	*/
1385	if (folio_maybe_dma_pinned(folio))
1386	goto activate_locked;
1387
1388	mapping = folio_mapping(folio);
1389	if (folio_test_dirty(folio)) {
1390	if (folio_is_file_lru(folio)) {
1391	/*
1392	* Immediately reclaim when written back.
1393	* Similar in principle to folio_deactivate()
1394	* except we already have the folio isolated
1395	* and know it's dirty
1396	*/
1397	node_stat_mod_folio(folio, item: NR_VMSCAN_IMMEDIATE,
1398	nr: nr_pages);
1399	if (!folio_test_reclaim(folio))
1400	folio_set_reclaim(folio);
1401
1402	goto activate_locked;
1403	}
1404
1405	if (references == FOLIOREF_RECLAIM_CLEAN)
1406	goto keep_locked;
1407	if (!may_enter_fs(folio, gfp_mask: sc->gfp_mask))
1408	goto keep_locked;
1409	if (!sc->may_writepage)
1410	goto keep_locked;
1411
1412	/*
1413	* Folio is dirty. Flush the TLB if a writable entry
1414	* potentially exists to avoid CPU writes after I/O
1415	* starts and then write it out here.
1416	*/
1417	try_to_unmap_flush_dirty();
1418	switch (pageout(folio, mapping, plug: &plug, folio_list)) {
1419	case PAGE_KEEP:
1420	goto keep_locked;
1421	case PAGE_ACTIVATE:
1422	/*
1423	* If shmem folio is split when writeback to swap,
1424	* the tail pages will make their own pass through
1425	* this function and be accounted then.
1426	*/
1427	if (nr_pages > `1` && !folio_test_large(folio)) {
1428	sc->nr_scanned -= (nr_pages - `1`);
1429	nr_pages = `1`;
1430	}
1431	goto activate_locked;
1432	case PAGE_SUCCESS:
1433	if (nr_pages > `1` && !folio_test_large(folio)) {
1434	sc->nr_scanned -= (nr_pages - `1`);
1435	nr_pages = `1`;
1436	}
1437	stat->nr_pageout += nr_pages;
1438
1439	if (folio_test_writeback(folio))
1440	goto keep;
1441	if (folio_test_dirty(folio))
1442	goto keep;
1443
1444	/*
1445	* A synchronous write - probably a ramdisk. Go
1446	* ahead and try to reclaim the folio.
1447	*/
1448	if (!folio_trylock(folio))
1449	goto keep;
1450	if (folio_test_dirty(folio) \|\|
1451	folio_test_writeback(folio))
1452	goto keep_locked;
1453	mapping = folio_mapping(folio);
1454	fallthrough;
1455	case PAGE_CLEAN:
1456	; / try to free the folio below /
1457	}
1458	}
1459
1460	/*
1461	* If the folio has buffers, try to free the buffer
1462	* mappings associated with this folio. If we succeed
1463	* we try to free the folio as well.
1464	*
1465	* We do this even if the folio is dirty.
1466	* filemap_release_folio() does not perform I/O, but it
1467	* is possible for a folio to have the dirty flag set,
1468	* but it is actually clean (all its buffers are clean).
1469	* This happens if the buffers were written out directly,
1470	* with submit_bh(). ext3 will do this, as well as
1471	* the blockdev mapping. filemap_release_folio() will
1472	* discover that cleanness and will drop the buffers
1473	* and mark the folio clean - it can be freed.
1474	*
1475	* Rarely, folios can have buffers and no ->mapping.
1476	* These are the folios which were not successfully
1477	* invalidated in truncate_cleanup_folio(). We try to
1478	* drop those buffers here and if that worked, and the
1479	* folio is no longer mapped into process address space
1480	* (refcount == 1) it can be freed. Otherwise, leave
1481	* the folio on the LRU so it is swappable.
1482	*/
1483	if (folio_needs_release(folio)) {
1484	if (!filemap_release_folio(folio, gfp: sc->gfp_mask))
1485	goto activate_locked;
1486	if (!mapping && folio_ref_count(folio) == `1`) {
1487	folio_unlock(folio);
1488	if (folio_put_testzero(folio))
1489	goto free_it;
1490	else {
1491	/*
1492	* rare race with speculative reference.
1493	* the speculative reference will free
1494	* this folio shortly, so we may
1495	* increment nr_reclaimed here (and
1496	* leave it off the LRU).
1497	*/
1498	nr_reclaimed += nr_pages;
1499	continue;
1500	}
1501	}
1502	}
1503
1504	if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1505	/ follow __remove_mapping for reference /
1506	if (!folio_ref_freeze(folio, count: `1`))
1507	goto keep_locked;
1508	/*
1509	* The folio has only one reference left, which is
1510	* from the isolation. After the caller puts the
1511	* folio back on the lru and drops the reference, the
1512	* folio will be freed anyway. It doesn't matter
1513	* which lru it goes on. So we don't bother checking
1514	* the dirty flag here.
1515	*/
1516	count_vm_events(item: PGLAZYFREED, delta: nr_pages);
1517	count_memcg_folio_events(folio, idx: PGLAZYFREED, nr: nr_pages);
1518	} else if (!mapping \|\| !__remove_mapping(mapping, folio, reclaimed: true,
1519	target_memcg: sc->target_mem_cgroup))
1520	goto keep_locked;
1521
1522	folio_unlock(folio);
1523	free_it:
1524	/*
1525	* Folio may get swapped out as a whole, need to account
1526	* all pages in it.
1527	*/
1528	nr_reclaimed += nr_pages;
1529
1530	folio_unqueue_deferred_split(folio);
1531	if (folio_batch_add(fbatch: &free_folios, folio) == `0`) {
1532	mem_cgroup_uncharge_folios(folios: &free_folios);
1533	try_to_unmap_flush();
1534	free_unref_folios(fbatch: &free_folios);
1535	}
1536	continue;
1537
1538	activate_locked_split:
1539	/*
1540	* The tail pages that are failed to add into swap cache
1541	* reach here. Fixup nr_scanned and nr_pages.
1542	*/
1543	if (nr_pages > `1`) {
1544	sc->nr_scanned -= (nr_pages - `1`);
1545	nr_pages = `1`;
1546	}
1547	activate_locked:
1548	/ Not a candidate for swapping, so reclaim swap space. /
1549	if (folio_test_swapcache(folio) &&
1550	(mem_cgroup_swap_full(folio) \|\| folio_test_mlocked(folio)))
1551	folio_free_swap(folio);
1552	VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1553	if (!folio_test_mlocked(folio)) {
1554	int type = folio_is_file_lru(folio);
1555	folio_set_active(folio);
1556	stat->nr_activate[type] += nr_pages;
1557	count_memcg_folio_events(folio, idx: PGACTIVATE, nr: nr_pages);
1558	}
1559	keep_locked:
1560	folio_unlock(folio);
1561	keep:
1562	list_add(new: &folio->lru, head: &ret_folios);
1563	VM_BUG_ON_FOLIO(folio_test_lru(folio) \|\|
1564	folio_test_unevictable(folio), folio);
1565	}
1566	/ 'folio_list' is always empty here /
1567
1568	/ Migrate folios selected for demotion /
1569	nr_demoted = demote_folio_list(demote_folios: &demote_folios, pgdat);
1570	nr_reclaimed += nr_demoted;
1571	stat->nr_demoted += nr_demoted;
1572	/ Folios that could not be demoted are still in @demote_folios /
1573	if (!list_empty(head: &demote_folios)) {
1574	/ Folios which weren't demoted go back on @folio_list /
1575	list_splice_init(list: &demote_folios, head: folio_list);
1576
1577	/*
1578	* goto retry to reclaim the undemoted folios in folio_list if
1579	* desired.
1580	*
1581	* Reclaiming directly from top tier nodes is not often desired
1582	* due to it breaking the LRU ordering: in general memory
1583	* should be reclaimed from lower tier nodes and demoted from
1584	* top tier nodes.
1585	*
1586	* However, disabling reclaim from top tier nodes entirely
1587	* would cause ooms in edge scenarios where lower tier memory
1588	* is unreclaimable for whatever reason, eg memory being
1589	* mlocked or too hot to reclaim. We can disable reclaim
1590	* from top tier nodes in proactive reclaim though as that is
1591	* not real memory pressure.
1592	*/
1593	if (!sc->proactive) {
1594	do_demote_pass = false;
1595	goto retry;
1596	}
1597	}
1598
1599	pgactivate = stat->nr_activate[`0`] + stat->nr_activate[`1`];
1600
1601	mem_cgroup_uncharge_folios(folios: &free_folios);
1602	try_to_unmap_flush();
1603	free_unref_folios(fbatch: &free_folios);
1604
1605	list_splice(list: &ret_folios, head: folio_list);
1606	count_vm_events(item: PGACTIVATE, delta: pgactivate);
1607
1608	if (plug)
1609	swap_write_unplug(sio: plug);
1610	return nr_reclaimed;
1611	}
1612
1613	unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1614	struct list_head *folio_list)
1615	{
1616	struct scan_control sc = {
1617	.gfp_mask = GFP_KERNEL,
1618	.may_unmap = `1`,
1619	};
1620	struct reclaim_stat stat;
1621	unsigned int nr_reclaimed;
1622	struct folio folio, next;
1623	LIST_HEAD(clean_folios);
1624	unsigned int noreclaim_flag;
1625
1626	list_for_each_entry_safe(folio, next, folio_list, lru) {
1627	/ TODO: these pages should not even appear in this list. /
1628	if (page_has_movable_ops(page: &folio->page))
1629	continue;
1630	if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
1631	!folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
1632	folio_clear_active(folio);
1633	list_move(list: &folio->lru, head: &clean_folios);
1634	}
1635	}
1636
1637	/*
1638	* We should be safe here since we are only dealing with file pages and
1639	* we are not kswapd and therefore cannot write dirty file pages. But
1640	* call memalloc_noreclaim_save() anyway, just in case these conditions
1641	* change in the future.
1642	*/
1643	noreclaim_flag = memalloc_noreclaim_save();
1644	nr_reclaimed = shrink_folio_list(folio_list: &clean_folios, pgdat: zone->zone_pgdat, sc: &sc,
1645	stat: &stat, ignore_references: true, NULL);
1646	memalloc_noreclaim_restore(flags: noreclaim_flag);
1647
1648	list_splice(list: &clean_folios, head: folio_list);
1649	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1650	-(long)nr_reclaimed);
1651	/*
1652	* Since lazyfree pages are isolated from file LRU from the beginning,
1653	* they will rotate back to anonymous LRU in the end if it failed to
1654	* discard so isolated count will be mismatched.
1655	* Compensate the isolated count for both LRU lists.
1656	*/
1657	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
1658	stat.nr_lazyfree_fail);
1659	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1660	-(long)stat.nr_lazyfree_fail);
1661	return nr_reclaimed;
1662	}
1663
1664	/*
1665	* Update LRU sizes after isolating pages. The LRU size updates must
1666	* be complete before mem_cgroup_update_lru_size due to a sanity check.
1667	*/
1668	static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1669	enum lru_list lru, unsigned long *nr_zone_taken)
1670	{
1671	int zid;
1672
1673	for (zid = `0`; zid < MAX_NR_ZONES; zid++) {
1674	if (!nr_zone_taken[zid])
1675	continue;
1676
1677	update_lru_size(lruvec, lru, zid, nr_pages: -nr_zone_taken[zid]);
1678	}
1679
1680	}
1681
1682	/*
1683	* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
1684	*
1685	* lruvec->lru_lock is heavily contended. Some of the functions that
1686	* shrink the lists perform better by taking out a batch of pages
1687	* and working on them outside the LRU lock.
1688	*
1689	* For pagecache intensive workloads, this function is the hottest
1690	* spot in the kernel (apart from copy_*_user functions).
1691	*
1692	* Lru_lock must be held before calling this function.
1693	*
1694	* @nr_to_scan: The number of eligible pages to look through on the list.
1695	* @lruvec: The LRU vector to pull pages from.
1696	* @dst: The temp list to put pages on to.
1697	* @nr_scanned: The number of pages that were scanned.
1698	* @sc: The scan_control struct for this reclaim session
1699	* @lru: LRU list id for isolating
1700	*
1701	* returns how many pages were moved onto *@dst.
1702	*/
1703	static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
1704	struct lruvec lruvec, struct* list_head *dst,
1705	unsigned long nr_scanned, struct* scan_control *sc,
1706	enum lru_list lru)
1707	{
1708	struct list_head *src = &lruvec->lists[lru];
1709	unsigned long nr_taken = `0`;
1710	unsigned long nr_zone_taken[MAX_NR_ZONES] = { `0` };
1711	unsigned long nr_skipped[MAX_NR_ZONES] = { `0`, };
1712	unsigned long skipped = `0`, total_scan = `0`, scan = `0`;
1713	unsigned long nr_pages;
1714	unsigned long max_nr_skipped = `0`;
1715	LIST_HEAD(folios_skipped);
1716
1717	while (scan < nr_to_scan && !list_empty(head: src)) {
1718	struct list_head *move_to = src;
1719	struct folio *folio;
1720
1721	folio = lru_to_folio(head: src);
1722	prefetchw_prev_lru_folio(folio, src, flags);
1723
1724	nr_pages = folio_nr_pages(folio);
1725	total_scan += nr_pages;
1726
1727	/ Using max_nr_skipped to prevent hard LOCKUP/
1728	if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
1729	(folio_zonenum(folio) > sc->reclaim_idx)) {
1730	nr_skipped[folio_zonenum(folio)] += nr_pages;
1731	move_to = &folios_skipped;
1732	max_nr_skipped++;
1733	goto move;
1734	}
1735
1736	/*
1737	* Do not count skipped folios because that makes the function
1738	* return with no isolated folios if the LRU mostly contains
1739	* ineligible folios. This causes the VM to not reclaim any
1740	* folios, triggering a premature OOM.
1741	* Account all pages in a folio.
1742	*/
1743	scan += nr_pages;
1744
1745	if (!folio_test_lru(folio))
1746	goto move;
1747	if (!sc->may_unmap && folio_mapped(folio))
1748	goto move;
1749
1750	/*
1751	* Be careful not to clear the lru flag until after we're
1752	* sure the folio is not being freed elsewhere -- the
1753	* folio release code relies on it.
1754	*/
1755	if (unlikely(!folio_try_get(folio)))
1756	goto move;
1757
1758	if (!folio_test_clear_lru(folio)) {
1759	/ Another thread is already isolating this folio /
1760	folio_put(folio);
1761	goto move;
1762	}
1763
1764	nr_taken += nr_pages;
1765	nr_zone_taken[folio_zonenum(folio)] += nr_pages;
1766	move_to = dst;
1767	move:
1768	list_move(list: &folio->lru, head: move_to);
1769	}
1770
1771	/*
1772	* Splice any skipped folios to the start of the LRU list. Note that
1773	* this disrupts the LRU order when reclaiming for lower zones but
1774	* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
1775	* scanning would soon rescan the same folios to skip and waste lots
1776	* of cpu cycles.
1777	*/
1778	if (!list_empty(head: &folios_skipped)) {
1779	int zid;
1780
1781	list_splice(list: &folios_skipped, head: src);
1782	for (zid = `0`; zid < MAX_NR_ZONES; zid++) {
1783	if (!nr_skipped[zid])
1784	continue;
1785
1786	__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1787	skipped += nr_skipped[zid];
1788	}
1789	}
1790	*nr_scanned = total_scan;
1791	trace_mm_vmscan_lru_isolate(highest_zoneidx: sc->reclaim_idx, order: sc->order, nr_requested: nr_to_scan,
1792	nr_scanned: total_scan, nr_skipped: skipped, nr_taken, lru);
1793	update_lru_sizes(lruvec, lru, nr_zone_taken);
1794	return nr_taken;
1795	}
1796
1797	/**
1798	* folio_isolate_lru() - Try to isolate a folio from its LRU list.
1799	* @folio: Folio to isolate from its LRU list.
1800	*
1801	* Isolate a @folio from an LRU list and adjust the vmstat statistic
1802	* corresponding to whatever LRU list the folio was on.
1803	*
1804	* The folio will have its LRU flag cleared. If it was found on the
1805	* active list, it will have the Active flag set. If it was found on the
1806	* unevictable list, it will have the Unevictable flag set. These flags
1807	* may need to be cleared by the caller before letting the page go.
1808	*
1809	* Context:
1810	*
1811	* (1) Must be called with an elevated refcount on the folio. This is a
1812	* fundamental difference from isolate_lru_folios() (which is called
1813	* without a stable reference).
1814	* (2) The lru_lock must not be held.
1815	* (3) Interrupts must be enabled.
1816	*
1817	* Return: true if the folio was removed from an LRU list.
1818	* false if the folio was not on an LRU list.
1819	*/
1820	bool folio_isolate_lru(struct folio *folio)
1821	{
1822	bool ret = false;
1823
1824	VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
1825
1826	if (folio_test_clear_lru(folio)) {
1827	struct lruvec *lruvec;
1828
1829	folio_get(folio);
1830	lruvec = folio_lruvec_lock_irq(folio);
1831	lruvec_del_folio(lruvec, folio);
1832	unlock_page_lruvec_irq(lruvec);
1833	ret = true;
1834	}
1835
1836	return ret;
1837	}
1838
1839	/*
1840	* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1841	* then get rescheduled. When there are massive number of tasks doing page
1842	* allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1843	* the LRU list will go small and be scanned faster than necessary, leading to
1844	* unnecessary swapping, thrashing and OOM.
1845	*/
1846	static bool too_many_isolated(struct pglist_data pgdat, int* file,
1847	struct scan_control *sc)
1848	{
1849	unsigned long inactive, isolated;
1850	bool too_many;
1851
1852	if (current_is_kswapd())
1853	return false;
1854
1855	if (!writeback_throttling_sane(sc))
1856	return false;
1857
1858	if (file) {
1859	inactive = node_page_state(pgdat, item: NR_INACTIVE_FILE);
1860	isolated = node_page_state(pgdat, item: NR_ISOLATED_FILE);
1861	} else {
1862	inactive = node_page_state(pgdat, item: NR_INACTIVE_ANON);
1863	isolated = node_page_state(pgdat, item: NR_ISOLATED_ANON);
1864	}
1865
1866	/*
1867	* GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1868	* won't get blocked by normal direct-reclaimers, forming a circular
1869	* deadlock.
1870	*/
1871	if (gfp_has_io_fs(gfp: sc->gfp_mask))
1872	inactive >>= `3`;
1873
1874	too_many = isolated > inactive;
1875
1876	/ Wake up tasks throttled due to too_many_isolated. /
1877	if (!too_many)
1878	wake_throttle_isolated(pgdat);
1879
1880	return too_many;
1881	}
1882
1883	/*
1884	* move_folios_to_lru() moves folios from private @list to appropriate LRU list.
1885	*
1886	* Returns the number of pages moved to the given lruvec.
1887	*/
1888	static unsigned int move_folios_to_lru(struct lruvec *lruvec,
1889	struct list_head *list)
1890	{
1891	int nr_pages, nr_moved = `0`;
1892	struct folio_batch free_folios;
1893
1894	folio_batch_init(fbatch: &free_folios);
1895	while (!list_empty(head: list)) {
1896	struct folio *folio = lru_to_folio(head: list);
1897
1898	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
1899	list_del(entry: &folio->lru);
1900	if (unlikely(!folio_evictable(folio))) {
1901	spin_unlock_irq(lock: &lruvec->lru_lock);
1902	folio_putback_lru(folio);
1903	spin_lock_irq(lock: &lruvec->lru_lock);
1904	continue;
1905	}
1906
1907	/*
1908	* The folio_set_lru needs to be kept here for list integrity.
1909	* Otherwise:
1910	* #0 move_folios_to_lru #1 release_pages
1911	* if (!folio_put_testzero())
1912	* if (folio_put_testzero())
1913	* !lru //skip lru_lock
1914	* folio_set_lru()
1915	* list_add(&folio->lru,)
1916	* list_add(&folio->lru,)
1917	*/
1918	folio_set_lru(folio);
1919
1920	if (unlikely(folio_put_testzero(folio))) {
1921	__folio_clear_lru_flags(folio);
1922
1923	folio_unqueue_deferred_split(folio);
1924	if (folio_batch_add(fbatch: &free_folios, folio) == `0`) {
1925	spin_unlock_irq(lock: &lruvec->lru_lock);
1926	mem_cgroup_uncharge_folios(folios: &free_folios);
1927	free_unref_folios(fbatch: &free_folios);
1928	spin_lock_irq(lock: &lruvec->lru_lock);
1929	}
1930
1931	continue;
1932	}
1933
1934	/*
1935	* All pages were isolated from the same lruvec (and isolation
1936	* inhibits memcg migration).
1937	*/
1938	VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
1939	lruvec_add_folio(lruvec, folio);
1940	nr_pages = folio_nr_pages(folio);
1941	nr_moved += nr_pages;
1942	if (folio_test_active(folio))
1943	workingset_age_nonresident(lruvec, nr_pages);
1944	}
1945
1946	if (free_folios.nr) {
1947	spin_unlock_irq(lock: &lruvec->lru_lock);
1948	mem_cgroup_uncharge_folios(folios: &free_folios);
1949	free_unref_folios(fbatch: &free_folios);
1950	spin_lock_irq(lock: &lruvec->lru_lock);
1951	}
1952
1953	return nr_moved;
1954	}
1955
1956	/*
1957	* If a kernel thread (such as nfsd for loop-back mounts) services a backing
1958	* device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
1959	* we should not throttle. Otherwise it is safe to do so.
1960	*/
1961	static int current_may_throttle(void)
1962	{
1963	return !(current->flags & PF_LOCAL_THROTTLE);
1964	}
1965
1966	/*
1967	* shrink_inactive_list() is a helper for shrink_node(). It returns the number
1968	* of reclaimed pages
1969	*/
1970	static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
1971	struct lruvec lruvec, struct* scan_control *sc,
1972	enum lru_list lru)
1973	{
1974	LIST_HEAD(folio_list);
1975	unsigned long nr_scanned;
1976	unsigned int nr_reclaimed = `0`;
1977	unsigned long nr_taken;
1978	struct reclaim_stat stat;
1979	bool file = is_file_lru(lru);
1980	enum vm_event_item item;
1981	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1982	bool stalled = false;
1983
1984	while (unlikely(too_many_isolated(pgdat, file, sc))) {
1985	if (stalled)
1986	return `0`;
1987
1988	/ wait a bit for the reclaimer. /
1989	stalled = true;
1990	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_ISOLATED);
1991
1992	/ We are about to die and free our memory. Return now. /
1993	if (fatal_signal_pending(current))
1994	return SWAP_CLUSTER_MAX;
1995	}
1996
1997	lru_add_drain();
1998
1999	spin_lock_irq(lock: &lruvec->lru_lock);
2000
2001	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, dst: &folio_list,
2002	nr_scanned: &nr_scanned, sc, lru);
2003
2004	__mod_node_page_state(pgdat, item: NR_ISOLATED_ANON + file, nr_taken);
2005	item = PGSCAN_KSWAPD + reclaimer_offset(sc);
2006	if (!cgroup_reclaim(sc))
2007	__count_vm_events(item, delta: nr_scanned);
2008	count_memcg_events(memcg: lruvec_memcg(lruvec), idx: item, count: nr_scanned);
2009	__count_vm_events(item: PGSCAN_ANON + file, delta: nr_scanned);
2010
2011	spin_unlock_irq(lock: &lruvec->lru_lock);
2012
2013	if (nr_taken == `0`)
2014	return `0`;
2015
2016	nr_reclaimed = shrink_folio_list(folio_list: &folio_list, pgdat, sc, stat: &stat, ignore_references: false,
2017	memcg: lruvec_memcg(lruvec));
2018
2019	spin_lock_irq(lock: &lruvec->lru_lock);
2020	move_folios_to_lru(lruvec, list: &folio_list);
2021
2022	mod_lruvec_state(lruvec, idx: PGDEMOTE_KSWAPD + reclaimer_offset(sc),
2023	val: stat.nr_demoted);
2024	__mod_node_page_state(pgdat, item: NR_ISOLATED_ANON + file, -nr_taken);
2025	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
2026	if (!cgroup_reclaim(sc))
2027	__count_vm_events(item, delta: nr_reclaimed);
2028	count_memcg_events(memcg: lruvec_memcg(lruvec), idx: item, count: nr_reclaimed);
2029	__count_vm_events(item: PGSTEAL_ANON + file, delta: nr_reclaimed);
2030
2031	lru_note_cost_unlock_irq(lruvec, file, nr_io: stat.nr_pageout,
2032	nr_rotated: nr_scanned - nr_reclaimed);
2033
2034	/*
2035	* If dirty folios are scanned that are not queued for IO, it
2036	* implies that flushers are not doing their job. This can
2037	* happen when memory pressure pushes dirty folios to the end of
2038	* the LRU before the dirty limits are breached and the dirty
2039	* data has expired. It can also happen when the proportion of
2040	* dirty folios grows not through writes but through memory
2041	* pressure reclaiming all the clean cache. And in some cases,
2042	* the flushers simply cannot keep up with the allocation
2043	* rate. Nudge the flusher threads in case they are asleep.
2044	*/
2045	if (stat.nr_unqueued_dirty == nr_taken) {
2046	wakeup_flusher_threads(reason: WB_REASON_VMSCAN);
2047	/*
2048	* For cgroupv1 dirty throttling is achieved by waking up
2049	* the kernel flusher here and later waiting on folios
2050	* which are in writeback to finish (see shrink_folio_list()).
2051	*
2052	* Flusher may not be able to issue writeback quickly
2053	* enough for cgroupv1 writeback throttling to work
2054	* on a large system.
2055	*/
2056	if (!writeback_throttling_sane(sc))
2057	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_WRITEBACK);
2058	}
2059
2060	sc->nr.dirty += stat.nr_dirty;
2061	sc->nr.congested += stat.nr_congested;
2062	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2063	sc->nr.writeback += stat.nr_writeback;
2064	sc->nr.immediate += stat.nr_immediate;
2065	sc->nr.taken += nr_taken;
2066	if (file)
2067	sc->nr.file_taken += nr_taken;
2068
2069	trace_mm_vmscan_lru_shrink_inactive(nid: pgdat->node_id,
2070	nr_scanned, nr_reclaimed, stat: &stat, priority: sc->priority, file);
2071	return nr_reclaimed;
2072	}
2073
2074	/*
2075	* shrink_active_list() moves folios from the active LRU to the inactive LRU.
2076	*
2077	* We move them the other way if the folio is referenced by one or more
2078	* processes.
2079	*
2080	* If the folios are mostly unmapped, the processing is fast and it is
2081	* appropriate to hold lru_lock across the whole operation. But if
2082	* the folios are mapped, the processing is slow (folio_referenced()), so
2083	* we should drop lru_lock around each folio. It's impossible to balance
2084	* this, so instead we remove the folios from the LRU while processing them.
2085	* It is safe to rely on the active flag against the non-LRU folios in here
2086	* because nobody will play with that bit on a non-LRU folio.
2087	*
2088	* The downside is that we have to touch folio->_refcount against each folio.
2089	* But we had to alter folio->flags anyway.
2090	*/
2091	static void shrink_active_list(unsigned long nr_to_scan,
2092	struct lruvec *lruvec,
2093	struct scan_control *sc,
2094	enum lru_list lru)
2095	{
2096	unsigned long nr_taken;
2097	unsigned long nr_scanned;
2098	vm_flags_t vm_flags;
2099	LIST_HEAD(l_hold); / The folios which were snipped off /
2100	LIST_HEAD(l_active);
2101	LIST_HEAD(l_inactive);
2102	unsigned nr_deactivate, nr_activate;
2103	unsigned nr_rotated = `0`;
2104	bool file = is_file_lru(lru);
2105	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2106
2107	lru_add_drain();
2108
2109	spin_lock_irq(lock: &lruvec->lru_lock);
2110
2111	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, dst: &l_hold,
2112	nr_scanned: &nr_scanned, sc, lru);
2113
2114	__mod_node_page_state(pgdat, item: NR_ISOLATED_ANON + file, nr_taken);
2115
2116	if (!cgroup_reclaim(sc))
2117	__count_vm_events(item: PGREFILL, delta: nr_scanned);
2118	count_memcg_events(memcg: lruvec_memcg(lruvec), idx: PGREFILL, count: nr_scanned);
2119
2120	spin_unlock_irq(lock: &lruvec->lru_lock);
2121
2122	while (!list_empty(head: &l_hold)) {
2123	struct folio *folio;
2124
2125	cond_resched();
2126	folio = lru_to_folio(head: &l_hold);
2127	list_del(entry: &folio->lru);
2128
2129	if (unlikely(!folio_evictable(folio))) {
2130	folio_putback_lru(folio);
2131	continue;
2132	}
2133
2134	if (unlikely(buffer_heads_over_limit)) {
2135	if (folio_needs_release(folio) &&
2136	folio_trylock(folio)) {
2137	filemap_release_folio(folio, gfp: `0`);
2138	folio_unlock(folio);
2139	}
2140	}
2141
2142	/ Referenced or rmap lock contention: rotate /
2143	if (folio_referenced(folio, is_locked: `0`, memcg: sc->target_mem_cgroup,
2144	vm_flags: &vm_flags) != `0`) {
2145	/*
2146	* Identify referenced, file-backed active folios and
2147	* give them one more trip around the active list. So
2148	* that executable code get better chances to stay in
2149	* memory under moderate memory pressure. Anon folios
2150	* are not likely to be evicted by use-once streaming
2151	* IO, plus JVM can create lots of anon VM_EXEC folios,
2152	* so we ignore them here.
2153	*/
2154	if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
2155	nr_rotated += folio_nr_pages(folio);
2156	list_add(new: &folio->lru, head: &l_active);
2157	continue;
2158	}
2159	}
2160
2161	folio_clear_active(folio); / we are de-activating /
2162	folio_set_workingset(folio);
2163	list_add(new: &folio->lru, head: &l_inactive);
2164	}
2165
2166	/*
2167	* Move folios back to the lru list.
2168	*/
2169	spin_lock_irq(lock: &lruvec->lru_lock);
2170
2171	nr_activate = move_folios_to_lru(lruvec, list: &l_active);
2172	nr_deactivate = move_folios_to_lru(lruvec, list: &l_inactive);
2173
2174	__count_vm_events(item: PGDEACTIVATE, delta: nr_deactivate);
2175	count_memcg_events(memcg: lruvec_memcg(lruvec), idx: PGDEACTIVATE, count: nr_deactivate);
2176
2177	__mod_node_page_state(pgdat, item: NR_ISOLATED_ANON + file, -nr_taken);
2178
2179	lru_note_cost_unlock_irq(lruvec, file, nr_io: `0`, nr_rotated);
2180	trace_mm_vmscan_lru_shrink_active(nid: pgdat->node_id, nr_taken, nr_active: nr_activate,
2181	nr_deactivated: nr_deactivate, nr_referenced: nr_rotated, priority: sc->priority, file);
2182	}
2183
2184	static unsigned int reclaim_folio_list(struct list_head *folio_list,
2185	struct pglist_data *pgdat)
2186	{
2187	struct reclaim_stat stat;
2188	unsigned int nr_reclaimed;
2189	struct folio *folio;
2190	struct scan_control sc = {
2191	.gfp_mask = GFP_KERNEL,
2192	.may_writepage = `1`,
2193	.may_unmap = `1`,
2194	.may_swap = `1`,
2195	.no_demotion = `1`,
2196	};
2197
2198	nr_reclaimed = shrink_folio_list(folio_list, pgdat, sc: &sc, stat: &stat, ignore_references: true, NULL);
2199	while (!list_empty(head: folio_list)) {
2200	folio = lru_to_folio(head: folio_list);
2201	list_del(entry: &folio->lru);
2202	folio_putback_lru(folio);
2203	}
2204	trace_mm_vmscan_reclaim_pages(nid: pgdat->node_id, nr_scanned: sc.nr_scanned, nr_reclaimed, stat: &stat);
2205
2206	return nr_reclaimed;
2207	}
2208
2209	unsigned long reclaim_pages(struct list_head *folio_list)
2210	{
2211	int nid;
2212	unsigned int nr_reclaimed = `0`;
2213	LIST_HEAD(node_folio_list);
2214	unsigned int noreclaim_flag;
2215
2216	if (list_empty(head: folio_list))
2217	return nr_reclaimed;
2218
2219	noreclaim_flag = memalloc_noreclaim_save();
2220
2221	nid = folio_nid(folio: lru_to_folio(head: folio_list));
2222	do {
2223	struct folio *folio = lru_to_folio(head: folio_list);
2224
2225	if (nid == folio_nid(folio)) {
2226	folio_clear_active(folio);
2227	list_move(list: &folio->lru, head: &node_folio_list);
2228	continue;
2229	}
2230
2231	nr_reclaimed += reclaim_folio_list(folio_list: &node_folio_list, NODE_DATA(nid));
2232	nid = folio_nid(folio: lru_to_folio(head: folio_list));
2233	} while (!list_empty(head: folio_list));
2234
2235	nr_reclaimed += reclaim_folio_list(folio_list: &node_folio_list, NODE_DATA(nid));
2236
2237	memalloc_noreclaim_restore(flags: noreclaim_flag);
2238
2239	return nr_reclaimed;
2240	}
2241
2242	static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2243	struct lruvec lruvec, struct* scan_control *sc)
2244	{
2245	if (is_active_lru(lru)) {
2246	if (sc->may_deactivate & (`1` << is_file_lru(lru)))
2247	shrink_active_list(nr_to_scan, lruvec, sc, lru);
2248	else
2249	sc->skipped_deactivate = `1`;
2250	return `0`;
2251	}
2252
2253	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2254	}
2255
2256	/*
2257	* The inactive anon list should be small enough that the VM never has
2258	* to do too much work.
2259	*
2260	* The inactive file list should be small enough to leave most memory
2261	* to the established workingset on the scan-resistant active list,
2262	* but large enough to avoid thrashing the aggregate readahead window.
2263	*
2264	* Both inactive lists should also be large enough that each inactive
2265	* folio has a chance to be referenced again before it is reclaimed.
2266	*
2267	* If that fails and refaulting is observed, the inactive list grows.
2268	*
2269	* The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
2270	* on this LRU, maintained by the pageout code. An inactive_ratio
2271	* of 3 means 3:1 or 25% of the folios are kept on the inactive list.
2272	*
2273	* total target max
2274	* memory ratio inactive
2275	* -------------------------------------
2276	* 10MB 1 5MB
2277	* 100MB 1 50MB
2278	* 1GB 3 250MB
2279	* 10GB 10 0.9GB
2280	* 100GB 31 3GB
2281	* 1TB 101 10GB
2282	* 10TB 320 32GB
2283	*/
2284	static bool inactive_is_low(struct lruvec lruvec, enum* lru_list inactive_lru)
2285	{
2286	enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2287	unsigned long inactive, active;
2288	unsigned long inactive_ratio;
2289	unsigned long gb;
2290
2291	inactive = lruvec_page_state(lruvec, idx: NR_LRU_BASE + inactive_lru);
2292	active = lruvec_page_state(lruvec, idx: NR_LRU_BASE + active_lru);
2293
2294	gb = (inactive + active) >> (`30` - PAGE_SHIFT);
2295	if (gb)
2296	inactive_ratio = int_sqrt(`10` * gb);
2297	else
2298	inactive_ratio = `1`;
2299
2300	return inactive * inactive_ratio < active;
2301	}
2302
/* How get_scan_count() distributes scan pressure between anon and file. */
enum scan_balance {
	SCAN_EQUAL,	/* scan lists relative to their size */
	SCAN_FRACT,	/* scale each type by its computed fraction */
	SCAN_ANON,	/* scan anon lists exclusively */
	SCAN_FILE,	/* scan file lists exclusively */
};
2309
2310	static void prepare_scan_control(pg_data_t pgdat, struct* scan_control *sc)
2311	{
2312	unsigned long file;
2313	struct lruvec *target_lruvec;
2314
2315	if (lru_gen_enabled())
2316	return;
2317
2318	target_lruvec = mem_cgroup_lruvec(memcg: sc->target_mem_cgroup, pgdat);
2319
2320	/*
2321	* Flush the memory cgroup stats in rate-limited way as we don't need
2322	* most accurate stats here. We may switch to regular stats flushing
2323	* in the future once it is cheap enough.
2324	*/
2325	mem_cgroup_flush_stats_ratelimited(memcg: sc->target_mem_cgroup);
2326
2327	/*
2328	* Determine the scan balance between anon and file LRUs.
2329	*/
2330	spin_lock_irq(lock: &target_lruvec->lru_lock);
2331	sc->anon_cost = target_lruvec->anon_cost;
2332	sc->file_cost = target_lruvec->file_cost;
2333	spin_unlock_irq(lock: &target_lruvec->lru_lock);
2334
2335	/*
2336	* Target desirable inactive:active list ratios for the anon
2337	* and file LRU lists.
2338	*/
2339	if (!sc->force_deactivate) {
2340	unsigned long refaults;
2341
2342	/*
2343	* When refaults are being observed, it means a new
2344	* workingset is being established. Deactivate to get
2345	* rid of any stale active pages quickly.
2346	*/
2347	refaults = lruvec_page_state(lruvec: target_lruvec,
2348	idx: WORKINGSET_ACTIVATE_ANON);
2349	if (refaults != target_lruvec->refaults[WORKINGSET_ANON] \|\|
2350	inactive_is_low(lruvec: target_lruvec, inactive_lru: LRU_INACTIVE_ANON))
2351	sc->may_deactivate \|= DEACTIVATE_ANON;
2352	else
2353	sc->may_deactivate &= ~DEACTIVATE_ANON;
2354
2355	refaults = lruvec_page_state(lruvec: target_lruvec,
2356	idx: WORKINGSET_ACTIVATE_FILE);
2357	if (refaults != target_lruvec->refaults[WORKINGSET_FILE] \|\|
2358	inactive_is_low(lruvec: target_lruvec, inactive_lru: LRU_INACTIVE_FILE))
2359	sc->may_deactivate \|= DEACTIVATE_FILE;
2360	else
2361	sc->may_deactivate &= ~DEACTIVATE_FILE;
2362	} else
2363	sc->may_deactivate = DEACTIVATE_ANON \| DEACTIVATE_FILE;
2364
2365	/*
2366	* If we have plenty of inactive file pages that aren't
2367	* thrashing, try to reclaim those first before touching
2368	* anonymous pages.
2369	*/
2370	file = lruvec_page_state(lruvec: target_lruvec, idx: NR_INACTIVE_FILE);
2371	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) &&
2372	!sc->no_cache_trim_mode)
2373	sc->cache_trim_mode = `1`;
2374	else
2375	sc->cache_trim_mode = `0`;
2376
2377	/*
2378	* Prevent the reclaimer from falling into the cache trap: as
2379	* cache pages start out inactive, every cache fault will tip
2380	* the scan balance towards the file LRU. And as the file LRU
2381	* shrinks, so does the window for rotation from references.
2382	* This means we have a runaway feedback loop where a tiny
2383	* thrashing file LRU becomes infinitely more attractive than
2384	* anon pages. Try to detect this based on file LRU size.
2385	*/
2386	if (!cgroup_reclaim(sc)) {
2387	unsigned long total_high_wmark = `0`;
2388	unsigned long free, anon;
2389	int z;
2390	struct zone *zone;
2391
2392	free = sum_zone_node_page_state(node: pgdat->node_id, item: NR_FREE_PAGES);
2393	file = node_page_state(pgdat, item: NR_ACTIVE_FILE) +
2394	node_page_state(pgdat, item: NR_INACTIVE_FILE);
2395
2396	for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - `1`) {
2397	total_high_wmark += high_wmark_pages(z: zone);
2398	}
2399
2400	/*
2401	* Consider anon: if that's low too, this isn't a
2402	* runaway file reclaim problem, but rather just
2403	* extreme pressure. Reclaim as per usual then.
2404	*/
2405	anon = node_page_state(pgdat, item: NR_INACTIVE_ANON);
2406
2407	sc->file_is_tiny =
2408	file + free <= total_high_wmark &&
2409	!(sc->may_deactivate & DEACTIVATE_ANON) &&
2410	anon >> sc->priority;
2411	}
2412	}
2413
2414	static inline void calculate_pressure_balance(struct scan_control *sc,
2415	int swappiness, u64 fraction, u64 denominator)
2416	{
2417	unsigned long anon_cost, file_cost, total_cost;
2418	unsigned long ap, fp;
2419
2420	/*
2421	* Calculate the pressure balance between anon and file pages.
2422	*
2423	* The amount of pressure we put on each LRU is inversely
2424	* proportional to the cost of reclaiming each list, as
2425	* determined by the share of pages that are refaulting, times
2426	* the relative IO cost of bringing back a swapped out
2427	* anonymous page vs reloading a filesystem page (swappiness).
2428	*
2429	* Although we limit that influence to ensure no list gets
2430	* left behind completely: at least a third of the pressure is
2431	* applied, before swappiness.
2432	*
2433	* With swappiness at 100, anon and file have equal IO cost.
2434	*/
2435	total_cost = sc->anon_cost + sc->file_cost;
2436	anon_cost = total_cost + sc->anon_cost;
2437	file_cost = total_cost + sc->file_cost;
2438	total_cost = anon_cost + file_cost;
2439
2440	ap = swappiness * (total_cost + `1`);
2441	ap /= anon_cost + `1`;
2442
2443	fp = (MAX_SWAPPINESS - swappiness) * (total_cost + `1`);
2444	fp /= file_cost + `1`;
2445
2446	fraction[WORKINGSET_ANON] = ap;
2447	fraction[WORKINGSET_FILE] = fp;
2448	*denominator = ap + fp;
2449	}
2450
2451	static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
2452	struct scan_control sc, unsigned* long scan)
2453	{
2454	unsigned long min, low;
2455
2456	mem_cgroup_protection(root: sc->target_mem_cgroup, memcg, min: &min, low: &low);
2457
2458	if (min \|\| low) {
2459	/*
2460	* Scale a cgroup's reclaim pressure by proportioning
2461	* its current usage to its memory.low or memory.min
2462	* setting.
2463	*
2464	* This is important, as otherwise scanning aggression
2465	* becomes extremely binary -- from nothing as we
2466	* approach the memory protection threshold, to totally
2467	* nominal as we exceed it. This results in requiring
2468	* setting extremely liberal protection thresholds. It
2469	* also means we simply get no protection at all if we
2470	* set it too low, which is not ideal.
2471	*
2472	* If there is any protection in place, we reduce scan
2473	* pressure by how much of the total memory used is
2474	* within protection thresholds.
2475	*
2476	* There is one special case: in the first reclaim pass,
2477	* we skip over all groups that are within their low
2478	* protection. If that fails to reclaim enough pages to
2479	* satisfy the reclaim goal, we come back and override
2480	* the best-effort low protection. However, we still
2481	* ideally want to honor how well-behaved groups are in
2482	* that case instead of simply punishing them all
2483	* equally. As such, we reclaim them based on how much
2484	* memory they are using, reducing the scan pressure
2485	* again by how much of the total memory used is under
2486	* hard protection.
2487	*/
2488	unsigned long cgroup_size = mem_cgroup_size(memcg);
2489	unsigned long protection;
2490
2491	/ memory.low scaling, make sure we retry before OOM /
2492	if (!sc->memcg_low_reclaim && low > min) {
2493	protection = low;
2494	sc->memcg_low_skipped = `1`;
2495	} else {
2496	protection = min;
2497	}
2498
2499	/ Avoid TOCTOU with earlier protection check /
2500	cgroup_size = max(cgroup_size, protection);
2501
2502	scan -= scan * protection / (cgroup_size + `1`);
2503
2504	/*
2505	* Minimally target SWAP_CLUSTER_MAX pages to keep
2506	* reclaim moving forwards, avoiding decrementing
2507	* sc->priority further than desirable.
2508	*/
2509	scan = max(scan, SWAP_CLUSTER_MAX);
2510	}
2511	return scan;
2512	}
2513
2514	/*
2515	* Determine how aggressively the anon and file LRU lists should be
2516	* scanned.
2517	*
2518	* nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
2519	* nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
2520	*/
2521	static void get_scan_count(struct lruvec lruvec, struct* scan_control *sc,
2522	unsigned long *nr)
2523	{
2524	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2525	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2526	int swappiness = sc_swappiness(sc, memcg);
2527	u64 fraction[ANON_AND_FILE];
2528	u64 denominator = `0`; / gcc /
2529	enum scan_balance scan_balance;
2530	enum lru_list lru;
2531
2532	/ If we have no swap space, do not bother scanning anon folios. /
2533	if (!sc->may_swap \|\| !can_reclaim_anon_pages(memcg, nid: pgdat->node_id, sc)) {
2534	scan_balance = SCAN_FILE;
2535	goto out;
2536	}
2537
2538	/*
2539	* Global reclaim will swap to prevent OOM even with no
2540	* swappiness, but memcg users want to use this knob to
2541	* disable swapping for individual groups completely when
2542	* using the memory controller's swap limit feature would be
2543	* too expensive.
2544	*/
2545	if (cgroup_reclaim(sc) && !swappiness) {
2546	scan_balance = SCAN_FILE;
2547	goto out;
2548	}
2549
2550	/ Proactive reclaim initiated by userspace for anonymous memory only /
2551	if (swappiness == SWAPPINESS_ANON_ONLY) {
2552	WARN_ON_ONCE(!sc->proactive);
2553	scan_balance = SCAN_ANON;
2554	goto out;
2555	}
2556
2557	/*
2558	* Do not apply any pressure balancing cleverness when the
2559	* system is close to OOM, scan both anon and file equally
2560	* (unless the swappiness setting disagrees with swapping).
2561	*/
2562	if (!sc->priority && swappiness) {
2563	scan_balance = SCAN_EQUAL;
2564	goto out;
2565	}
2566
2567	/*
2568	* If the system is almost out of file pages, force-scan anon.
2569	*/
2570	if (sc->file_is_tiny) {
2571	scan_balance = SCAN_ANON;
2572	goto out;
2573	}
2574
2575	/*
2576	* If there is enough inactive page cache, we do not reclaim
2577	* anything from the anonymous working right now to make sure
2578	* a streaming file access pattern doesn't cause swapping.
2579	*/
2580	if (sc->cache_trim_mode) {
2581	scan_balance = SCAN_FILE;
2582	goto out;
2583	}
2584
2585	scan_balance = SCAN_FRACT;
2586	calculate_pressure_balance(sc, swappiness, fraction, denominator: &denominator);
2587
2588	out:
2589	for_each_evictable_lru(lru) {
2590	bool file = is_file_lru(lru);
2591	unsigned long lruvec_size;
2592	unsigned long scan;
2593
2594	lruvec_size = lruvec_lru_size(lruvec, lru, zone_idx: sc->reclaim_idx);
2595	scan = apply_proportional_protection(memcg, sc, scan: lruvec_size);
2596	scan >>= sc->priority;
2597
2598	/*
2599	* If the cgroup's already been deleted, make sure to
2600	* scrape out the remaining cache.
2601	*/
2602	if (!scan && !mem_cgroup_online(memcg))
2603	scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2604
2605	switch (scan_balance) {
2606	case SCAN_EQUAL:
2607	/ Scan lists relative to size /
2608	break;
2609	case SCAN_FRACT:
2610	/*
2611	* Scan types proportional to swappiness and
2612	* their relative recent reclaim efficiency.
2613	* Make sure we don't miss the last page on
2614	* the offlined memory cgroups because of a
2615	* round-off error.
2616	*/
2617	scan = mem_cgroup_online(memcg) ?
2618	div64_u64(dividend: scan * fraction[file], divisor: denominator) :
2619	DIV64_U64_ROUND_UP(scan * fraction[file],
2620	denominator);
2621	break;
2622	case SCAN_FILE:
2623	case SCAN_ANON:
2624	/ Scan one type exclusively /
2625	if ((scan_balance == SCAN_FILE) != file)
2626	scan = `0`;
2627	break;
2628	default:
2629	/ Look ma, no brain /
2630	BUG();
2631	}
2632
2633	nr[lru] = scan;
2634	}
2635	}
2636
2637	/*
2638	* Anonymous LRU management is a waste if there is
2639	* ultimately no way to reclaim the memory.
2640	*/
2641	static bool can_age_anon_pages(struct lruvec *lruvec,
2642	struct scan_control *sc)
2643	{
2644	/ Aging the anon LRU is valuable if swap is present: /
2645	if (total_swap_pages > `0`)
2646	return true;
2647
2648	/ Also valuable if anon pages can be demoted: /
2649	return can_demote(nid: lruvec_pgdat(lruvec)->node_id, sc,
2650	memcg: lruvec_memcg(lruvec));
2651	}
2652
2653	#ifdef CONFIG_LRU_GEN
2654
/*
 * lru_gen_caps holds one static key per capability. The keys default to
 * true and get_cap() uses the "likely" branch form when
 * CONFIG_LRU_GEN_ENABLED is set; otherwise they default to false and
 * get_cap() uses the "unlikely" form.
 */
#ifdef CONFIG_LRU_GEN_ENABLED
DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
#define get_cap(cap)	static_branch_likely(&lru_gen_caps[cap])
#else
DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
#define get_cap(cap)	static_branch_unlikely(&lru_gen_caps[cap])
#endif
2662
2663	static bool should_walk_mmu(void)
2664	{
2665	return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK);
2666	}
2667
2668	static bool should_clear_pmd_young(void)
2669	{
2670	return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG);
2671	}
2672
/******************************************************************************
 *                          shorthand helpers
 ******************************************************************************/

/* Declare a local max_seq holding a snapshot of lrugen.max_seq. */
#define DEFINE_MAX_SEQ(lruvec)						\
	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)

/* Declare a local min_seq[] holding snapshots of the anon and file min_seq. */
#define DEFINE_MIN_SEQ(lruvec)						\
	unsigned long min_seq[ANON_AND_FILE] = {			\
		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),	\
		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),	\
	}

/* Get the min/max evictable type based on swappiness */
#define min_type(swappiness) (!(swappiness))
#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)

/* Smaller of the min_seq values of the two evictable types. */
#define evictable_min_seq(min_seq, swappiness)				\
	min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])

/* Iterate over every (generation, type, zone) combination. */
#define for_each_gen_type_zone(gen, type, zone)				\
	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)

/* Iterate over the types from min_type() to max_type() inclusive. */
#define for_each_evictable_type(type, swappiness)			\
	for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)

#define get_memcg_gen(seq)	((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin)	((bin) % MEMCG_NR_BINS)
2703
2704	static struct lruvec get_lruvec(struct* mem_cgroup memcg, int* nid)
2705	{
2706	struct pglist_data *pgdat = NODE_DATA(nid);
2707
2708	#ifdef CONFIG_MEMCG
2709	if (memcg) {
2710	struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
2711
2712	/ see the comment in mem_cgroup_lruvec() /
2713	if (!lruvec->pgdat)
2714	lruvec->pgdat = pgdat;
2715
2716	return lruvec;
2717	}
2718	#endif
2719	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
2720
2721	return &pgdat->__lruvec;
2722	}
2723
2724	static int get_swappiness(struct lruvec lruvec, struct* scan_control *sc)
2725	{
2726	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2727	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2728
2729	if (!sc->may_swap)
2730	return `0`;
2731
2732	if (!can_demote(nid: pgdat->node_id, sc, memcg) &&
2733	mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
2734	return `0`;
2735
2736	return sc_swappiness(sc, memcg);
2737	}
2738
2739	static int get_nr_gens(struct lruvec lruvec, int* type)
2740	{
2741	return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + `1`;
2742	}
2743
2744	static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
2745	{
2746	int type;
2747
2748	for (type = `0`; type < ANON_AND_FILE; type++) {
2749	int n = get_nr_gens(lruvec, type);
2750
2751	if (n < MIN_NR_GENS \|\| n > MAX_NR_GENS)
2752	return false;
2753	}
2754
2755	return true;
2756	}
2757
/******************************************************************************
 *                          Bloom filters
 ******************************************************************************/

/*
 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
 * bits in a bitmap, k is the number of hash functions and n is the number of
 * inserted items.
 *
 * Page table walkers use one of the two filters to reduce their search space.
 * To get rid of non-leaf entries that no longer have enough leaf entries, the
 * aging uses the double-buffering technique to flip to the other filter each
 * time it produces a new generation. For non-leaf entries that have enough
 * leaf entries, the aging carries them over to the next generation in
 * walk_pmd_range(); the eviction also report them when walking the rmap
 * in lru_gen_look_around().
 *
 * For future optimizations:
 * 1. It's not necessary to keep both filters all the time. The spare one can be
 *    freed after the RCU grace period and reallocated if needed again.
 * 2. And when reallocating, it's worth scaling its size according to the number
 *    of inserted entries in the other filter, to reduce the memory overhead on
 *    small systems and false positives on large systems.
 * 3. Jenkins' hash function is an alternative to Knuth's.
 */
#define BLOOM_FILTER_SHIFT	15
2785
2786	static inline int filter_gen_from_seq(unsigned long seq)
2787	{
2788	return seq % NR_BLOOM_FILTERS;
2789	}
2790
2791	static void get_item_key(void item, int* *key)
2792	{
2793	u32 hash = hash_ptr(ptr: item, BLOOM_FILTER_SHIFT * `2`);
2794
2795	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * `2` > BITS_PER_TYPE(u32));
2796
2797	key[`0`] = hash & (BIT(BLOOM_FILTER_SHIFT) - `1`);
2798	key[`1`] = hash >> BLOOM_FILTER_SHIFT;
2799	}
2800
2801	static bool test_bloom_filter(struct lru_gen_mm_state mm_state, unsigned* long seq,
2802	void *item)
2803	{
2804	int key[`2`];
2805	unsigned long *filter;
2806	int gen = filter_gen_from_seq(seq);
2807
2808	filter = READ_ONCE(mm_state->filters[gen]);
2809	if (!filter)
2810	return true;
2811
2812	get_item_key(item, key);
2813
2814	return test_bit(key[`0`], filter) && test_bit(key[`1`], filter);
2815	}
2816
2817	static void update_bloom_filter(struct lru_gen_mm_state mm_state, unsigned* long seq,
2818	void *item)
2819	{
2820	int key[`2`];
2821	unsigned long *filter;
2822	int gen = filter_gen_from_seq(seq);
2823
2824	filter = READ_ONCE(mm_state->filters[gen]);
2825	if (!filter)
2826	return;
2827
2828	get_item_key(item, key);
2829
2830	if (!test_bit(key[`0`], filter))
2831	set_bit(nr: key[`0`], addr: filter);
2832	if (!test_bit(key[`1`], filter))
2833	set_bit(nr: key[`1`], addr: filter);
2834	}
2835
2836	static void reset_bloom_filter(struct lru_gen_mm_state mm_state, unsigned* long seq)
2837	{
2838	unsigned long *filter;
2839	int gen = filter_gen_from_seq(seq);
2840
2841	filter = mm_state->filters[gen];
2842	if (filter) {
2843	bitmap_clear(map: filter, start: `0`, BIT(BLOOM_FILTER_SHIFT));
2844	return;
2845	}
2846
2847	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
2848	__GFP_HIGH \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
2849	WRITE_ONCE(mm_state->filters[gen], filter);
2850	}
2851
2852	/******************************************************************************
2853	* mm_struct list
2854	******************************************************************************/
2855
2856	#ifdef CONFIG_LRU_GEN_WALKS_MMU
2857
2858	static struct lru_gen_mm_list get_mm_list(struct* mem_cgroup *memcg)
2859	{
2860	static struct lru_gen_mm_list mm_list = {
2861	.fifo = LIST_HEAD_INIT(mm_list.fifo),
2862	.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
2863	};
2864
2865	#ifdef CONFIG_MEMCG
2866	if (memcg)
2867	return &memcg->mm_list;
2868	#endif
2869	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
2870
2871	return &mm_list;
2872	}
2873
2874	static struct lru_gen_mm_state get_mm_state(struct* lruvec *lruvec)
2875	{
2876	return &lruvec->mm_state;
2877	}
2878
2879	static struct mm_struct get_next_mm(struct* lru_gen_mm_walk *walk)
2880	{
2881	int key;
2882	struct mm_struct *mm;
2883	struct pglist_data *pgdat = lruvec_pgdat(lruvec: walk->lruvec);
2884	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec: walk->lruvec);
2885
2886	mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
2887	key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
2888
2889	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
2890	return NULL;
2891
2892	clear_bit(nr: key, addr: &mm->lru_gen.bitmap);
2893
2894	return mmget_not_zero(mm) ? mm : NULL;
2895	}
2896
2897	void lru_gen_add_mm(struct mm_struct *mm)
2898	{
2899	int nid;
2900	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
2901	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
2902
2903	VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
2904	#ifdef CONFIG_MEMCG
2905	VM_WARN_ON_ONCE(mm->lru_gen.memcg);
2906	mm->lru_gen.memcg = memcg;
2907	#endif
2908	spin_lock(lock: &mm_list->lock);
2909
2910	for_each_node_state(nid, N_MEMORY) {
2911	struct lruvec *lruvec = get_lruvec(memcg, nid);
2912	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2913
2914	/ the first addition since the last iteration /
2915	if (mm_state->tail == &mm_list->fifo)
2916	mm_state->tail = &mm->lru_gen.list;
2917	}
2918
2919	list_add_tail(new: &mm->lru_gen.list, head: &mm_list->fifo);
2920
2921	spin_unlock(lock: &mm_list->lock);
2922	}
2923
2924	void lru_gen_del_mm(struct mm_struct *mm)
2925	{
2926	int nid;
2927	struct lru_gen_mm_list *mm_list;
2928	struct mem_cgroup *memcg = NULL;
2929
2930	if (list_empty(head: &mm->lru_gen.list))
2931	return;
2932
2933	#ifdef CONFIG_MEMCG
2934	memcg = mm->lru_gen.memcg;
2935	#endif
2936	mm_list = get_mm_list(memcg);
2937
2938	spin_lock(lock: &mm_list->lock);
2939
2940	for_each_node(nid) {
2941	struct lruvec *lruvec = get_lruvec(memcg, nid);
2942	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2943
2944	/ where the current iteration continues after /
2945	if (mm_state->head == &mm->lru_gen.list)
2946	mm_state->head = mm_state->head->prev;
2947
2948	/ where the last iteration ended before /
2949	if (mm_state->tail == &mm->lru_gen.list)
2950	mm_state->tail = mm_state->tail->next;
2951	}
2952
2953	list_del_init(entry: &mm->lru_gen.list);
2954
2955	spin_unlock(lock: &mm_list->lock);
2956
2957	#ifdef CONFIG_MEMCG
2958	mem_cgroup_put(memcg: mm->lru_gen.memcg);
2959	mm->lru_gen.memcg = NULL;
2960	#endif
2961	}
2962
2963	#ifdef CONFIG_MEMCG
2964	void lru_gen_migrate_mm(struct mm_struct *mm)
2965	{
2966	struct mem_cgroup *memcg;
2967	struct task_struct *task = rcu_dereference_protected(mm->owner, true);
2968
2969	VM_WARN_ON_ONCE(task->mm != mm);
2970	lockdep_assert_held(&task->alloc_lock);
2971
2972	/ for mm_update_next_owner() /
2973	if (mem_cgroup_disabled())
2974	return;
2975
2976	/ migration can happen before addition /
2977	if (!mm->lru_gen.memcg)
2978	return;
2979
2980	rcu_read_lock();
2981	memcg = mem_cgroup_from_task(p: task);
2982	rcu_read_unlock();
2983	if (memcg == mm->lru_gen.memcg)
2984	return;
2985
2986	VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
2987
2988	lru_gen_del_mm(mm);
2989	lru_gen_add_mm(mm);
2990	}
2991	#endif
2992
2993	#else /* !CONFIG_LRU_GEN_WALKS_MMU */
2994
2995	static struct lru_gen_mm_list get_mm_list(struct* mem_cgroup *memcg)
2996	{
2997	return NULL;
2998	}
2999
3000	static struct lru_gen_mm_state get_mm_state(struct* lruvec *lruvec)
3001	{
3002	return NULL;
3003	}
3004
3005	static struct mm_struct get_next_mm(struct* lru_gen_mm_walk *walk)
3006	{
3007	return NULL;
3008	}
3009
3010	#endif
3011
3012	static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last)
3013	{
3014	int i;
3015	int hist;
3016	struct lruvec *lruvec = walk->lruvec;
3017	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3018
3019	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
3020
3021	hist = lru_hist_from_seq(seq: walk->seq);
3022
3023	for (i = `0`; i < NR_MM_STATS; i++) {
3024	WRITE_ONCE(mm_state->stats[hist][i],
3025	mm_state->stats[hist][i] + walk->mm_stats[i]);
3026	walk->mm_stats[i] = `0`;
3027	}
3028
3029	if (NR_HIST_GENS > `1` && last) {
3030	hist = lru_hist_from_seq(seq: walk->seq + `1`);
3031
3032	for (i = `0`; i < NR_MM_STATS; i++)
3033	WRITE_ONCE(mm_state->stats[hist][i], `0`);
3034	}
3035	}
3036
3037	static bool iterate_mm_list(struct lru_gen_mm_walk walk, struct* mm_struct **iter)
3038	{
3039	bool first = false;
3040	bool last = false;
3041	struct mm_struct *mm = NULL;
3042	struct lruvec *lruvec = walk->lruvec;
3043	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3044	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3045	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3046
3047	/*
3048	* mm_state->seq is incremented after each iteration of mm_list. There
3049	* are three interesting cases for this page table walker:
3050	* 1. It tries to start a new iteration with a stale max_seq: there is
3051	* nothing left to do.
3052	* 2. It started the next iteration: it needs to reset the Bloom filter
3053	* so that a fresh set of PTE tables can be recorded.
3054	* 3. It ended the current iteration: it needs to reset the mm stats
3055	* counters and tell its caller to increment max_seq.
3056	*/
3057	spin_lock(lock: &mm_list->lock);
3058
3059	VM_WARN_ON_ONCE(mm_state->seq + `1` < walk->seq);
3060
3061	if (walk->seq <= mm_state->seq)
3062	goto done;
3063
3064	if (!mm_state->head)
3065	mm_state->head = &mm_list->fifo;
3066
3067	if (mm_state->head == &mm_list->fifo)
3068	first = true;
3069
3070	do {
3071	mm_state->head = mm_state->head->next;
3072	if (mm_state->head == &mm_list->fifo) {
3073	WRITE_ONCE(mm_state->seq, mm_state->seq + `1`);
3074	last = true;
3075	break;
3076	}
3077
3078	/ force scan for those added after the last iteration /
3079	if (!mm_state->tail \|\| mm_state->tail == mm_state->head) {
3080	mm_state->tail = mm_state->head->next;
3081	walk->force_scan = true;
3082	}
3083	} while (!(mm = get_next_mm(walk)));
3084	done:
3085	if (*iter \|\| last)
3086	reset_mm_stats(walk, last);
3087
3088	spin_unlock(lock: &mm_list->lock);
3089
3090	if (mm && first)
3091	reset_bloom_filter(mm_state, seq: walk->seq + `1`);
3092
3093	if (*iter)
3094	mmput_async(*iter);
3095
3096	*iter = mm;
3097
3098	return last;
3099	}
3100
3101	static bool iterate_mm_list_nowalk(struct lruvec lruvec, unsigned* long seq)
3102	{
3103	bool success = false;
3104	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3105	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3106	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3107
3108	spin_lock(lock: &mm_list->lock);
3109
3110	VM_WARN_ON_ONCE(mm_state->seq + `1` < seq);
3111
3112	if (seq > mm_state->seq) {
3113	mm_state->head = NULL;
3114	mm_state->tail = NULL;
3115	WRITE_ONCE(mm_state->seq, mm_state->seq + `1`);
3116	success = true;
3117	}
3118
3119	spin_unlock(lock: &mm_list->lock);
3120
3121	return success;
3122	}
3123
3124	/******************************************************************************
3125	* PID controller
3126	******************************************************************************/
3127
3128	/*
3129	* A feedback loop based on Proportional-Integral-Derivative (PID) controller.
3130	*
3131	* The P term is refaulted/(evicted+protected) from a tier in the generation
3132	* currently being evicted; the I term is the exponential moving average of the
3133	* P term over the generations previously evicted, using the smoothing factor
3134	* 1/2; the D term isn't supported.
3135	*
3136	* The setpoint (SP) is always the first tier of one type; the process variable
3137	* (PV) is either any tier of the other type or any other tier of the same
3138	* type.
3139	*
3140	* The error is the difference between the SP and the PV; the correction is to
3141	* turn off protection when SP>PV or turn on protection when SP<PV.
3142	*
3143	* For future optimizations:
3144	* 1. The D term may discount the other two terms over time so that long-lived
3145	* generations can resist stale information.
3146	*/
/* One operand (SP or PV) of the feedback loop; filled in by read_ctrl_pos(). */
struct ctrl_pos {
	/* averaged plus current refault count summed over the selected tiers */
	unsigned long refaulted;
	/* averaged total plus current protected and evicted counts */
	unsigned long total;
	/* multiplier applied to this operand in positive_ctrl_err() */
	int gain;
};
3152
3153	static void read_ctrl_pos(struct lruvec lruvec, int* type, int tier, int gain,
3154	struct ctrl_pos *pos)
3155	{
3156	int i;
3157	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3158	int hist = lru_hist_from_seq(seq: lrugen->min_seq[type]);
3159
3160	pos->gain = gain;
3161	pos->refaulted = pos->total = `0`;
3162
3163	for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - `1`); i++) {
3164	pos->refaulted += lrugen->avg_refaulted[type][i] +
3165	atomic_long_read(v: &lrugen->refaulted[hist][type][i]);
3166	pos->total += lrugen->avg_total[type][i] +
3167	lrugen->protected[hist][type][i] +
3168	atomic_long_read(v: &lrugen->evicted[hist][type][i]);
3169	}
3170	}
3171
3172	static void reset_ctrl_pos(struct lruvec lruvec, int* type, bool carryover)
3173	{
3174	int hist, tier;
3175	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3176	bool clear = carryover ? NR_HIST_GENS == `1` : NR_HIST_GENS > `1`;
3177	unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + `1`;
3178
3179	lockdep_assert_held(&lruvec->lru_lock);
3180
3181	if (!carryover && !clear)
3182	return;
3183
3184	hist = lru_hist_from_seq(seq);
3185
3186	for (tier = `0`; tier < MAX_NR_TIERS; tier++) {
3187	if (carryover) {
3188	unsigned long sum;
3189
3190	sum = lrugen->avg_refaulted[type][tier] +
3191	atomic_long_read(v: &lrugen->refaulted[hist][type][tier]);
3192	WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / `2`);
3193
3194	sum = lrugen->avg_total[type][tier] +
3195	lrugen->protected[hist][type][tier] +
3196	atomic_long_read(v: &lrugen->evicted[hist][type][tier]);
3197	WRITE_ONCE(lrugen->avg_total[type][tier], sum / `2`);
3198	}
3199
3200	if (clear) {
3201	atomic_long_set(v: &lrugen->refaulted[hist][type][tier], i: `0`);
3202	atomic_long_set(v: &lrugen->evicted[hist][type][tier], i: `0`);
3203	WRITE_ONCE(lrugen->protected[hist][type][tier], `0`);
3204	}
3205	}
3206	}
3207
3208	static bool positive_ctrl_err(struct ctrl_pos sp, struct* ctrl_pos *pv)
3209	{
3210	/*
3211	* Return true if the PV has a limited number of refaults or a lower
3212	* refaulted/total than the SP.
3213	*/
3214	return pv->refaulted < MIN_LRU_BATCH \|\|
3215	pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
3216	(sp->refaulted + `1`) * pv->total * pv->gain;
3217	}
3218
3219	/******************************************************************************
3220	* the aging
3221	******************************************************************************/
3222
3223	/ promote pages accessed through page tables /
3224	static int folio_update_gen(struct folio folio, int* gen)
3225	{
3226	unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
3227
3228	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
3229
3230	/ see the comment on LRU_REFS_FLAGS /
3231	if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
3232	set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
3233	return -`1`;
3234	}
3235
3236	do {
3237	/ lru_gen_del_folio() has isolated this page? /
3238	if (!(old_flags & LRU_GEN_MASK))
3239	return -`1`;
3240
3241	new_flags = old_flags & ~(LRU_GEN_MASK \| LRU_REFS_FLAGS);
3242	new_flags \|= ((gen + `1UL`) << LRU_GEN_PGOFF) \| BIT(PG_workingset);
3243	} while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
3244
3245	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - `1`;
3246	}
3247
3248	/ protect pages accessed multiple times through file descriptors /
3249	static int folio_inc_gen(struct lruvec lruvec, struct* folio *folio, bool reclaiming)
3250	{
3251	int type = folio_is_file_lru(folio);
3252	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3253	int new_gen, old_gen = lru_gen_from_seq(seq: lrugen->min_seq[type]);
3254	unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
3255
3256	VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
3257
3258	do {
3259	new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - `1`;
3260	/ folio_update_gen() has promoted this page? /
3261	if (new_gen >= `0` && new_gen != old_gen)
3262	return new_gen;
3263
3264	new_gen = (old_gen + `1`) % MAX_NR_GENS;
3265
3266	new_flags = old_flags & ~(LRU_GEN_MASK \| LRU_REFS_FLAGS);
3267	new_flags \|= (new_gen + `1UL`) << LRU_GEN_PGOFF;
3268	/ for folio_end_writeback() /
3269	if (reclaiming)
3270	new_flags \|= BIT(PG_reclaim);
3271	} while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
3272
3273	lru_gen_update_size(lruvec, folio, old_gen, new_gen);
3274
3275	return new_gen;
3276	}
3277
3278	static void update_batch_size(struct lru_gen_mm_walk walk, struct* folio *folio,
3279	int old_gen, int new_gen)
3280	{
3281	int type = folio_is_file_lru(folio);
3282	int zone = folio_zonenum(folio);
3283	int delta = folio_nr_pages(folio);
3284
3285	VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
3286	VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
3287
3288	walk->batched++;
3289
3290	walk->nr_pages[old_gen][type][zone] -= delta;
3291	walk->nr_pages[new_gen][type][zone] += delta;
3292	}
3293
3294	static void reset_batch_size(struct lru_gen_mm_walk *walk)
3295	{
3296	int gen, type, zone;
3297	struct lruvec *lruvec = walk->lruvec;
3298	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3299
3300	walk->batched = `0`;
3301
3302	for_each_gen_type_zone(gen, type, zone) {
3303	enum lru_list lru = type * LRU_INACTIVE_FILE;
3304	int delta = walk->nr_pages[gen][type][zone];
3305
3306	if (!delta)
3307	continue;
3308
3309	walk->nr_pages[gen][type][zone] = `0`;
3310	WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
3311	lrugen->nr_pages[gen][type][zone] + delta);
3312
3313	if (lru_gen_is_active(lruvec, gen))
3314	lru += LRU_ACTIVE;
3315	__update_lru_size(lruvec, lru, zone, delta);
3316	}
3317	}
3318
3319	static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
3320	{
3321	struct address_space *mapping;
3322	struct vm_area_struct *vma = args->vma;
3323	struct lru_gen_mm_walk *walk = args->private;
3324
3325	if (!vma_is_accessible(vma))
3326	return true;
3327
3328	if (is_vm_hugetlb_page(vma))
3329	return true;
3330
3331	if (!vma_has_recency(vma))
3332	return true;
3333
3334	if (vma->vm_flags & (VM_LOCKED \| VM_SPECIAL))
3335	return true;
3336
3337	if (vma == get_gate_vma(mm: vma->vm_mm))
3338	return true;
3339
3340	if (vma_is_anonymous(vma))
3341	return !walk->swappiness;
3342
3343	if (WARN_ON_ONCE(!vma->vm_file \|\| !vma->vm_file->f_mapping))
3344	return true;
3345
3346	mapping = vma->vm_file->f_mapping;
3347	if (mapping_unevictable(mapping))
3348	return true;
3349
3350	if (shmem_mapping(mapping))
3351	return !walk->swappiness;
3352
3353	if (walk->swappiness > MAX_SWAPPINESS)
3354	return true;
3355
3356	/ to exclude special mappings like dax, etc. /
3357	return !mapping->a_ops->read_folio;
3358	}
3359
3360	/*
3361	* Some userspace memory allocators map many single-page VMAs. Instead of
3362	* returning back to the PGD table for each of such VMAs, finish an entire PMD
3363	* table to reduce zigzags and improve cache performance.
3364	*/
3365	static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
3366	unsigned long vm_start, unsigned* long *vm_end)
3367	{
3368	unsigned long start = round_up(*vm_end, size);
3369	unsigned long end = (start \| ~mask) + `1`;
3370	VMA_ITERATOR(vmi, args->mm, start);
3371
3372	VM_WARN_ON_ONCE(mask & size);
3373	VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
3374
3375	for_each_vma(vmi, args->vma) {
3376	if (end && end <= args->vma->vm_start)
3377	return false;
3378
3379	if (should_skip_vma(start: args->vma->vm_start, end: args->vma->vm_end, args))
3380	continue;
3381
3382	*vm_start = max(start, args->vma->vm_start);
3383	*vm_end = min(end - `1`, args->vma->vm_end - `1`) + `1`;
3384
3385	return true;
3386	}
3387
3388	return false;
3389	}
3390
3391	static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct vma, unsigned* long addr,
3392	struct pglist_data *pgdat)
3393	{
3394	unsigned long pfn = pte_pfn(pte);
3395
3396	VM_WARN_ON_ONCE(addr < vma->vm_start \|\| addr >= vma->vm_end);
3397
3398	if (!pte_present(a: pte) \|\| is_zero_pfn(pfn))
3399	return -`1`;
3400
3401	if (WARN_ON_ONCE(pte_special(pte)))
3402	return -`1`;
3403
3404	if (!pte_young(pte) && !mm_has_notifiers(mm: vma->vm_mm))
3405	return -`1`;
3406
3407	if (WARN_ON_ONCE(!pfn_valid(pfn)))
3408	return -`1`;
3409
3410	if (pfn < pgdat->node_start_pfn \|\| pfn >= pgdat_end_pfn(pgdat))
3411	return -`1`;
3412
3413	return pfn;
3414	}
3415
3416	static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct vma, unsigned* long addr,
3417	struct pglist_data *pgdat)
3418	{
3419	unsigned long pfn = pmd_pfn(pmd);
3420
3421	VM_WARN_ON_ONCE(addr < vma->vm_start \|\| addr >= vma->vm_end);
3422
3423	if (!pmd_present(pmd) \|\| is_huge_zero_pmd(pmd))
3424	return -`1`;
3425
3426	if (!pmd_young(pmd) && !mm_has_notifiers(mm: vma->vm_mm))
3427	return -`1`;
3428
3429	if (WARN_ON_ONCE(!pfn_valid(pfn)))
3430	return -`1`;
3431
3432	if (pfn < pgdat->node_start_pfn \|\| pfn >= pgdat_end_pfn(pgdat))
3433	return -`1`;
3434
3435	return pfn;
3436	}
3437
3438	static struct folio get_pfn_folio(unsigned* long pfn, struct mem_cgroup *memcg,
3439	struct pglist_data *pgdat)
3440	{
3441	struct folio *folio = pfn_folio(pfn);
3442
3443	if (folio_lru_gen(folio) < `0`)
3444	return NULL;
3445
3446	if (folio_nid(folio) != pgdat->node_id)
3447	return NULL;
3448
3449	if (folio_memcg(folio) != memcg)
3450	return NULL;
3451
3452	return folio;
3453	}
3454
3455	static bool suitable_to_scan(int total, int young)
3456	{
3457	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), `2`, `8`);
3458
3459	/ suitable if the average number of young PTEs per cacheline is >=1 /
3460	return young * n >= total;
3461	}
3462
3463	static void walk_update_folio(struct lru_gen_mm_walk walk, struct* folio *folio,
3464	int new_gen, bool dirty)
3465	{
3466	int old_gen;
3467
3468	if (!folio)
3469	return;
3470
3471	if (dirty && !folio_test_dirty(folio) &&
3472	!(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
3473	!folio_test_swapcache(folio)))
3474	folio_mark_dirty(folio);
3475
3476	if (walk) {
3477	old_gen = folio_update_gen(folio, gen: new_gen);
3478	if (old_gen >= `0` && old_gen != new_gen)
3479	update_batch_size(walk, folio, old_gen, new_gen);
3480	} else if (lru_gen_set_refs(folio)) {
3481	old_gen = folio_lru_gen(folio);
3482	if (old_gen >= `0` && old_gen != new_gen)
3483	folio_activate(folio);
3484	}
3485	}
3486
3487	static bool walk_pte_range(pmd_t pmd, unsigned* long start, unsigned long end,
3488	struct mm_walk *args)
3489	{
3490	int i;
3491	bool dirty;
3492	pte_t *pte;
3493	spinlock_t *ptl;
3494	unsigned long addr;
3495	int total = `0`;
3496	int young = `0`;
3497	struct folio *last = NULL;
3498	struct lru_gen_mm_walk *walk = args->private;
3499	struct mem_cgroup *memcg = lruvec_memcg(lruvec: walk->lruvec);
3500	struct pglist_data *pgdat = lruvec_pgdat(lruvec: walk->lruvec);
3501	DEFINE_MAX_SEQ(walk->lruvec);
3502	int gen = lru_gen_from_seq(seq: max_seq);
3503	pmd_t pmdval;
3504
3505	pte = pte_offset_map_rw_nolock(mm: args->mm, pmd, addr: start & PMD_MASK, pmdvalp: &pmdval, ptlp: &ptl);
3506	if (!pte)
3507	return false;
3508
3509	if (!spin_trylock(lock: ptl)) {
3510	pte_unmap(pte);
3511	return true;
3512	}
3513
3514	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
3515	pte_unmap_unlock(pte, ptl);
3516	return false;
3517	}
3518
3519	arch_enter_lazy_mmu_mode();
3520	restart:
3521	for (i = pte_index(address: start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
3522	unsigned long pfn;
3523	struct folio *folio;
3524	pte_t ptent = ptep_get(ptep: pte + i);
3525
3526	total++;
3527	walk->mm_stats[MM_LEAF_TOTAL]++;
3528
3529	pfn = get_pte_pfn(pte: ptent, vma: args->vma, addr, pgdat);
3530	if (pfn == -`1`)
3531	continue;
3532
3533	folio = get_pfn_folio(pfn, memcg, pgdat);
3534	if (!folio)
3535	continue;
3536
3537	if (!ptep_clear_young_notify(args->vma, addr, pte + i))
3538	continue;
3539
3540	if (last != folio) {
3541	walk_update_folio(walk, folio: last, new_gen: gen, dirty);
3542
3543	last = folio;
3544	dirty = false;
3545	}
3546
3547	if (pte_dirty(pte: ptent))
3548	dirty = true;
3549
3550	young++;
3551	walk->mm_stats[MM_LEAF_YOUNG]++;
3552	}
3553
3554	walk_update_folio(walk, folio: last, new_gen: gen, dirty);
3555	last = NULL;
3556
3557	if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, vm_start: &start, vm_end: &end))
3558	goto restart;
3559
3560	arch_leave_lazy_mmu_mode();
3561	pte_unmap_unlock(pte, ptl);
3562
3563	return suitable_to_scan(total, young);
3564	}
3565
3566	static void walk_pmd_range_locked(pud_t pud, unsigned* long addr, struct vm_area_struct *vma,
3567	struct mm_walk args, unsigned* long bitmap, unsigned* long *first)
3568	{
3569	int i;
3570	bool dirty;
3571	pmd_t *pmd;
3572	spinlock_t *ptl;
3573	struct folio *last = NULL;
3574	struct lru_gen_mm_walk *walk = args->private;
3575	struct mem_cgroup *memcg = lruvec_memcg(lruvec: walk->lruvec);
3576	struct pglist_data *pgdat = lruvec_pgdat(lruvec: walk->lruvec);
3577	DEFINE_MAX_SEQ(walk->lruvec);
3578	int gen = lru_gen_from_seq(seq: max_seq);
3579
3580	VM_WARN_ON_ONCE(pud_leaf(*pud));
3581
3582	/ try to batch at most 1+MIN_LRU_BATCH+1 entries /
3583	if (*first == -`1`) {
3584	*first = addr;
3585	bitmap_zero(dst: bitmap, MIN_LRU_BATCH);
3586	return;
3587	}
3588
3589	i = addr == -`1` ? `0` : pmd_index(address: addr) - pmd_index(address: *first);
3590	if (i && i <= MIN_LRU_BATCH) {
3591	__set_bit(i - `1`, bitmap);
3592	return;
3593	}
3594
3595	pmd = pmd_offset(pud, address: *first);
3596
3597	ptl = pmd_lockptr(mm: args->mm, pmd);
3598	if (!spin_trylock(lock: ptl))
3599	goto done;
3600
3601	arch_enter_lazy_mmu_mode();
3602
3603	do {
3604	unsigned long pfn;
3605	struct folio *folio;
3606
3607	/ don't round down the first address /
3608	addr = i ? (first & PMD_MASK) + i PMD_SIZE : *first;
3609
3610	if (!pmd_present(pmd: pmd[i]))
3611	goto next;
3612
3613	if (!pmd_trans_huge(pmd: pmd[i])) {
3614	if (!walk->force_scan && should_clear_pmd_young() &&
3615	!mm_has_notifiers(mm: args->mm))
3616	pmdp_test_and_clear_young(vma, addr, pmdp: pmd + i);
3617	goto next;
3618	}
3619
3620	pfn = get_pmd_pfn(pmd: pmd[i], vma, addr, pgdat);
3621	if (pfn == -`1`)
3622	goto next;
3623
3624	folio = get_pfn_folio(pfn, memcg, pgdat);
3625	if (!folio)
3626	goto next;
3627
3628	if (!pmdp_clear_young_notify(vma, addr, pmd + i))
3629	goto next;
3630
3631	if (last != folio) {
3632	walk_update_folio(walk, folio: last, new_gen: gen, dirty);
3633
3634	last = folio;
3635	dirty = false;
3636	}
3637
3638	if (pmd_dirty(pmd: pmd[i]))
3639	dirty = true;
3640
3641	walk->mm_stats[MM_LEAF_YOUNG]++;
3642	next:
3643	i = i > MIN_LRU_BATCH ? `0` : find_next_bit(addr: bitmap, MIN_LRU_BATCH, offset: i) + `1`;
3644	} while (i <= MIN_LRU_BATCH);
3645
3646	walk_update_folio(walk, folio: last, new_gen: gen, dirty);
3647
3648	arch_leave_lazy_mmu_mode();
3649	spin_unlock(lock: ptl);
3650	done:
3651	*first = -`1`;
3652	}
3653
3654	static void walk_pmd_range(pud_t pud, unsigned* long start, unsigned long end,
3655	struct mm_walk *args)
3656	{
3657	int i;
3658	pmd_t *pmd;
3659	unsigned long next;
3660	unsigned long addr;
3661	struct vm_area_struct *vma;
3662	DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
3663	unsigned long first = -`1`;
3664	struct lru_gen_mm_walk *walk = args->private;
3665	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec: walk->lruvec);
3666
3667	VM_WARN_ON_ONCE(pud_leaf(*pud));
3668
3669	/*
3670	* Finish an entire PMD in two passes: the first only reaches to PTE
3671	* tables to avoid taking the PMD lock; the second, if necessary, takes
3672	* the PMD lock to clear the accessed bit in PMD entries.
3673	*/
3674	pmd = pmd_offset(pud, address: start & PUD_MASK);
3675	restart:
3676	/ walk_pte_range() may call get_next_vma() /
3677	vma = args->vma;
3678	for (i = pmd_index(address: start), addr = start; addr != end; i++, addr = next) {
3679	pmd_t val = pmdp_get_lockless(pmdp: pmd + i);
3680
3681	next = pmd_addr_end(addr, end);
3682
3683	if (!pmd_present(pmd: val) \|\| is_huge_zero_pmd(pmd: val)) {
3684	walk->mm_stats[MM_LEAF_TOTAL]++;
3685	continue;
3686	}
3687
3688	if (pmd_trans_huge(pmd: val)) {
3689	struct pglist_data *pgdat = lruvec_pgdat(lruvec: walk->lruvec);
3690	unsigned long pfn = get_pmd_pfn(pmd: val, vma, addr, pgdat);
3691
3692	walk->mm_stats[MM_LEAF_TOTAL]++;
3693
3694	if (pfn != -`1`)
3695	walk_pmd_range_locked(pud, addr, vma, args, bitmap, first: &first);
3696	continue;
3697	}
3698
3699	if (!walk->force_scan && should_clear_pmd_young() &&
3700	!mm_has_notifiers(mm: args->mm)) {
3701	if (!pmd_young(pmd: val))
3702	continue;
3703
3704	walk_pmd_range_locked(pud, addr, vma, args, bitmap, first: &first);
3705	}
3706
3707	if (!walk->force_scan && !test_bloom_filter(mm_state, seq: walk->seq, item: pmd + i))
3708	continue;
3709
3710	walk->mm_stats[MM_NONLEAF_FOUND]++;
3711
3712	if (!walk_pte_range(pmd: &val, start: addr, end: next, args))
3713	continue;
3714
3715	walk->mm_stats[MM_NONLEAF_ADDED]++;
3716
3717	/ carry over to the next generation /
3718	update_bloom_filter(mm_state, seq: walk->seq + `1`, item: pmd + i);
3719	}
3720
3721	walk_pmd_range_locked(pud, addr: -`1`, vma, args, bitmap, first: &first);
3722
3723	if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, vm_start: &start, vm_end: &end))
3724	goto restart;
3725	}
3726
3727	static int walk_pud_range(p4d_t p4d, unsigned* long start, unsigned long end,
3728	struct mm_walk *args)
3729	{
3730	int i;
3731	pud_t *pud;
3732	unsigned long addr;
3733	unsigned long next;
3734	struct lru_gen_mm_walk *walk = args->private;
3735
3736	VM_WARN_ON_ONCE(p4d_leaf(*p4d));
3737
3738	pud = pud_offset(p4d, address: start & P4D_MASK);
3739	restart:
3740	for (i = pud_index(address: start), addr = start; addr != end; i++, addr = next) {
3741	pud_t val = pudp_get(pudp: pud + i);
3742
3743	next = pud_addr_end(addr, end);
3744
3745	if (!pud_present(pud: val) \|\| WARN_ON_ONCE(pud_leaf(val)))
3746	continue;
3747
3748	walk_pmd_range(pud: &val, start: addr, end: next, args);
3749
3750	if (need_resched() \|\| walk->batched >= MAX_LRU_BATCH) {
3751	end = (addr \| ~PUD_MASK) + `1`;
3752	goto done;
3753	}
3754	}
3755
3756	if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, vm_start: &start, vm_end: &end))
3757	goto restart;
3758
3759	end = round_up(end, P4D_SIZE);
3760	done:
3761	if (!end \|\| !args->vma)
3762	return `1`;
3763
3764	walk->next_addr = max(end, args->vma->vm_start);
3765
3766	return -EAGAIN;
3767	}
3768
3769	static void walk_mm(struct mm_struct mm, struct* lru_gen_mm_walk *walk)
3770	{
3771	static const struct mm_walk_ops mm_walk_ops = {
3772	.test_walk = should_skip_vma,
3773	.p4d_entry = walk_pud_range,
3774	.walk_lock = PGWALK_RDLOCK,
3775	};
3776	int err;
3777	struct lruvec *lruvec = walk->lruvec;
3778
3779	walk->next_addr = FIRST_USER_ADDRESS;
3780
3781	do {
3782	DEFINE_MAX_SEQ(lruvec);
3783
3784	err = -EBUSY;
3785
3786	/ another thread might have called inc_max_seq() /
3787	if (walk->seq != max_seq)
3788	break;
3789
3790	/ the caller might be holding the lock for write /
3791	if (mmap_read_trylock(mm)) {
3792	err = walk_page_range(mm, start: walk->next_addr, ULONG_MAX, ops: &mm_walk_ops, private: walk);
3793
3794	mmap_read_unlock(mm);
3795	}
3796
3797	if (walk->batched) {
3798	spin_lock_irq(lock: &lruvec->lru_lock);
3799	reset_batch_size(walk);
3800	spin_unlock_irq(lock: &lruvec->lru_lock);
3801	}
3802
3803	cond_resched();
3804	} while (err == -EAGAIN);
3805	}
3806
3807	static struct lru_gen_mm_walk set_mm_walk(struct* pglist_data *pgdat, bool force_alloc)
3808	{
3809	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
3810
3811	if (pgdat && current_is_kswapd()) {
3812	VM_WARN_ON_ONCE(walk);
3813
3814	walk = &pgdat->mm_walk;
3815	} else if (!walk && force_alloc) {
3816	VM_WARN_ON_ONCE(current_is_kswapd());
3817
3818	walk = kzalloc(sizeof(*walk), __GFP_HIGH \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
3819	}
3820
3821	current->reclaim_state->mm_walk = walk;
3822
3823	return walk;
3824	}
3825
3826	static void clear_mm_walk(void)
3827	{
3828	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
3829
3830	VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, `0`, sizeof(walk->nr_pages)));
3831	VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, `0`, sizeof(walk->mm_stats)));
3832
3833	current->reclaim_state->mm_walk = NULL;
3834
3835	if (!current_is_kswapd())
3836	kfree(objp: walk);
3837	}
3838
3839	static bool inc_min_seq(struct lruvec lruvec, int* type, int swappiness)
3840	{
3841	int zone;
3842	int remaining = MAX_LRU_BATCH;
3843	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3844	int hist = lru_hist_from_seq(seq: lrugen->min_seq[type]);
3845	int new_gen, old_gen = lru_gen_from_seq(seq: lrugen->min_seq[type]);
3846
3847	/ For file type, skip the check if swappiness is anon only /
3848	if (type && (swappiness == SWAPPINESS_ANON_ONLY))
3849	goto done;
3850
3851	/ For anon type, skip the check if swappiness is zero (file only) /
3852	if (!type && !swappiness)
3853	goto done;
3854
3855	/ prevent cold/hot inversion if the type is evictable /
3856	for (zone = `0`; zone < MAX_NR_ZONES; zone++) {
3857	struct list_head *head = &lrugen->folios[old_gen][type][zone];
3858
3859	while (!list_empty(head)) {
3860	struct folio *folio = lru_to_folio(head);
3861	int refs = folio_lru_refs(folio);
3862	bool workingset = folio_test_workingset(folio);
3863
3864	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
3865	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
3866	VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
3867	VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
3868
3869	new_gen = folio_inc_gen(lruvec, folio, false);
3870	list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
3871
3872	/ don't count the workingset being lazily promoted /
3873	if (refs + workingset != BIT(LRU_REFS_WIDTH) + `1`) {
3874	int tier = lru_tier_from_refs(refs, workingset);
3875	int delta = folio_nr_pages(folio);
3876
3877	WRITE_ONCE(lrugen->protected[hist][type][tier],
3878	lrugen->protected[hist][type][tier] + delta);
3879	}
3880
3881	if (!--remaining)
3882	return false;
3883	}
3884	}
3885	done:
3886	reset_ctrl_pos(lruvec, type, carryover: true);
3887	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + `1`);
3888
3889	return true;
3890	}
3891
3892	static bool try_to_inc_min_seq(struct lruvec lruvec, int* swappiness)
3893	{
3894	int gen, type, zone;
3895	bool success = false;
3896	bool seq_inc_flag = false;
3897	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3898	DEFINE_MIN_SEQ(lruvec);
3899
3900	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3901
3902	/ find the oldest populated generation /
3903	for_each_evictable_type(type, swappiness) {
3904	while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
3905	gen = lru_gen_from_seq(seq: min_seq[type]);
3906
3907	for (zone = `0`; zone < MAX_NR_ZONES; zone++) {
3908	if (!list_empty(&lrugen->folios[gen][type][zone]))
3909	goto next;
3910	}
3911
3912	min_seq[type]++;
3913	seq_inc_flag = true;
3914	}
3915	next:
3916	;
3917	}
3918
3919	/*
3920	* If min_seq[type] of both anonymous and file is not increased,
3921	* we can directly return false to avoid unnecessary checking
3922	* overhead later.
3923	*/
3924	if (!seq_inc_flag)
3925	return success;
3926
3927	/ see the comment on lru_gen_folio /
3928	if (swappiness && swappiness <= MAX_SWAPPINESS) {
3929	unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
3930
3931	if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
3932	min_seq[LRU_GEN_ANON] = seq;
3933	else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
3934	min_seq[LRU_GEN_FILE] = seq;
3935	}
3936
3937	for_each_evictable_type(type, swappiness) {
3938	if (min_seq[type] <= lrugen->min_seq[type])
3939	continue;
3940
3941	reset_ctrl_pos(lruvec, type, carryover: true);
3942	WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
3943	success = true;
3944	}
3945
3946	return success;
3947	}
3948
3949	static bool inc_max_seq(struct lruvec lruvec, unsigned* long seq, int swappiness)
3950	{
3951	bool success;
3952	int prev, next;
3953	int type, zone;
3954	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3955	restart:
3956	if (seq < READ_ONCE(lrugen->max_seq))
3957	return false;
3958
3959	spin_lock_irq(lock: &lruvec->lru_lock);
3960
3961	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3962
3963	success = seq == lrugen->max_seq;
3964	if (!success)
3965	goto unlock;
3966
3967	for (type = `0`; type < ANON_AND_FILE; type++) {
3968	if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
3969	continue;
3970
3971	if (inc_min_seq(lruvec, type, swappiness))
3972	continue;
3973
3974	spin_unlock_irq(lock: &lruvec->lru_lock);
3975	cond_resched();
3976	goto restart;
3977	}
3978
3979	/*
3980	* Update the active/inactive LRU sizes for compatibility. Both sides of
3981	* the current max_seq need to be covered, since max_seq+1 can overlap
3982	* with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
3983	* overlap, cold/hot inversion happens.
3984	*/
3985	prev = lru_gen_from_seq(seq: lrugen->max_seq - `1`);
3986	next = lru_gen_from_seq(seq: lrugen->max_seq + `1`);
3987
3988	for (type = `0`; type < ANON_AND_FILE; type++) {
3989	for (zone = `0`; zone < MAX_NR_ZONES; zone++) {
3990	enum lru_list lru = type * LRU_INACTIVE_FILE;
3991	long delta = lrugen->nr_pages[prev][type][zone] -
3992	lrugen->nr_pages[next][type][zone];
3993
3994	if (!delta)
3995	continue;
3996
3997	__update_lru_size(lruvec, lru, zone, delta);
3998	__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
3999	}
4000	}
4001
4002	for (type = `0`; type < ANON_AND_FILE; type++)
4003	reset_ctrl_pos(lruvec, type, carryover: false);
4004
4005	WRITE_ONCE(lrugen->timestamps[next], jiffies);
4006	/ make sure preceding modifications appear /
4007	smp_store_release(&lrugen->max_seq, lrugen->max_seq + `1`);
4008	unlock:
4009	spin_unlock_irq(lock: &lruvec->lru_lock);
4010
4011	return success;
4012	}
4013
4014	static bool try_to_inc_max_seq(struct lruvec lruvec, unsigned* long seq,
4015	int swappiness, bool force_scan)
4016	{
4017	bool success;
4018	struct lru_gen_mm_walk *walk;
4019	struct mm_struct *mm = NULL;
4020	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4021	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
4022
4023	VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
4024
4025	if (!mm_state)
4026	return inc_max_seq(lruvec, seq, swappiness);
4027
4028	/ see the comment in iterate_mm_list() /
4029	if (seq <= READ_ONCE(mm_state->seq))
4030	return false;
4031
4032	/*
4033	* If the hardware doesn't automatically set the accessed bit, fallback
4034	* to lru_gen_look_around(), which only clears the accessed bit in a
4035	* handful of PTEs. Spreading the work out over a period of time usually
4036	* is less efficient, but it avoids bursty page faults.
4037	*/
4038	if (!should_walk_mmu()) {
4039	success = iterate_mm_list_nowalk(lruvec, seq);
4040	goto done;
4041	}
4042
4043	walk = set_mm_walk(NULL, force_alloc: true);
4044	if (!walk) {
4045	success = iterate_mm_list_nowalk(lruvec, seq);
4046	goto done;
4047	}
4048
4049	walk->lruvec = lruvec;
4050	walk->seq = seq;
4051	walk->swappiness = swappiness;
4052	walk->force_scan = force_scan;
4053
4054	do {
4055	success = iterate_mm_list(walk, iter: &mm);
4056	if (mm)
4057	walk_mm(mm, walk);
4058	} while (mm);
4059	done:
4060	if (success) {
4061	success = inc_max_seq(lruvec, seq, swappiness);
4062	WARN_ON_ONCE(!success);
4063	}
4064
4065	return success;
4066	}
4067
/******************************************************************************
 *                          working set protection
 ******************************************************************************/
4071
4072	static void set_initial_priority(struct pglist_data pgdat, struct* scan_control *sc)
4073	{
4074	int priority;
4075	unsigned long reclaimable;
4076
4077	if (sc->priority != DEF_PRIORITY \|\| sc->nr_to_reclaim < MIN_LRU_BATCH)
4078	return;
4079	/*
4080	* Determine the initial priority based on
4081	* (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
4082	* where reclaimed_to_scanned_ratio = inactive / total.
4083	*/
4084	reclaimable = node_page_state(pgdat, item: NR_INACTIVE_FILE);
4085	if (can_reclaim_anon_pages(NULL, nid: pgdat->node_id, sc))
4086	reclaimable += node_page_state(pgdat, item: NR_INACTIVE_ANON);
4087
4088	/ round down reclaimable and round up sc->nr_to_reclaim /
4089	priority = fls_long(l: reclaimable) - `1` - fls_long(l: sc->nr_to_reclaim - `1`);
4090
4091	/*
4092	* The estimation is based on LRU pages only, so cap it to prevent
4093	* overshoots of shrinker objects by large margins.
4094	*/
4095	sc->priority = clamp(priority, DEF_PRIORITY / `2`, DEF_PRIORITY);
4096	}
4097
4098	static bool lruvec_is_sizable(struct lruvec lruvec, struct* scan_control *sc)
4099	{
4100	int gen, type, zone;
4101	unsigned long total = `0`;
4102	int swappiness = get_swappiness(lruvec, sc);
4103	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4104	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4105	DEFINE_MAX_SEQ(lruvec);
4106	DEFINE_MIN_SEQ(lruvec);
4107
4108	for_each_evictable_type(type, swappiness) {
4109	unsigned long seq;
4110
4111	for (seq = min_seq[type]; seq <= max_seq; seq++) {
4112	gen = lru_gen_from_seq(seq);
4113
4114	for (zone = `0`; zone < MAX_NR_ZONES; zone++)
4115	total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), `0L`);
4116	}
4117	}
4118
4119	/ whether the size is big enough to be helpful /
4120	return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
4121	}
4122
4123	static bool lruvec_is_reclaimable(struct lruvec lruvec, struct* scan_control *sc,
4124	unsigned long min_ttl)
4125	{
4126	int gen;
4127	unsigned long birth;
4128	int swappiness = get_swappiness(lruvec, sc);
4129	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4130	DEFINE_MIN_SEQ(lruvec);
4131
4132	if (mem_cgroup_below_min(NULL, memcg))
4133	return false;
4134
4135	if (!lruvec_is_sizable(lruvec, sc))
4136	return false;
4137
4138	gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
4139	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
4140
4141	return time_is_before_jiffies(birth + min_ttl);
4142	}
4143
4144	/ to protect the working set of the last N jiffies /
4145	static unsigned long lru_gen_min_ttl __read_mostly;
4146
4147	static void lru_gen_age_node(struct pglist_data pgdat, struct* scan_control *sc)
4148	{
4149	struct mem_cgroup *memcg;
4150	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
4151	bool reclaimable = !min_ttl;
4152
4153	VM_WARN_ON_ONCE(!current_is_kswapd());
4154
4155	set_initial_priority(pgdat, sc);
4156
4157	memcg = mem_cgroup_iter(NULL, NULL, NULL);
4158	do {
4159	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4160
4161	mem_cgroup_calculate_protection(NULL, memcg);
4162
4163	if (!reclaimable)
4164	reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
4165	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
4166
4167	/*
4168	* The main goal is to OOM kill if every generation from all memcgs is
4169	* younger than min_ttl. However, another possibility is all memcgs are
4170	* either too small or below min.
4171	*/
4172	if (!reclaimable && mutex_trylock(&oom_lock)) {
4173	struct oom_control oc = {
4174	.gfp_mask = sc->gfp_mask,
4175	};
4176
4177	out_of_memory(oc: &oc);
4178
4179	mutex_unlock(lock: &oom_lock);
4180	}
4181	}
4182
/******************************************************************************
 *                          rmap/PT walk feedback
 ******************************************************************************/
4186
4187	/*
4188	* This function exploits spatial locality when shrink_folio_list() walks the
4189	* rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
4190	* the scan was done cacheline efficiently, it adds the PMD entry pointing to
4191	* the PTE table to the Bloom filter. This forms a feedback loop between the
4192	* eviction and the aging.
4193	*/
4194	bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
4195	{
4196	int i;
4197	bool dirty;
4198	unsigned long start;
4199	unsigned long end;
4200	struct lru_gen_mm_walk *walk;
4201	struct folio *last = NULL;
4202	int young = `1`;
4203	pte_t *pte = pvmw->pte;
4204	unsigned long addr = pvmw->address;
4205	struct vm_area_struct *vma = pvmw->vma;
4206	struct folio *folio = pfn_folio(pfn: pvmw->pfn);
4207	struct mem_cgroup *memcg = folio_memcg(folio);
4208	struct pglist_data *pgdat = folio_pgdat(folio);
4209	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4210	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
4211	DEFINE_MAX_SEQ(lruvec);
4212	int gen = lru_gen_from_seq(seq: max_seq);
4213
4214	lockdep_assert_held(pvmw->ptl);
4215	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
4216
4217	if (!ptep_clear_young_notify(vma, addr, pte))
4218	return false;
4219
4220	if (spin_is_contended(lock: pvmw->ptl))
4221	return true;
4222
4223	/ exclude special VMAs containing anon pages from COW /
4224	if (vma->vm_flags & VM_SPECIAL)
4225	return true;
4226
4227	/ avoid taking the LRU lock under the PTL when possible /
4228	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
4229
4230	start = max(addr & PMD_MASK, vma->vm_start);
4231	end = min(addr \| ~PMD_MASK, vma->vm_end - `1`) + `1`;
4232
4233	if (end - start == PAGE_SIZE)
4234	return true;
4235
4236	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
4237	if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / `2`)
4238	end = start + MIN_LRU_BATCH * PAGE_SIZE;
4239	else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / `2`)
4240	start = end - MIN_LRU_BATCH * PAGE_SIZE;
4241	else {
4242	start = addr - MIN_LRU_BATCH * PAGE_SIZE / `2`;
4243	end = addr + MIN_LRU_BATCH * PAGE_SIZE / `2`;
4244	}
4245	}
4246
4247	arch_enter_lazy_mmu_mode();
4248
4249	pte -= (addr - start) / PAGE_SIZE;
4250
4251	for (i = `0`, addr = start; addr != end; i++, addr += PAGE_SIZE) {
4252	unsigned long pfn;
4253	pte_t ptent = ptep_get(ptep: pte + i);
4254
4255	pfn = get_pte_pfn(pte: ptent, vma, addr, pgdat);
4256	if (pfn == -`1`)
4257	continue;
4258
4259	folio = get_pfn_folio(pfn, memcg, pgdat);
4260	if (!folio)
4261	continue;
4262
4263	if (!ptep_clear_young_notify(vma, addr, pte + i))
4264	continue;
4265
4266	if (last != folio) {
4267	walk_update_folio(walk, folio: last, new_gen: gen, dirty);
4268
4269	last = folio;
4270	dirty = false;
4271	}
4272
4273	if (pte_dirty(pte: ptent))
4274	dirty = true;
4275
4276	young++;
4277	}
4278
4279	walk_update_folio(walk, folio: last, new_gen: gen, dirty);
4280
4281	arch_leave_lazy_mmu_mode();
4282
4283	/ feedback from rmap walkers to page table walkers /
4284	if (mm_state && suitable_to_scan(total: i, young))
4285	update_bloom_filter(mm_state, seq: max_seq, item: pvmw->pmd);
4286
4287	return true;
4288	}
4289
/******************************************************************************
 *                          memcg LRU
 ******************************************************************************/
4293
/*
 * Operations accepted by lru_gen_rotate_memcg(); MEMCG_LRU_NOP is zero.
 * See the comment on MEMCG_NR_GENS.
 */
enum {
	MEMCG_LRU_NOP,
	MEMCG_LRU_HEAD,
	MEMCG_LRU_TAIL,
	MEMCG_LRU_OLD,
	MEMCG_LRU_YOUNG,
};
4302
4303	static void lru_gen_rotate_memcg(struct lruvec lruvec, int* op)
4304	{
4305	int seg;
4306	int old, new;
4307	unsigned long flags;
4308	int bin = get_random_u32_below(MEMCG_NR_BINS);
4309	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4310
4311	spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
4312
4313	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
4314
4315	seg = `0`;
4316	new = old = lruvec->lrugen.gen;
4317
4318	/ see the comment on MEMCG_NR_GENS /
4319	if (op == MEMCG_LRU_HEAD)
4320	seg = MEMCG_LRU_HEAD;
4321	else if (op == MEMCG_LRU_TAIL)
4322	seg = MEMCG_LRU_TAIL;
4323	else if (op == MEMCG_LRU_OLD)
4324	new = get_memcg_gen(pgdat->memcg_lru.seq);
4325	else if (op == MEMCG_LRU_YOUNG)
4326	new = get_memcg_gen(pgdat->memcg_lru.seq + `1`);
4327	else
4328	VM_WARN_ON_ONCE(true);
4329
4330	WRITE_ONCE(lruvec->lrugen.seg, seg);
4331	WRITE_ONCE(lruvec->lrugen.gen, new);
4332
4333	hlist_nulls_del_rcu(n: &lruvec->lrugen.list);
4334
4335	if (op == MEMCG_LRU_HEAD \|\| op == MEMCG_LRU_OLD)
4336	hlist_nulls_add_head_rcu(n: &lruvec->lrugen.list, h: &pgdat->memcg_lru.fifo[new][bin]);
4337	else
4338	hlist_nulls_add_tail_rcu(n: &lruvec->lrugen.list, h: &pgdat->memcg_lru.fifo[new][bin]);
4339
4340	pgdat->memcg_lru.nr_memcgs[old]--;
4341	pgdat->memcg_lru.nr_memcgs[new]++;
4342
4343	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
4344	WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + `1`);
4345
4346	spin_unlock_irqrestore(lock: &pgdat->memcg_lru.lock, flags);
4347	}
4348
4349	#ifdef CONFIG_MEMCG
4350
4351	void lru_gen_online_memcg(struct mem_cgroup *memcg)
4352	{
4353	int gen;
4354	int nid;
4355	int bin = get_random_u32_below(MEMCG_NR_BINS);
4356
4357	for_each_node(nid) {
4358	struct pglist_data *pgdat = NODE_DATA(nid);
4359	struct lruvec *lruvec = get_lruvec(memcg, nid);
4360
4361	spin_lock_irq(lock: &pgdat->memcg_lru.lock);
4362
4363	VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
4364
4365	gen = get_memcg_gen(pgdat->memcg_lru.seq);
4366
4367	lruvec->lrugen.gen = gen;
4368
4369	hlist_nulls_add_tail_rcu(n: &lruvec->lrugen.list, h: &pgdat->memcg_lru.fifo[gen][bin]);
4370	pgdat->memcg_lru.nr_memcgs[gen]++;
4371
4372	spin_unlock_irq(lock: &pgdat->memcg_lru.lock);
4373	}
4374	}
4375
4376	void lru_gen_offline_memcg(struct mem_cgroup *memcg)
4377	{
4378	int nid;
4379
4380	for_each_node(nid) {
4381	struct lruvec *lruvec = get_lruvec(memcg, nid);
4382
4383	lru_gen_rotate_memcg(lruvec, op: MEMCG_LRU_OLD);
4384	}
4385	}
4386
4387	void lru_gen_release_memcg(struct mem_cgroup *memcg)
4388	{
4389	int gen;
4390	int nid;
4391
4392	for_each_node(nid) {
4393	struct pglist_data *pgdat = NODE_DATA(nid);
4394	struct lruvec *lruvec = get_lruvec(memcg, nid);
4395
4396	spin_lock_irq(lock: &pgdat->memcg_lru.lock);
4397
4398	if (hlist_nulls_unhashed(h: &lruvec->lrugen.list))
4399	goto unlock;
4400
4401	gen = lruvec->lrugen.gen;
4402
4403	hlist_nulls_del_init_rcu(n: &lruvec->lrugen.list);
4404	pgdat->memcg_lru.nr_memcgs[gen]--;
4405
4406	if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
4407	WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + `1`);
4408	unlock:
4409	spin_unlock_irq(lock: &pgdat->memcg_lru.lock);
4410	}
4411	}
4412
4413	void lru_gen_soft_reclaim(struct mem_cgroup memcg, int* nid)
4414	{
4415	struct lruvec *lruvec = get_lruvec(memcg, nid);
4416
4417	/ see the comment on MEMCG_NR_GENS /
4418	if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD)
4419	lru_gen_rotate_memcg(lruvec, op: MEMCG_LRU_HEAD);
4420	}
4421
4422	#endif /* CONFIG_MEMCG */
4423
/******************************************************************************
 *                          the eviction
 ******************************************************************************/
4427
4428	static bool sort_folio(struct lruvec lruvec, struct* folio folio, struct* scan_control *sc,
4429	int tier_idx)
4430	{
4431	bool success;
4432	bool dirty, writeback;
4433	int gen = folio_lru_gen(folio);
4434	int type = folio_is_file_lru(folio);
4435	int zone = folio_zonenum(folio);
4436	int delta = folio_nr_pages(folio);
4437	int refs = folio_lru_refs(folio);
4438	bool workingset = folio_test_workingset(folio);
4439	int tier = lru_tier_from_refs(refs, workingset);
4440	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4441
4442	VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
4443
4444	/ unevictable /
4445	if (!folio_evictable(folio)) {
4446	success = lru_gen_del_folio(lruvec, folio, reclaiming: true);
4447	VM_WARN_ON_ONCE_FOLIO(!success, folio);
4448	folio_set_unevictable(folio);
4449	lruvec_add_folio(lruvec, folio);
4450	__count_vm_events(item: UNEVICTABLE_PGCULLED, delta);
4451	return true;
4452	}
4453
4454	/ promoted /
4455	if (gen != lru_gen_from_seq(seq: lrugen->min_seq[type])) {
4456	list_move(list: &folio->lru, head: &lrugen->folios[gen][type][zone]);
4457	return true;
4458	}
4459
4460	/ protected /
4461	if (tier > tier_idx \|\| refs + workingset == BIT(LRU_REFS_WIDTH) + `1`) {
4462	gen = folio_inc_gen(lruvec, folio, reclaiming: false);
4463	list_move(list: &folio->lru, head: &lrugen->folios[gen][type][zone]);
4464
4465	/ don't count the workingset being lazily promoted /
4466	if (refs + workingset != BIT(LRU_REFS_WIDTH) + `1`) {
4467	int hist = lru_hist_from_seq(seq: lrugen->min_seq[type]);
4468
4469	WRITE_ONCE(lrugen->protected[hist][type][tier],
4470	lrugen->protected[hist][type][tier] + delta);
4471	}
4472	return true;
4473	}
4474
4475	/ ineligible /
4476	if (zone > sc->reclaim_idx) {
4477	gen = folio_inc_gen(lruvec, folio, reclaiming: false);
4478	list_move_tail(list: &folio->lru, head: &lrugen->folios[gen][type][zone]);
4479	return true;
4480	}
4481
4482	dirty = folio_test_dirty(folio);
4483	writeback = folio_test_writeback(folio);
4484	if (type == LRU_GEN_FILE && dirty) {
4485	sc->nr.file_taken += delta;
4486	if (!writeback)
4487	sc->nr.unqueued_dirty += delta;
4488	}
4489
4490	/ waiting for writeback /
4491	if (writeback \|\| (type == LRU_GEN_FILE && dirty)) {
4492	gen = folio_inc_gen(lruvec, folio, reclaiming: true);
4493	list_move(list: &folio->lru, head: &lrugen->folios[gen][type][zone]);
4494	return true;
4495	}
4496
4497	return false;
4498	}
4499
4500	static bool isolate_folio(struct lruvec lruvec, struct* folio folio, struct* scan_control *sc)
4501	{
4502	bool success;
4503
4504	/ swap constrained /
4505	if (!(sc->gfp_mask & __GFP_IO) &&
4506	(folio_test_dirty(folio) \|\|
4507	(folio_test_anon(folio) && !folio_test_swapcache(folio))))
4508	return false;
4509
4510	/ raced with release_pages() /
4511	if (!folio_try_get(folio))
4512	return false;
4513
4514	/ raced with another isolation /
4515	if (!folio_test_clear_lru(folio)) {
4516	folio_put(folio);
4517	return false;
4518	}
4519
4520	/ see the comment on LRU_REFS_FLAGS /
4521	if (!folio_test_referenced(folio))
4522	set_mask_bits(&folio->flags.f, LRU_REFS_MASK, `0`);
4523
4524	/ for shrink_folio_list() /
4525	folio_clear_reclaim(folio);
4526
4527	success = lru_gen_del_folio(lruvec, folio, reclaiming: true);
4528	VM_WARN_ON_ONCE_FOLIO(!success, folio);
4529
4530	return true;
4531	}
4532
4533	static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
4534	struct scan_control sc, int* type, int tier,
4535	struct list_head *list)
4536	{
4537	int i;
4538	int gen;
4539	enum vm_event_item item;
4540	int sorted = `0`;
4541	int scanned = `0`;
4542	int isolated = `0`;
4543	int skipped = `0`;
4544	int scan_batch = min(nr_to_scan, MAX_LRU_BATCH);
4545	int remaining = scan_batch;
4546	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4547	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4548
4549	VM_WARN_ON_ONCE(!list_empty(list));
4550
4551	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
4552	return `0`;
4553
4554	gen = lru_gen_from_seq(seq: lrugen->min_seq[type]);
4555
4556	for (i = MAX_NR_ZONES; i > `0`; i--) {
4557	LIST_HEAD(moved);
4558	int skipped_zone = `0`;
4559	int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
4560	struct list_head *head = &lrugen->folios[gen][type][zone];
4561
4562	while (!list_empty(head)) {
4563	struct folio *folio = lru_to_folio(head);
4564	int delta = folio_nr_pages(folio);
4565
4566	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
4567	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
4568	VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
4569	VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
4570
4571	scanned += delta;
4572
4573	if (sort_folio(lruvec, folio, sc, tier_idx: tier))
4574	sorted += delta;
4575	else if (isolate_folio(lruvec, folio, sc)) {
4576	list_add(new: &folio->lru, head: list);
4577	isolated += delta;
4578	} else {
4579	list_move(list: &folio->lru, head: &moved);
4580	skipped_zone += delta;
4581	}
4582
4583	if (!--remaining \|\| max(isolated, skipped_zone) >= MIN_LRU_BATCH)
4584	break;
4585	}
4586
4587	if (skipped_zone) {
4588	list_splice(list: &moved, head);
4589	__count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone);
4590	skipped += skipped_zone;
4591	}
4592
4593	if (!remaining \|\| isolated >= MIN_LRU_BATCH)
4594	break;
4595	}
4596
4597	item = PGSCAN_KSWAPD + reclaimer_offset(sc);
4598	if (!cgroup_reclaim(sc)) {
4599	__count_vm_events(item, delta: isolated);
4600	__count_vm_events(item: PGREFILL, delta: sorted);
4601	}
4602	count_memcg_events(memcg, idx: item, count: isolated);
4603	count_memcg_events(memcg, idx: PGREFILL, count: sorted);
4604	__count_vm_events(item: PGSCAN_ANON + type, delta: isolated);
4605	trace_mm_vmscan_lru_isolate(highest_zoneidx: sc->reclaim_idx, order: sc->order, nr_requested: scan_batch,
4606	nr_scanned: scanned, nr_skipped: skipped, nr_taken: isolated,
4607	lru: type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
4608	if (type == LRU_GEN_FILE)
4609	sc->nr.file_taken += isolated;
4610	/*
4611	* There might not be eligible folios due to reclaim_idx. Check the
4612	* remaining to prevent livelock if it's not making progress.
4613	*/
4614	return isolated \|\| !remaining ? scanned : `0`;
4615	}
4616
4617	static int get_tier_idx(struct lruvec lruvec, int* type)
4618	{
4619	int tier;
4620	struct ctrl_pos sp, pv;
4621
4622	/*
4623	* To leave a margin for fluctuations, use a larger gain factor (2:3).
4624	* This value is chosen because any other tier would have at least twice
4625	* as many refaults as the first tier.
4626	*/
4627	read_ctrl_pos(lruvec, type, tier: `0`, gain: `2`, pos: &sp);
4628	for (tier = `1`; tier < MAX_NR_TIERS; tier++) {
4629	read_ctrl_pos(lruvec, type, tier, gain: `3`, pos: &pv);
4630	if (!positive_ctrl_err(sp: &sp, pv: &pv))
4631	break;
4632	}
4633
4634	return tier - `1`;
4635	}
4636
4637	static int get_type_to_scan(struct lruvec lruvec, int* swappiness)
4638	{
4639	struct ctrl_pos sp, pv;
4640
4641	if (swappiness <= MIN_SWAPPINESS + `1`)
4642	return LRU_GEN_FILE;
4643
4644	if (swappiness >= MAX_SWAPPINESS)
4645	return LRU_GEN_ANON;
4646	/*
4647	* Compare the sum of all tiers of anon with that of file to determine
4648	* which type to scan.
4649	*/
4650	read_ctrl_pos(lruvec, type: LRU_GEN_ANON, MAX_NR_TIERS, gain: swappiness, pos: &sp);
4651	read_ctrl_pos(lruvec, type: LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, pos: &pv);
4652
4653	return positive_ctrl_err(sp: &sp, pv: &pv);
4654	}
4655
4656	static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
4657	struct scan_control sc, int* swappiness,
4658	int type_scanned, struct* list_head *list)
4659	{
4660	int i;
4661	int type = get_type_to_scan(lruvec, swappiness);
4662
4663	for_each_evictable_type(i, swappiness) {
4664	int scanned;
4665	int tier = get_tier_idx(lruvec, type);
4666
4667	*type_scanned = type;
4668
4669	scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
4670	if (scanned)
4671	return scanned;
4672
4673	type = !type;
4674	}
4675
4676	return `0`;
4677	}
4678
4679	static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
4680	struct scan_control sc, int* swappiness)
4681	{
4682	int type;
4683	int scanned;
4684	int reclaimed;
4685	LIST_HEAD(list);
4686	LIST_HEAD(clean);
4687	struct folio *folio;
4688	struct folio *next;
4689	enum vm_event_item item;
4690	struct reclaim_stat stat;
4691	struct lru_gen_mm_walk *walk;
4692	bool skip_retry = false;
4693	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4694	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4695	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4696
4697	spin_lock_irq(lock: &lruvec->lru_lock);
4698
4699	scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, type_scanned: &type, list: &list);
4700
4701	scanned += try_to_inc_min_seq(lruvec, swappiness);
4702
4703	if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
4704	scanned = `0`;
4705
4706	spin_unlock_irq(lock: &lruvec->lru_lock);
4707
4708	if (list_empty(head: &list))
4709	return scanned;
4710	retry:
4711	reclaimed = shrink_folio_list(folio_list: &list, pgdat, sc, stat: &stat, ignore_references: false, memcg);
4712	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
4713	sc->nr_reclaimed += reclaimed;
4714	trace_mm_vmscan_lru_shrink_inactive(nid: pgdat->node_id,
4715	nr_scanned: scanned, nr_reclaimed: reclaimed, stat: &stat, priority: sc->priority,
4716	file: type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
4717
4718	list_for_each_entry_safe_reverse(folio, next, &list, lru) {
4719	DEFINE_MIN_SEQ(lruvec);
4720
4721	if (!folio_evictable(folio)) {
4722	list_del(entry: &folio->lru);
4723	folio_putback_lru(folio);
4724	continue;
4725	}
4726
4727	/ retry folios that may have missed folio_rotate_reclaimable() /
4728	if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
4729	!folio_test_dirty(folio) && !folio_test_writeback(folio)) {
4730	list_move(list: &folio->lru, head: &clean);
4731	continue;
4732	}
4733
4734	/ don't add rejected folios to the oldest generation /
4735	if (lru_gen_folio_seq(lruvec, folio, reclaiming: false) == min_seq[type])
4736	set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
4737	}
4738
4739	spin_lock_irq(lock: &lruvec->lru_lock);
4740
4741	move_folios_to_lru(lruvec, list: &list);
4742
4743	walk = current->reclaim_state->mm_walk;
4744	if (walk && walk->batched) {
4745	walk->lruvec = lruvec;
4746	reset_batch_size(walk);
4747	}
4748
4749	mod_lruvec_state(lruvec, idx: PGDEMOTE_KSWAPD + reclaimer_offset(sc),
4750	val: stat.nr_demoted);
4751
4752	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
4753	if (!cgroup_reclaim(sc))
4754	__count_vm_events(item, delta: reclaimed);
4755	count_memcg_events(memcg, idx: item, count: reclaimed);
4756	__count_vm_events(item: PGSTEAL_ANON + type, delta: reclaimed);
4757
4758	spin_unlock_irq(lock: &lruvec->lru_lock);
4759
4760	list_splice_init(list: &clean, head: &list);
4761
4762	if (!list_empty(head: &list)) {
4763	skip_retry = true;
4764	goto retry;
4765	}
4766
4767	return scanned;
4768	}
4769
4770	static bool should_run_aging(struct lruvec lruvec, unsigned* long max_seq,
4771	int swappiness, unsigned long *nr_to_scan)
4772	{
4773	int gen, type, zone;
4774	unsigned long size = `0`;
4775	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4776	DEFINE_MIN_SEQ(lruvec);
4777
4778	*nr_to_scan = `0`;
4779	/ have to run aging, since eviction is not possible anymore /
4780	if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
4781	return true;
4782
4783	for_each_evictable_type(type, swappiness) {
4784	unsigned long seq;
4785
4786	for (seq = min_seq[type]; seq <= max_seq; seq++) {
4787	gen = lru_gen_from_seq(seq);
4788
4789	for (zone = `0`; zone < MAX_NR_ZONES; zone++)
4790	size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), `0L`);
4791	}
4792	}
4793
4794	*nr_to_scan = size;
4795	/ better to run aging even though eviction is still possible /
4796	return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
4797	}
4798
4799	/*
4800	* For future optimizations:
4801	* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
4802	* reclaim.
4803	*/
4804	static long get_nr_to_scan(struct lruvec lruvec, struct* scan_control sc, int* swappiness)
4805	{
4806	bool success;
4807	unsigned long nr_to_scan;
4808	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4809	DEFINE_MAX_SEQ(lruvec);
4810
4811	if (mem_cgroup_below_min(target: sc->target_mem_cgroup, memcg))
4812	return -`1`;
4813
4814	success = should_run_aging(lruvec, max_seq, swappiness, nr_to_scan: &nr_to_scan);
4815
4816	/ try to scrape all its memory if this memcg was deleted /
4817	if (nr_to_scan && !mem_cgroup_online(memcg))
4818	return nr_to_scan;
4819
4820	nr_to_scan = apply_proportional_protection(memcg, sc, scan: nr_to_scan);
4821
4822	/ try to get away with not aging at the default priority /
4823	if (!success \|\| sc->priority == DEF_PRIORITY)
4824	return nr_to_scan >> sc->priority;
4825
4826	/ stop scanning this lruvec as it's low on cold folios /
4827	return try_to_inc_max_seq(lruvec, seq: max_seq, swappiness, force_scan: false) ? -`1` : `0`;
4828	}
4829
4830	static bool should_abort_scan(struct lruvec lruvec, struct* scan_control *sc)
4831	{
4832	int i;
4833	enum zone_watermarks mark;
4834
4835	/ don't abort memcg reclaim to ensure fairness /
4836	if (!root_reclaim(sc))
4837	return false;
4838
4839	if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
4840	return true;
4841
4842	/ check the order to exclude compaction-induced reclaim /
4843	if (!current_is_kswapd() \|\| sc->order)
4844	return false;
4845
4846	mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
4847	WMARK_PROMO : WMARK_HIGH;
4848
4849	for (i = `0`; i <= sc->reclaim_idx; i++) {
4850	struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
4851	unsigned long size = wmark_pages(z: zone, w: mark) + MIN_LRU_BATCH;
4852
4853	if (managed_zone(zone) && !zone_watermark_ok(z: zone, order: `0`, mark: size, highest_zoneidx: sc->reclaim_idx, alloc_flags: `0`))
4854	return false;
4855	}
4856
4857	/ kswapd should abort if all eligible zones are safe /
4858	return true;
4859	}
4860
4861	static bool try_to_shrink_lruvec(struct lruvec lruvec, struct* scan_control *sc)
4862	{
4863	long nr_to_scan;
4864	unsigned long scanned = `0`;
4865	int swappiness = get_swappiness(lruvec, sc);
4866
4867	while (true) {
4868	int delta;
4869
4870	nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
4871	if (nr_to_scan <= `0`)
4872	break;
4873
4874	delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
4875	if (!delta)
4876	break;
4877
4878	scanned += delta;
4879	if (scanned >= nr_to_scan)
4880	break;
4881
4882	if (should_abort_scan(lruvec, sc))
4883	break;
4884
4885	cond_resched();
4886	}
4887
4888	/*
4889	* If too many file cache in the coldest generation can't be evicted
4890	* due to being dirty, wake up the flusher.
4891	*/
4892	if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
4893	wakeup_flusher_threads(reason: WB_REASON_VMSCAN);
4894
4895	/ whether this lruvec should be rotated /
4896	return nr_to_scan < `0`;
4897	}
4898
4899	static int shrink_one(struct lruvec lruvec, struct* scan_control *sc)
4900	{
4901	bool success;
4902	unsigned long scanned = sc->nr_scanned;
4903	unsigned long reclaimed = sc->nr_reclaimed;
4904	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4905	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4906
4907	/ lru_gen_age_node() called mem_cgroup_calculate_protection() /
4908	if (mem_cgroup_below_min(NULL, memcg))
4909	return MEMCG_LRU_YOUNG;
4910
4911	if (mem_cgroup_below_low(NULL, memcg)) {
4912	/ see the comment on MEMCG_NR_GENS /
4913	if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
4914	return MEMCG_LRU_TAIL;
4915
4916	memcg_memory_event(memcg, event: MEMCG_LOW);
4917	}
4918
4919	success = try_to_shrink_lruvec(lruvec, sc);
4920
4921	shrink_slab(gfp_mask: sc->gfp_mask, nid: pgdat->node_id, memcg, priority: sc->priority);
4922
4923	if (!sc->proactive)
4924	vmpressure(gfp: sc->gfp_mask, memcg, tree: false, scanned: sc->nr_scanned - scanned,
4925	reclaimed: sc->nr_reclaimed - reclaimed);
4926
4927	flush_reclaim_state(sc);
4928
4929	if (success && mem_cgroup_online(memcg))
4930	return MEMCG_LRU_YOUNG;
4931
4932	if (!success && lruvec_is_sizable(lruvec, sc))
4933	return `0`;
4934
4935	/ one retry if offlined or too small /
4936	return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
4937	MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
4938	}
4939
4940	static void shrink_many(struct pglist_data pgdat, struct* scan_control *sc)
4941	{
4942	int op;
4943	int gen;
4944	int bin;
4945	int first_bin;
4946	struct lruvec *lruvec;
4947	struct lru_gen_folio *lrugen;
4948	struct mem_cgroup *memcg;
4949	struct hlist_nulls_node *pos;
4950
4951	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
4952	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
4953	restart:
4954	op = `0`;
4955	memcg = NULL;
4956
4957	rcu_read_lock();
4958
4959	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
4960	if (op) {
4961	lru_gen_rotate_memcg(lruvec, op);
4962	op = `0`;
4963	}
4964
4965	mem_cgroup_put(memcg);
4966	memcg = NULL;
4967
4968	if (gen != READ_ONCE(lrugen->gen))
4969	continue;
4970
4971	lruvec = container_of(lrugen, struct lruvec, lrugen);
4972	memcg = lruvec_memcg(lruvec);
4973
4974	if (!mem_cgroup_tryget(memcg)) {
4975	lru_gen_release_memcg(memcg);
4976	memcg = NULL;
4977	continue;
4978	}
4979
4980	rcu_read_unlock();
4981
4982	op = shrink_one(lruvec, sc);
4983
4984	rcu_read_lock();
4985
4986	if (should_abort_scan(lruvec, sc))
4987	break;
4988	}
4989
4990	rcu_read_unlock();
4991
4992	if (op)
4993	lru_gen_rotate_memcg(lruvec, op);
4994
4995	mem_cgroup_put(memcg);
4996
4997	if (!is_a_nulls(ptr: pos))
4998	return;
4999
5000	/ restart if raced with lru_gen_rotate_memcg() /
5001	if (gen != get_nulls_value(ptr: pos))
5002	goto restart;
5003
5004	/ try the rest of the bins of the current generation /
5005	bin = get_memcg_bin(bin + `1`);
5006	if (bin != first_bin)
5007	goto restart;
5008	}
5009
5010	static void lru_gen_shrink_lruvec(struct lruvec lruvec, struct* scan_control *sc)
5011	{
5012	struct blk_plug plug;
5013
5014	VM_WARN_ON_ONCE(root_reclaim(sc));
5015	VM_WARN_ON_ONCE(!sc->may_writepage \|\| !sc->may_unmap);
5016
5017	lru_add_drain();
5018
5019	blk_start_plug(&plug);
5020
5021	set_mm_walk(NULL, force_alloc: sc->proactive);
5022
5023	if (try_to_shrink_lruvec(lruvec, sc))
5024	lru_gen_rotate_memcg(lruvec, op: MEMCG_LRU_YOUNG);
5025
5026	clear_mm_walk();
5027
5028	blk_finish_plug(&plug);
5029	}
5030
5031	static void lru_gen_shrink_node(struct pglist_data pgdat, struct* scan_control *sc)
5032	{
5033	struct blk_plug plug;
5034	unsigned long reclaimed = sc->nr_reclaimed;
5035
5036	VM_WARN_ON_ONCE(!root_reclaim(sc));
5037
5038	/*
5039	* Unmapped clean folios are already prioritized. Scanning for more of
5040	* them is likely futile and can cause high reclaim latency when there
5041	* is a large number of memcgs.
5042	*/
5043	if (!sc->may_writepage \|\| !sc->may_unmap)
5044	goto done;
5045
5046	lru_add_drain();
5047
5048	blk_start_plug(&plug);
5049
5050	set_mm_walk(pgdat, force_alloc: sc->proactive);
5051
5052	set_initial_priority(pgdat, sc);
5053
5054	if (current_is_kswapd())
5055	sc->nr_reclaimed = `0`;
5056
5057	if (mem_cgroup_disabled())
5058	shrink_one(lruvec: &pgdat->__lruvec, sc);
5059	else
5060	shrink_many(pgdat, sc);
5061
5062	if (current_is_kswapd())
5063	sc->nr_reclaimed += reclaimed;
5064
5065	clear_mm_walk();
5066
5067	blk_finish_plug(&plug);
5068	done:
5069	if (sc->nr_reclaimed > reclaimed)
5070	atomic_set(v: &pgdat->kswapd_failures, i: `0`);
5071	}
5072
5073	/******************************************************************************
5074	* state change
5075	******************************************************************************/
5076
5077	static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
5078	{
5079	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5080
5081	if (lrugen->enabled) {
5082	enum lru_list lru;
5083
5084	for_each_evictable_lru(lru) {
5085	if (!list_empty(head: &lruvec->lists[lru]))
5086	return false;
5087	}
5088	} else {
5089	int gen, type, zone;
5090
5091	for_each_gen_type_zone(gen, type, zone) {
5092	if (!list_empty(&lrugen->folios[gen][type][zone]))
5093	return false;
5094	}
5095	}
5096
5097	return true;
5098	}
5099
5100	static bool fill_evictable(struct lruvec *lruvec)
5101	{
5102	enum lru_list lru;
5103	int remaining = MAX_LRU_BATCH;
5104
5105	for_each_evictable_lru(lru) {
5106	int type = is_file_lru(lru);
5107	bool active = is_active_lru(lru);
5108	struct list_head *head = &lruvec->lists[lru];
5109
5110	while (!list_empty(head)) {
5111	bool success;
5112	struct folio *folio = lru_to_folio(head);
5113
5114	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5115	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
5116	VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5117	VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -`1`, folio);
5118
5119	lruvec_del_folio(lruvec, folio);
5120	success = lru_gen_add_folio(lruvec, folio, reclaiming: false);
5121	VM_WARN_ON_ONCE(!success);
5122
5123	if (!--remaining)
5124	return false;
5125	}
5126	}
5127
5128	return true;
5129	}
5130
5131	static bool drain_evictable(struct lruvec *lruvec)
5132	{
5133	int gen, type, zone;
5134	int remaining = MAX_LRU_BATCH;
5135
5136	for_each_gen_type_zone(gen, type, zone) {
5137	struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
5138
5139	while (!list_empty(head)) {
5140	bool success;
5141	struct folio *folio = lru_to_folio(head);
5142
5143	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5144	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
5145	VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5146	VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
5147
5148	success = lru_gen_del_folio(lruvec, folio, false);
5149	VM_WARN_ON_ONCE(!success);
5150	lruvec_add_folio(lruvec, folio);
5151
5152	if (!--remaining)
5153	return false;
5154	}
5155	}
5156
5157	return true;
5158	}
5159
5160	static void lru_gen_change_state(bool enabled)
5161	{
5162	static DEFINE_MUTEX(state_mutex);
5163
5164	struct mem_cgroup *memcg;
5165
5166	cgroup_lock();
5167	cpus_read_lock();
5168	get_online_mems();
5169	mutex_lock(&state_mutex);
5170
5171	if (enabled == lru_gen_enabled())
5172	goto unlock;
5173
5174	if (enabled)
5175	static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5176	else
5177	static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5178
5179	memcg = mem_cgroup_iter(NULL, NULL, NULL);
5180	do {
5181	int nid;
5182
5183	for_each_node(nid) {
5184	struct lruvec *lruvec = get_lruvec(memcg, nid);
5185
5186	spin_lock_irq(lock: &lruvec->lru_lock);
5187
5188	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
5189	VM_WARN_ON_ONCE(!state_is_valid(lruvec));
5190
5191	lruvec->lrugen.enabled = enabled;
5192
5193	while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
5194	spin_unlock_irq(lock: &lruvec->lru_lock);
5195	cond_resched();
5196	spin_lock_irq(lock: &lruvec->lru_lock);
5197	}
5198
5199	spin_unlock_irq(lock: &lruvec->lru_lock);
5200	}
5201
5202	cond_resched();
5203	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5204	unlock:
5205	mutex_unlock(lock: &state_mutex);
5206	put_online_mems();
5207	cpus_read_unlock();
5208	cgroup_unlock();
5209	}
5210
5211	/******************************************************************************
5212	* sysfs interface
5213	******************************************************************************/
5214
5215	static ssize_t min_ttl_ms_show(struct kobject kobj, struct* kobj_attribute attr, char* *buf)
5216	{
5217	return sysfs_emit(buf, fmt: "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
5218	}
5219
5220	/ see Documentation/admin-guide/mm/multigen_lru.rst for details /
5221	static ssize_t min_ttl_ms_store(struct kobject kobj, struct* kobj_attribute *attr,
5222	const char *buf, size_t len)
5223	{
5224	unsigned int msecs;
5225
5226	if (kstrtouint(s: buf, base: `0`, res: &msecs))
5227	return -EINVAL;
5228
5229	WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
5230
5231	return len;
5232	}
5233
5234	static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms);
5235
5236	static ssize_t enabled_show(struct kobject kobj, struct* kobj_attribute attr, char* *buf)
5237	{
5238	unsigned int caps = `0`;
5239
5240	if (get_cap(LRU_GEN_CORE))
5241	caps \|= BIT(LRU_GEN_CORE);
5242
5243	if (should_walk_mmu())
5244	caps \|= BIT(LRU_GEN_MM_WALK);
5245
5246	if (should_clear_pmd_young())
5247	caps \|= BIT(LRU_GEN_NONLEAF_YOUNG);
5248
5249	return sysfs_emit(buf, fmt: "0x%04x\n", caps);
5250	}
5251
5252	/ see Documentation/admin-guide/mm/multigen_lru.rst for details /
5253	static ssize_t enabled_store(struct kobject kobj, struct* kobj_attribute *attr,
5254	const char *buf, size_t len)
5255	{
5256	int i;
5257	unsigned int caps;
5258
5259	if (tolower(*buf) == `'n'`)
5260	caps = `0`;
5261	else if (tolower(*buf) == `'y'`)
5262	caps = -`1`;
5263	else if (kstrtouint(s: buf, base: `0`, res: &caps))
5264	return -EINVAL;
5265
5266	for (i = `0`; i < NR_LRU_GEN_CAPS; i++) {
5267	bool enabled = caps & BIT(i);
5268
5269	if (i == LRU_GEN_CORE)
5270	lru_gen_change_state(enabled);
5271	else if (enabled)
5272	static_branch_enable(&lru_gen_caps[i]);
5273	else
5274	static_branch_disable(&lru_gen_caps[i]);
5275	}
5276
5277	return len;
5278	}
5279
5280	static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled);
5281
5282	static struct attribute *lru_gen_attrs[] = {
5283	&lru_gen_min_ttl_attr.attr,
5284	&lru_gen_enabled_attr.attr,
5285	NULL
5286	};
5287
5288	static const struct attribute_group lru_gen_attr_group = {
5289	.name = "lru_gen",
5290	.attrs = lru_gen_attrs,
5291	};
5292
5293	/******************************************************************************
5294	* debugfs interface
5295	******************************************************************************/
5296
5297	static void lru_gen_seq_start(struct* seq_file m, loff_t pos)
5298	{
5299	struct mem_cgroup *memcg;
5300	loff_t nr_to_skip = *pos;
5301
5302	m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
5303	if (!m->private)
5304	return ERR_PTR(error: -ENOMEM);
5305
5306	memcg = mem_cgroup_iter(NULL, NULL, NULL);
5307	do {
5308	int nid;
5309
5310	for_each_node_state(nid, N_MEMORY) {
5311	if (!nr_to_skip--)
5312	return get_lruvec(memcg, nid);
5313	}
5314	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5315
5316	return NULL;
5317	}
5318
5319	static void lru_gen_seq_stop(struct seq_file m, void* *v)
5320	{
5321	if (!IS_ERR_OR_NULL(ptr: v))
5322	mem_cgroup_iter_break(NULL, lruvec_memcg(lruvec: v));
5323
5324	kvfree(addr: m->private);
5325	m->private = NULL;
5326	}
5327
5328	static void lru_gen_seq_next(struct* seq_file m, void* v, loff_t pos)
5329	{
5330	int nid = lruvec_pgdat(lruvec: v)->node_id;
5331	struct mem_cgroup *memcg = lruvec_memcg(lruvec: v);
5332
5333	++*pos;
5334
5335	nid = next_memory_node(nid);
5336	if (nid == MAX_NUMNODES) {
5337	memcg = mem_cgroup_iter(NULL, memcg, NULL);
5338	if (!memcg)
5339	return NULL;
5340
5341	nid = first_memory_node;
5342	}
5343
5344	return get_lruvec(memcg, nid);
5345	}
5346
5347	static void lru_gen_seq_show_full(struct seq_file m, struct* lruvec *lruvec,
5348	unsigned long max_seq, unsigned long *min_seq,
5349	unsigned long seq)
5350	{
5351	int i;
5352	int type, tier;
5353	int hist = lru_hist_from_seq(seq);
5354	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5355	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5356
5357	for (tier = `0`; tier < MAX_NR_TIERS; tier++) {
5358	seq_printf(m, fmt: " %10d", tier);
5359	for (type = `0`; type < ANON_AND_FILE; type++) {
5360	const char *s = "xxx";
5361	unsigned long n[`3`] = {};
5362
5363	if (seq == max_seq) {
5364	s = "RTx";
5365	n[`0`] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
5366	n[`1`] = READ_ONCE(lrugen->avg_total[type][tier]);
5367	} else if (seq == min_seq[type] \|\| NR_HIST_GENS > `1`) {
5368	s = "rep";
5369	n[`0`] = atomic_long_read(v: &lrugen->refaulted[hist][type][tier]);
5370	n[`1`] = atomic_long_read(v: &lrugen->evicted[hist][type][tier]);
5371	n[`2`] = READ_ONCE(lrugen->protected[hist][type][tier]);
5372	}
5373
5374	for (i = `0`; i < `3`; i++)
5375	seq_printf(m, fmt: " %10lu%c", n[i], s[i]);
5376	}
5377	seq_putc(m, c: `'\n'`);
5378	}
5379
5380	if (!mm_state)
5381	return;
5382
5383	seq_puts(m, s: " ");
5384	for (i = `0`; i < NR_MM_STATS; i++) {
5385	const char *s = "xxxx";
5386	unsigned long n = `0`;
5387
5388	if (seq == max_seq && NR_HIST_GENS == `1`) {
5389	s = "TYFA";
5390	n = READ_ONCE(mm_state->stats[hist][i]);
5391	} else if (seq != max_seq && NR_HIST_GENS > `1`) {
5392	s = "tyfa";
5393	n = READ_ONCE(mm_state->stats[hist][i]);
5394	}
5395
5396	seq_printf(m, fmt: " %10lu%c", n, s[i]);
5397	}
5398	seq_putc(m, c: `'\n'`);
5399	}
5400
5401	/ see Documentation/admin-guide/mm/multigen_lru.rst for details /
5402	static int lru_gen_seq_show(struct seq_file m, void* *v)
5403	{
5404	unsigned long seq;
5405	bool full = debugfs_get_aux_num(m->file);
5406	struct lruvec *lruvec = v;
5407	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5408	int nid = lruvec_pgdat(lruvec)->node_id;
5409	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5410	DEFINE_MAX_SEQ(lruvec);
5411	DEFINE_MIN_SEQ(lruvec);
5412
5413	if (nid == first_memory_node) {
5414	const char *path = memcg ? m->private : "";
5415
5416	#ifdef CONFIG_MEMCG
5417	if (memcg)
5418	cgroup_path(cgrp: memcg->css.cgroup, buf: m->private, PATH_MAX);
5419	#endif
5420	seq_printf(m, fmt: "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
5421	}
5422
5423	seq_printf(m, fmt: " node %5d\n", nid);
5424
5425	if (!full)
5426	seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / `2`);
5427	else if (max_seq >= MAX_NR_GENS)
5428	seq = max_seq - MAX_NR_GENS + `1`;
5429	else
5430	seq = `0`;
5431
5432	for (; seq <= max_seq; seq++) {
5433	int type, zone;
5434	int gen = lru_gen_from_seq(seq);
5435	unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
5436
5437	seq_printf(m, fmt: " %10lu %10u", seq, jiffies_to_msecs(j: jiffies - birth));
5438
5439	for (type = `0`; type < ANON_AND_FILE; type++) {
5440	unsigned long size = `0`;
5441	char mark = full && seq < min_seq[type] ? `'x'` : `' '`;
5442
5443	for (zone = `0`; zone < MAX_NR_ZONES; zone++)
5444	size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), `0L`);
5445
5446	seq_printf(m, fmt: " %10lu%c", size, mark);
5447	}
5448
5449	seq_putc(m, c: `'\n'`);
5450
5451	if (full)
5452	lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
5453	}
5454
5455	return `0`;
5456	}
5457
5458	static const struct seq_operations lru_gen_seq_ops = {
5459	.start = lru_gen_seq_start,
5460	.stop = lru_gen_seq_stop,
5461	.next = lru_gen_seq_next,
5462	.show = lru_gen_seq_show,
5463	};
5464
5465	static int run_aging(struct lruvec lruvec, unsigned* long seq,
5466	int swappiness, bool force_scan)
5467	{
5468	DEFINE_MAX_SEQ(lruvec);
5469
5470	if (seq > max_seq)
5471	return -EINVAL;
5472
5473	return try_to_inc_max_seq(lruvec, seq: max_seq, swappiness, force_scan) ? `0` : -EEXIST;
5474	}
5475
5476	static int run_eviction(struct lruvec lruvec, unsigned* long seq, struct scan_control *sc,
5477	int swappiness, unsigned long nr_to_reclaim)
5478	{
5479	DEFINE_MAX_SEQ(lruvec);
5480
5481	if (seq + MIN_NR_GENS > max_seq)
5482	return -EINVAL;
5483
5484	sc->nr_reclaimed = `0`;
5485
5486	while (!signal_pending(current)) {
5487	DEFINE_MIN_SEQ(lruvec);
5488
5489	if (seq < evictable_min_seq(min_seq, swappiness))
5490	return `0`;
5491
5492	if (sc->nr_reclaimed >= nr_to_reclaim)
5493	return `0`;
5494
5495	if (!evict_folios(nr_to_scan: nr_to_reclaim - sc->nr_reclaimed, lruvec, sc,
5496	swappiness))
5497	return `0`;
5498
5499	cond_resched();
5500	}
5501
5502	return -EINTR;
5503	}
5504
5505	static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
5506	struct scan_control sc, int* swappiness, unsigned long opt)
5507	{
5508	struct lruvec *lruvec;
5509	int err = -EINVAL;
5510	struct mem_cgroup *memcg = NULL;
5511
5512	if (nid < `0` \|\| nid >= MAX_NUMNODES \|\| !node_state(node: nid, state: N_MEMORY))
5513	return -EINVAL;
5514
5515	if (!mem_cgroup_disabled()) {
5516	rcu_read_lock();
5517
5518	memcg = mem_cgroup_from_id(id: memcg_id);
5519	if (!mem_cgroup_tryget(memcg))
5520	memcg = NULL;
5521
5522	rcu_read_unlock();
5523
5524	if (!memcg)
5525	return -EINVAL;
5526	}
5527
5528	if (memcg_id != mem_cgroup_id(memcg))
5529	goto done;
5530
5531	sc->target_mem_cgroup = memcg;
5532	lruvec = get_lruvec(memcg, nid);
5533
5534	if (swappiness < MIN_SWAPPINESS)
5535	swappiness = get_swappiness(lruvec, sc);
5536	else if (swappiness > SWAPPINESS_ANON_ONLY)
5537	goto done;
5538
5539	switch (cmd) {
5540	case `'+'`:
5541	err = run_aging(lruvec, seq, swappiness, force_scan: opt);
5542	break;
5543	case `'-'`:
5544	err = run_eviction(lruvec, seq, sc, swappiness, nr_to_reclaim: opt);
5545	break;
5546	}
5547	done:
5548	mem_cgroup_put(memcg);
5549
5550	return err;
5551	}
5552
5553	/ see Documentation/admin-guide/mm/multigen_lru.rst for details /
5554	static ssize_t lru_gen_seq_write(struct file file, const* char __user *src,
5555	size_t len, loff_t *pos)
5556	{
5557	void *buf;
5558	char cur, next;
5559	unsigned int flags;
5560	struct blk_plug plug;
5561	int err = -EINVAL;
5562	struct scan_control sc = {
5563	.may_writepage = true,
5564	.may_unmap = true,
5565	.may_swap = true,
5566	.reclaim_idx = MAX_NR_ZONES - `1`,
5567	.gfp_mask = GFP_KERNEL,
5568	.proactive = true,
5569	};
5570
5571	buf = kvmalloc(len + `1`, GFP_KERNEL);
5572	if (!buf)
5573	return -ENOMEM;
5574
5575	if (copy_from_user(to: buf, from: src, n: len)) {
5576	kvfree(addr: buf);
5577	return -EFAULT;
5578	}
5579
5580	set_task_reclaim_state(current, rs: &sc.reclaim_state);
5581	flags = memalloc_noreclaim_save();
5582	blk_start_plug(&plug);
5583	if (!set_mm_walk(NULL, force_alloc: true)) {
5584	err = -ENOMEM;
5585	goto done;
5586	}
5587
5588	next = buf;
5589	next[len] = `'\0'`;
5590
5591	while ((cur = strsep(&next, ",;\n"))) {
5592	int n;
5593	int end;
5594	char cmd, swap_string[`5`];
5595	unsigned int memcg_id;
5596	unsigned int nid;
5597	unsigned long seq;
5598	unsigned int swappiness;
5599	unsigned long opt = -`1`;
5600
5601	cur = skip_spaces(cur);
5602	if (!*cur)
5603	continue;
5604
5605	n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
5606	&seq, &end, swap_string, &end, &opt, &end);
5607	if (n < `4` \|\| cur[end]) {
5608	err = -EINVAL;
5609	break;
5610	}
5611
5612	if (n == `4`) {
5613	swappiness = -`1`;
5614	} else if (!strcmp("max", swap_string)) {
5615	/ set by userspace for anonymous memory only /
5616	swappiness = SWAPPINESS_ANON_ONLY;
5617	} else {
5618	err = kstrtouint(s: swap_string, base: `0`, res: &swappiness);
5619	if (err)
5620	break;
5621	}
5622
5623	err = run_cmd(cmd, memcg_id, nid, seq, sc: &sc, swappiness, opt);
5624	if (err)
5625	break;
5626	}
5627	done:
5628	clear_mm_walk();
5629	blk_finish_plug(&plug);
5630	memalloc_noreclaim_restore(flags);
5631	set_task_reclaim_state(current, NULL);
5632
5633	kvfree(addr: buf);
5634
5635	return err ? : len;
5636	}
5637
5638	static int lru_gen_seq_open(struct inode inode, struct* file *file)
5639	{
5640	return seq_open(file, &lru_gen_seq_ops);
5641	}
5642
5643	static const struct file_operations lru_gen_rw_fops = {
5644	.open = lru_gen_seq_open,
5645	.read = seq_read,
5646	.write = lru_gen_seq_write,
5647	.llseek = seq_lseek,
5648	.release = seq_release,
5649	};
5650
5651	static const struct file_operations lru_gen_ro_fops = {
5652	.open = lru_gen_seq_open,
5653	.read = seq_read,
5654	.llseek = seq_lseek,
5655	.release = seq_release,
5656	};
5657
5658	/******************************************************************************
5659	* initialization
5660	******************************************************************************/
5661
5662	void lru_gen_init_pgdat(struct pglist_data *pgdat)
5663	{
5664	int i, j;
5665
5666	spin_lock_init(&pgdat->memcg_lru.lock);
5667
5668	for (i = `0`; i < MEMCG_NR_GENS; i++) {
5669	for (j = `0`; j < MEMCG_NR_BINS; j++)
5670	INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
5671	}
5672	}
5673
5674	void lru_gen_init_lruvec(struct lruvec *lruvec)
5675	{
5676	int i;
5677	int gen, type, zone;
5678	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5679	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5680
5681	lrugen->max_seq = MIN_NR_GENS + `1`;
5682	lrugen->enabled = lru_gen_enabled();
5683
5684	for (i = `0`; i <= MIN_NR_GENS + `1`; i++)
5685	lrugen->timestamps[i] = jiffies;
5686
5687	for_each_gen_type_zone(gen, type, zone)
5688	INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
5689
5690	if (mm_state)
5691	mm_state->seq = MIN_NR_GENS;
5692	}
5693
5694	#ifdef CONFIG_MEMCG
5695
5696	void lru_gen_init_memcg(struct mem_cgroup *memcg)
5697	{
5698	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
5699
5700	if (!mm_list)
5701	return;
5702
5703	INIT_LIST_HEAD(list: &mm_list->fifo);
5704	spin_lock_init(&mm_list->lock);
5705	}
5706
5707	void lru_gen_exit_memcg(struct mem_cgroup *memcg)
5708	{
5709	int i;
5710	int nid;
5711	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
5712
5713	VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo));
5714
5715	for_each_node(nid) {
5716	struct lruvec *lruvec = get_lruvec(memcg, nid);
5717	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5718
5719	VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, `0`,
5720	sizeof(lruvec->lrugen.nr_pages)));
5721
5722	lruvec->lrugen.list.next = LIST_POISON1;
5723
5724	if (!mm_state)
5725	continue;
5726
5727	for (i = `0`; i < NR_BLOOM_FILTERS; i++) {
5728	bitmap_free(bitmap: mm_state->filters[i]);
5729	mm_state->filters[i] = NULL;
5730	}
5731	}
5732	}
5733
5734	#endif /* CONFIG_MEMCG */
5735
5736	static int __init init_lru_gen(void)
5737	{
5738	BUILD_BUG_ON(MIN_NR_GENS + `1` >= MAX_NR_GENS);
5739	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
5740
5741	if (sysfs_create_group(kobj: mm_kobj, grp: &lru_gen_attr_group))
5742	pr_err("lru_gen: failed to create sysfs group\n");
5743
5744	debugfs_create_file_aux_num("lru_gen", `0644`, NULL, NULL, false,
5745	&lru_gen_rw_fops);
5746	debugfs_create_file_aux_num("lru_gen_full", `0444`, NULL, NULL, true,
5747	&lru_gen_ro_fops);
5748
5749	return `0`;
5750	};
5751	late_initcall(init_lru_gen);
5752
5753	#else /* !CONFIG_LRU_GEN */
5754
/* !CONFIG_LRU_GEN stub: any call that survives to code generation breaks the build */
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
	BUILD_BUG();
}
5759
/* !CONFIG_LRU_GEN stub: any call that survives to code generation breaks the build */
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	BUILD_BUG();
}
5764
/* !CONFIG_LRU_GEN stub: any call that survives to code generation breaks the build */
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
	BUILD_BUG();
}
5769
5770	#endif /* CONFIG_LRU_GEN */
5771
5772	static void shrink_lruvec(struct lruvec lruvec, struct* scan_control *sc)
5773	{
5774	unsigned long nr[NR_LRU_LISTS];
5775	unsigned long targets[NR_LRU_LISTS];
5776	unsigned long nr_to_scan;
5777	enum lru_list lru;
5778	unsigned long nr_reclaimed = `0`;
5779	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
5780	bool proportional_reclaim;
5781	struct blk_plug plug;
5782
5783	if (lru_gen_enabled() && !root_reclaim(sc)) {
5784	lru_gen_shrink_lruvec(lruvec, sc);
5785	return;
5786	}
5787
5788	get_scan_count(lruvec, sc, nr);
5789
5790	/ Record the original scan target for proportional adjustments later /
5791	memcpy(targets, nr, sizeof(nr));
5792
5793	/*
5794	* Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
5795	* event that can occur when there is little memory pressure e.g.
5796	* multiple streaming readers/writers. Hence, we do not abort scanning
5797	* when the requested number of pages are reclaimed when scanning at
5798	* DEF_PRIORITY on the assumption that the fact we are direct
5799	* reclaiming implies that kswapd is not keeping up and it is best to
5800	* do a batch of work at once. For memcg reclaim one check is made to
5801	* abort proportional reclaim if either the file or anon lru has already
5802	* dropped to zero at the first pass.
5803	*/
5804	proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
5805	sc->priority == DEF_PRIORITY);
5806
5807	blk_start_plug(&plug);
5808	while (nr[LRU_INACTIVE_ANON] \|\| nr[LRU_ACTIVE_FILE] \|\|
5809	nr[LRU_INACTIVE_FILE]) {
5810	unsigned long nr_anon, nr_file, percentage;
5811	unsigned long nr_scanned;
5812
5813	for_each_evictable_lru(lru) {
5814	if (nr[lru]) {
5815	nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
5816	nr[lru] -= nr_to_scan;
5817
5818	nr_reclaimed += shrink_list(lru, nr_to_scan,
5819	lruvec, sc);
5820	}
5821	}
5822
5823	cond_resched();
5824
5825	if (nr_reclaimed < nr_to_reclaim \|\| proportional_reclaim)
5826	continue;
5827
5828	/*
5829	* For kswapd and memcg, reclaim at least the number of pages
5830	* requested. Ensure that the anon and file LRUs are scanned
5831	* proportionally what was requested by get_scan_count(). We
5832	* stop reclaiming one LRU and reduce the amount scanning
5833	* proportional to the original scan target.
5834	*/
5835	nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
5836	nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
5837
5838	/*
5839	* It's just vindictive to attack the larger once the smaller
5840	* has gone to zero. And given the way we stop scanning the
5841	* smaller below, this makes sure that we only make one nudge
5842	* towards proportionality once we've got nr_to_reclaim.
5843	*/
5844	if (!nr_file \|\| !nr_anon)
5845	break;
5846
5847	if (nr_file > nr_anon) {
5848	unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
5849	targets[LRU_ACTIVE_ANON] + `1`;
5850	lru = LRU_BASE;
5851	percentage = nr_anon * `100` / scan_target;
5852	} else {
5853	unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
5854	targets[LRU_ACTIVE_FILE] + `1`;
5855	lru = LRU_FILE;
5856	percentage = nr_file * `100` / scan_target;
5857	}
5858
5859	/ Stop scanning the smaller of the LRU /
5860	nr[lru] = `0`;
5861	nr[lru + LRU_ACTIVE] = `0`;
5862
5863	/*
5864	* Recalculate the other LRU scan count based on its original
5865	* scan target and the percentage scanning already complete
5866	*/
5867	lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
5868	nr_scanned = targets[lru] - nr[lru];
5869	nr[lru] = targets[lru] * (`100` - percentage) / `100`;
5870	nr[lru] -= min(nr[lru], nr_scanned);
5871
5872	lru += LRU_ACTIVE;
5873	nr_scanned = targets[lru] - nr[lru];
5874	nr[lru] = targets[lru] * (`100` - percentage) / `100`;
5875	nr[lru] -= min(nr[lru], nr_scanned);
5876	}
5877	blk_finish_plug(&plug);
5878	sc->nr_reclaimed += nr_reclaimed;
5879
5880	/*
5881	* Even if we did not try to evict anon pages at all, we want to
5882	* rebalance the anon lru active/inactive ratio.
5883	*/
5884	if (can_age_anon_pages(lruvec, sc) &&
5885	inactive_is_low(lruvec, inactive_lru: LRU_INACTIVE_ANON))
5886	shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
5887	sc, lru: LRU_ACTIVE_ANON);
5888	}
5889
5890	/ Use reclaim/compaction for costly allocs or under memory pressure /
5891	static bool in_reclaim_compaction(struct scan_control *sc)
5892	{
5893	if (gfp_compaction_allowed(gfp_mask: sc->gfp_mask) && sc->order &&
5894	(sc->order > PAGE_ALLOC_COSTLY_ORDER \|\|
5895	sc->priority < DEF_PRIORITY - `2`))
5896	return true;
5897
5898	return false;
5899	}
5900
5901	/*
5902	* Reclaim/compaction is used for high-order allocation requests. It reclaims
5903	* order-0 pages before compacting the zone. should_continue_reclaim() returns
5904	* true if more pages should be reclaimed such that when the page allocator
5905	* calls try_to_compact_pages() that it will have enough free pages to succeed.
5906	* It will give up earlier than that if there is difficulty reclaiming pages.
5907	*/
5908	static inline bool should_continue_reclaim(struct pglist_data *pgdat,
5909	unsigned long nr_reclaimed,
5910	struct scan_control *sc)
5911	{
5912	unsigned long pages_for_compaction;
5913	unsigned long inactive_lru_pages;
5914	int z;
5915	struct zone *zone;
5916
5917	/ If not in reclaim/compaction mode, stop /
5918	if (!in_reclaim_compaction(sc))
5919	return false;
5920
5921	/*
5922	* Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
5923	* number of pages that were scanned. This will return to the caller
5924	* with the risk reclaim/compaction and the resulting allocation attempt
5925	* fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
5926	* allocations through requiring that the full LRU list has been scanned
5927	* first, by assuming that zero delta of sc->nr_scanned means full LRU
5928	* scan, but that approximation was wrong, and there were corner cases
5929	* where always a non-zero amount of pages were scanned.
5930	*/
5931	if (!nr_reclaimed)
5932	return false;
5933
5934	/ If compaction would go ahead or the allocation would succeed, stop /
5935	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
5936	unsigned long watermark = min_wmark_pages(z: zone);
5937
5938	/ Allocation can already succeed, nothing to do /
5939	if (zone_watermark_ok(z: zone, order: sc->order, mark: watermark,
5940	highest_zoneidx: sc->reclaim_idx, alloc_flags: `0`))
5941	return false;
5942
5943	if (compaction_suitable(zone, order: sc->order, watermark,
5944	highest_zoneidx: sc->reclaim_idx))
5945	return false;
5946	}
5947
5948	/*
5949	* If we have not reclaimed enough pages for compaction and the
5950	* inactive lists are large enough, continue reclaiming
5951	*/
5952	pages_for_compaction = compact_gap(order: sc->order);
5953	inactive_lru_pages = node_page_state(pgdat, item: NR_INACTIVE_FILE);
5954	if (can_reclaim_anon_pages(NULL, nid: pgdat->node_id, sc))
5955	inactive_lru_pages += node_page_state(pgdat, item: NR_INACTIVE_ANON);
5956
5957	return inactive_lru_pages > pages_for_compaction;
5958	}
5959
5960	static void shrink_node_memcgs(pg_data_t pgdat, struct* scan_control *sc)
5961	{
5962	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
5963	struct mem_cgroup_reclaim_cookie reclaim = {
5964	.pgdat = pgdat,
5965	};
5966	struct mem_cgroup_reclaim_cookie *partial = &reclaim;
5967	struct mem_cgroup *memcg;
5968
5969	/*
5970	* In most cases, direct reclaimers can do partial walks
5971	* through the cgroup tree, using an iterator state that
5972	* persists across invocations. This strikes a balance between
5973	* fairness and allocation latency.
5974	*
5975	* For kswapd, reliable forward progress is more important
5976	* than a quick return to idle. Always do full walks.
5977	*/
5978	if (current_is_kswapd() \|\| sc->memcg_full_walk)
5979	partial = NULL;
5980
5981	memcg = mem_cgroup_iter(target_memcg, NULL, partial);
5982	do {
5983	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
5984	unsigned long reclaimed;
5985	unsigned long scanned;
5986
5987	/*
5988	* This loop can become CPU-bound when target memcgs
5989	* aren't eligible for reclaim - either because they
5990	* don't have any reclaimable pages, or because their
5991	* memory is explicitly protected. Avoid soft lockups.
5992	*/
5993	cond_resched();
5994
5995	mem_cgroup_calculate_protection(root: target_memcg, memcg);
5996
5997	if (mem_cgroup_below_min(target: target_memcg, memcg)) {
5998	/*
5999	* Hard protection.
6000	* If there is no reclaimable memory, OOM.
6001	*/
6002	continue;
6003	} else if (mem_cgroup_below_low(target: target_memcg, memcg)) {
6004	/*
6005	* Soft protection.
6006	* Respect the protection only as long as
6007	* there is an unprotected supply
6008	* of reclaimable memory from other cgroups.
6009	*/
6010	if (!sc->memcg_low_reclaim) {
6011	sc->memcg_low_skipped = `1`;
6012	continue;
6013	}
6014	memcg_memory_event(memcg, event: MEMCG_LOW);
6015	}
6016
6017	reclaimed = sc->nr_reclaimed;
6018	scanned = sc->nr_scanned;
6019
6020	shrink_lruvec(lruvec, sc);
6021
6022	shrink_slab(gfp_mask: sc->gfp_mask, nid: pgdat->node_id, memcg,
6023	priority: sc->priority);
6024
6025	/ Record the group's reclaim efficiency /
6026	if (!sc->proactive)
6027	vmpressure(gfp: sc->gfp_mask, memcg, tree: false,
6028	scanned: sc->nr_scanned - scanned,
6029	reclaimed: sc->nr_reclaimed - reclaimed);
6030
6031	/ If partial walks are allowed, bail once goal is reached /
6032	if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) {
6033	mem_cgroup_iter_break(target_memcg, memcg);
6034	break;
6035	}
6036	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
6037	}
6038
6039	static void shrink_node(pg_data_t pgdat, struct* scan_control *sc)
6040	{
6041	unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
6042	struct lruvec *target_lruvec;
6043	bool reclaimable = false;
6044
6045	if (lru_gen_enabled() && root_reclaim(sc)) {
6046	memset(&sc->nr, `0`, sizeof(sc->nr));
6047	lru_gen_shrink_node(pgdat, sc);
6048	return;
6049	}
6050
6051	target_lruvec = mem_cgroup_lruvec(memcg: sc->target_mem_cgroup, pgdat);
6052
6053	again:
6054	memset(&sc->nr, `0`, sizeof(sc->nr));
6055
6056	nr_reclaimed = sc->nr_reclaimed;
6057	nr_scanned = sc->nr_scanned;
6058
6059	prepare_scan_control(pgdat, sc);
6060
6061	shrink_node_memcgs(pgdat, sc);
6062
6063	flush_reclaim_state(sc);
6064
6065	nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed;
6066
6067	/ Record the subtree's reclaim efficiency /
6068	if (!sc->proactive)
6069	vmpressure(gfp: sc->gfp_mask, memcg: sc->target_mem_cgroup, tree: true,
6070	scanned: sc->nr_scanned - nr_scanned, reclaimed: nr_node_reclaimed);
6071
6072	if (nr_node_reclaimed)
6073	reclaimable = true;
6074
6075	if (current_is_kswapd()) {
6076	/*
6077	* If reclaim is isolating dirty pages under writeback,
6078	* it implies that the long-lived page allocation rate
6079	* is exceeding the page laundering rate. Either the
6080	* global limits are not being effective at throttling
6081	* processes due to the page distribution throughout
6082	* zones or there is heavy usage of a slow backing
6083	* device. The only option is to throttle from reclaim
6084	* context which is not ideal as there is no guarantee
6085	* the dirtying process is throttled in the same way
6086	* balance_dirty_pages() manages.
6087	*
6088	* Once a node is flagged PGDAT_WRITEBACK, kswapd will
6089	* count the number of pages under pages flagged for
6090	* immediate reclaim and stall if any are encountered
6091	* in the nr_immediate check below.
6092	*/
6093	if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
6094	set_bit(nr: PGDAT_WRITEBACK, addr: &pgdat->flags);
6095
6096	/*
6097	* If kswapd scans pages marked for immediate
6098	* reclaim and under writeback (nr_immediate), it
6099	* implies that pages are cycling through the LRU
6100	* faster than they are written so forcibly stall
6101	* until some pages complete writeback.
6102	*/
6103	if (sc->nr.immediate)
6104	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_WRITEBACK);
6105	}
6106
6107	/*
6108	* Tag a node/memcg as congested if all the dirty pages were marked
6109	* for writeback and immediate reclaim (counted in nr.congested).
6110	*
6111	* Legacy memcg will stall in page writeback so avoid forcibly
6112	* stalling in reclaim_throttle().
6113	*/
6114	if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) {
6115	if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
6116	set_bit(nr: LRUVEC_CGROUP_CONGESTED, addr: &target_lruvec->flags);
6117
6118	if (current_is_kswapd())
6119	set_bit(nr: LRUVEC_NODE_CONGESTED, addr: &target_lruvec->flags);
6120	}
6121
6122	/*
6123	* Stall direct reclaim for IO completions if the lruvec is
6124	* node is congested. Allow kswapd to continue until it
6125	* starts encountering unqueued dirty pages or cycling through
6126	* the LRU too quickly.
6127	*/
6128	if (!current_is_kswapd() && current_may_throttle() &&
6129	!sc->hibernation_mode &&
6130	(test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) \|\|
6131	test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
6132	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_CONGESTED);
6133
6134	if (should_continue_reclaim(pgdat, nr_reclaimed: nr_node_reclaimed, sc))
6135	goto again;
6136
6137	/*
6138	* Kswapd gives up on balancing particular nodes after too
6139	* many failures to reclaim anything from them and goes to
6140	* sleep. On reclaim progress, reset the failure counter. A
6141	* successful direct reclaim run will revive a dormant kswapd.
6142	*/
6143	if (reclaimable)
6144	atomic_set(v: &pgdat->kswapd_failures, i: `0`);
6145	else if (sc->cache_trim_mode)
6146	sc->cache_trim_mode_failed = `1`;
6147	}
6148
6149	/*
6150	* Returns true if compaction should go ahead for a costly-order request, or
6151	* the allocation would already succeed without compaction. Return false if we
6152	* should reclaim first.
6153	*/
6154	static inline bool compaction_ready(struct zone zone, struct* scan_control *sc)
6155	{
6156	unsigned long watermark;
6157
6158	if (!gfp_compaction_allowed(gfp_mask: sc->gfp_mask))
6159	return false;
6160
6161	/ Allocation can already succeed, nothing to do /
6162	if (zone_watermark_ok(z: zone, order: sc->order, mark: min_wmark_pages(z: zone),
6163	highest_zoneidx: sc->reclaim_idx, alloc_flags: `0`))
6164	return true;
6165
6166	/*
6167	* Direct reclaim usually targets the min watermark, but compaction
6168	* takes time to run and there are potentially other callers using the
6169	* pages just freed. So target a higher buffer to give compaction a
6170	* reasonable chance of completing and allocating the pages.
6171	*
6172	* Note that we won't actually reclaim the whole buffer in one attempt
6173	* as the target watermark in should_continue_reclaim() is lower. But if
6174	* we are already above the high+gap watermark, don't reclaim at all.
6175	*/
6176	watermark = high_wmark_pages(z: zone);
6177	if (compaction_suitable(zone, order: sc->order, watermark, highest_zoneidx: sc->reclaim_idx))
6178	return true;
6179
6180	return false;
6181	}
6182
6183	static void consider_reclaim_throttle(pg_data_t pgdat, struct* scan_control *sc)
6184	{
6185	/*
6186	* If reclaim is making progress greater than 12% efficiency then
6187	* wake all the NOPROGRESS throttled tasks.
6188	*/
6189	if (sc->nr_reclaimed > (sc->nr_scanned >> `3`)) {
6190	wait_queue_head_t *wqh;
6191
6192	wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
6193	if (waitqueue_active(wq_head: wqh))
6194	wake_up(wqh);
6195
6196	return;
6197	}
6198
6199	/*
6200	* Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
6201	* throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
6202	* under writeback and marked for immediate reclaim at the tail of the
6203	* LRU.
6204	*/
6205	if (current_is_kswapd() \|\| cgroup_reclaim(sc))
6206	return;
6207
6208	/ Throttle if making no progress at high prioities. /
6209	if (sc->priority == `1` && !sc->nr_reclaimed)
6210	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_NOPROGRESS);
6211	}
6212
6213	/*
6214	* This is the direct reclaim path, for page-allocating processes. We only
6215	* try to reclaim pages from zones which will satisfy the caller's allocation
6216	* request.
6217	*
6218	* If a zone is deemed to be full of pinned pages then just give it a light
6219	* scan then give up on it.
6220	*/
6221	static void shrink_zones(struct zonelist zonelist, struct* scan_control *sc)
6222	{
6223	struct zoneref *z;
6224	struct zone *zone;
6225	unsigned long nr_soft_reclaimed;
6226	unsigned long nr_soft_scanned;
6227	gfp_t orig_mask;
6228	pg_data_t *last_pgdat = NULL;
6229	pg_data_t *first_pgdat = NULL;
6230
6231	/*
6232	* If the number of buffer_heads in the machine exceeds the maximum
6233	* allowed level, force direct reclaim to scan the highmem zone as
6234	* highmem pages could be pinning lowmem pages storing buffer_heads
6235	*/
6236	orig_mask = sc->gfp_mask;
6237	if (buffer_heads_over_limit) {
6238	sc->gfp_mask \|= __GFP_HIGHMEM;
6239	sc->reclaim_idx = gfp_zone(flags: sc->gfp_mask);
6240	}
6241
6242	for_each_zone_zonelist_nodemask(zone, z, zonelist,
6243	sc->reclaim_idx, sc->nodemask) {
6244	/*
6245	* Take care memory controller reclaiming has small influence
6246	* to global LRU.
6247	*/
6248	if (!cgroup_reclaim(sc)) {
6249	if (!cpuset_zone_allowed(z: zone,
6250	GFP_KERNEL \| __GFP_HARDWALL))
6251	continue;
6252
6253	/*
6254	* If we already have plenty of memory free for
6255	* compaction in this zone, don't free any more.
6256	* Even though compaction is invoked for any
6257	* non-zero order, only frequent costly order
6258	* reclamation is disruptive enough to become a
6259	* noticeable problem, like transparent huge
6260	* page allocations.
6261	*/
6262	if (IS_ENABLED(CONFIG_COMPACTION) &&
6263	sc->order > PAGE_ALLOC_COSTLY_ORDER &&
6264	compaction_ready(zone, sc)) {
6265	sc->compaction_ready = true;
6266	continue;
6267	}
6268
6269	/*
6270	* Shrink each node in the zonelist once. If the
6271	* zonelist is ordered by zone (not the default) then a
6272	* node may be shrunk multiple times but in that case
6273	* the user prefers lower zones being preserved.
6274	*/
6275	if (zone->zone_pgdat == last_pgdat)
6276	continue;
6277
6278	/*
6279	* This steals pages from memory cgroups over softlimit
6280	* and returns the number of reclaimed pages and
6281	* scanned pages. This works for global memory pressure
6282	* and balancing, not for a memcg's limit.
6283	*/
6284	nr_soft_scanned = `0`;
6285	nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat: zone->zone_pgdat,
6286	order: sc->order, gfp_mask: sc->gfp_mask,
6287	total_scanned: &nr_soft_scanned);
6288	sc->nr_reclaimed += nr_soft_reclaimed;
6289	sc->nr_scanned += nr_soft_scanned;
6290	/ need some check for avoid more shrink_zone() /
6291	}
6292
6293	if (!first_pgdat)
6294	first_pgdat = zone->zone_pgdat;
6295
6296	/ See comment about same check for global reclaim above /
6297	if (zone->zone_pgdat == last_pgdat)
6298	continue;
6299	last_pgdat = zone->zone_pgdat;
6300	shrink_node(pgdat: zone->zone_pgdat, sc);
6301	}
6302
6303	if (first_pgdat)
6304	consider_reclaim_throttle(pgdat: first_pgdat, sc);
6305
6306	/*
6307	* Restore to original mask to avoid the impact on the caller if we
6308	* promoted it to __GFP_HIGHMEM.
6309	*/
6310	sc->gfp_mask = orig_mask;
6311	}
6312
6313	static void snapshot_refaults(struct mem_cgroup target_memcg, pg_data_t pgdat)
6314	{
6315	struct lruvec *target_lruvec;
6316	unsigned long refaults;
6317
6318	if (lru_gen_enabled())
6319	return;
6320
6321	target_lruvec = mem_cgroup_lruvec(memcg: target_memcg, pgdat);
6322	refaults = lruvec_page_state(lruvec: target_lruvec, idx: WORKINGSET_ACTIVATE_ANON);
6323	target_lruvec->refaults[WORKINGSET_ANON] = refaults;
6324	refaults = lruvec_page_state(lruvec: target_lruvec, idx: WORKINGSET_ACTIVATE_FILE);
6325	target_lruvec->refaults[WORKINGSET_FILE] = refaults;
6326	}
6327
6328	/*
6329	* This is the main entry point to direct page reclaim.
6330	*
6331	* If a full scan of the inactive list fails to free enough memory then we
6332	* are "out of memory" and something needs to be killed.
6333	*
6334	* If the caller is !__GFP_FS then the probability of a failure is reasonably
6335	* high - the zone may be full of dirty or under-writeback pages, which this
6336	* caller can't do much about. We kick the writeback threads and take explicit
6337	* naps in the hope that some of these pages can be written. But if the
6338	* allocating task holds filesystem locks which prevent writeout this might not
6339	* work, and the allocation attempt will fail.
6340	*
6341	* returns: 0, if no pages reclaimed
6342	* else, the number of pages reclaimed
6343	*/
6344	static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
6345	struct scan_control *sc)
6346	{
6347	int initial_priority = sc->priority;
6348	pg_data_t *last_pgdat;
6349	struct zoneref *z;
6350	struct zone *zone;
6351	retry:
6352	delayacct_freepages_start();
6353
6354	if (!cgroup_reclaim(sc))
6355	__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, `1`);
6356
6357	do {
6358	if (!sc->proactive)
6359	vmpressure_prio(gfp: sc->gfp_mask, memcg: sc->target_mem_cgroup,
6360	prio: sc->priority);
6361	sc->nr_scanned = `0`;
6362	shrink_zones(zonelist, sc);
6363
6364	if (sc->nr_reclaimed >= sc->nr_to_reclaim)
6365	break;
6366
6367	if (sc->compaction_ready)
6368	break;
6369
6370	/*
6371	* If we're getting trouble reclaiming, start doing
6372	* writepage even in laptop mode.
6373	*/
6374	if (sc->priority < DEF_PRIORITY - `2`)
6375	sc->may_writepage = `1`;
6376	} while (--sc->priority >= `0`);
6377
6378	last_pgdat = NULL;
6379	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
6380	sc->nodemask) {
6381	if (zone->zone_pgdat == last_pgdat)
6382	continue;
6383	last_pgdat = zone->zone_pgdat;
6384
6385	snapshot_refaults(target_memcg: sc->target_mem_cgroup, pgdat: zone->zone_pgdat);
6386
6387	if (cgroup_reclaim(sc)) {
6388	struct lruvec *lruvec;
6389
6390	lruvec = mem_cgroup_lruvec(memcg: sc->target_mem_cgroup,
6391	pgdat: zone->zone_pgdat);
6392	clear_bit(nr: LRUVEC_CGROUP_CONGESTED, addr: &lruvec->flags);
6393	}
6394	}
6395
6396	delayacct_freepages_end();
6397
6398	if (sc->nr_reclaimed)
6399	return sc->nr_reclaimed;
6400
6401	/ Aborted reclaim to try compaction? don't OOM, then /
6402	if (sc->compaction_ready)
6403	return `1`;
6404
6405	/*
6406	* In most cases, direct reclaimers can do partial walks
6407	* through the cgroup tree to meet the reclaim goal while
6408	* keeping latency low. Since the iterator state is shared
6409	* among all direct reclaim invocations (to retain fairness
6410	* among cgroups), though, high concurrency can result in
6411	* individual threads not seeing enough cgroups to make
6412	* meaningful forward progress. Avoid false OOMs in this case.
6413	*/
6414	if (!sc->memcg_full_walk) {
6415	sc->priority = initial_priority;
6416	sc->memcg_full_walk = `1`;
6417	goto retry;
6418	}
6419
6420	/*
6421	* We make inactive:active ratio decisions based on the node's
6422	* composition of memory, but a restrictive reclaim_idx or a
6423	* memory.low cgroup setting can exempt large amounts of
6424	* memory from reclaim. Neither of which are very common, so
6425	* instead of doing costly eligibility calculations of the
6426	* entire cgroup subtree up front, we assume the estimates are
6427	* good, and retry with forcible deactivation if that fails.
6428	*/
6429	if (sc->skipped_deactivate) {
6430	sc->priority = initial_priority;
6431	sc->force_deactivate = `1`;
6432	sc->skipped_deactivate = `0`;
6433	goto retry;
6434	}
6435
6436	/ Untapped cgroup reserves? Don't OOM, retry. /
6437	if (sc->memcg_low_skipped) {
6438	sc->priority = initial_priority;
6439	sc->force_deactivate = `0`;
6440	sc->memcg_low_reclaim = `1`;
6441	sc->memcg_low_skipped = `0`;
6442	goto retry;
6443	}
6444
6445	return `0`;
6446	}
6447
6448	static bool allow_direct_reclaim(pg_data_t *pgdat)
6449	{
6450	struct zone *zone;
6451	unsigned long pfmemalloc_reserve = `0`;
6452	unsigned long free_pages = `0`;
6453	int i;
6454	bool wmark_ok;
6455
6456	if (atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
6457	return true;
6458
6459	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
6460	if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, item: NR_FREE_PAGES))
6461	continue;
6462
6463	pfmemalloc_reserve += min_wmark_pages(z: zone);
6464	free_pages += zone_page_state_snapshot(zone, item: NR_FREE_PAGES);
6465	}
6466
6467	/ If there are no reserves (unexpected config) then do not throttle /
6468	if (!pfmemalloc_reserve)
6469	return true;
6470
6471	wmark_ok = free_pages > pfmemalloc_reserve / `2`;
6472
6473	/ kswapd must be awake if processes are being throttled /
6474	if (!wmark_ok && waitqueue_active(wq_head: &pgdat->kswapd_wait)) {
6475	if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
6476	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
6477
6478	wake_up_interruptible(&pgdat->kswapd_wait);
6479	}
6480
6481	return wmark_ok;
6482	}
6483
6484	/*
6485	* Throttle direct reclaimers if backing storage is backed by the network
6486	* and the PFMEMALLOC reserve for the preferred node is getting dangerously
6487	* depleted. kswapd will continue to make progress and wake the processes
6488	* when the low watermark is reached.
6489	*
6490	* Returns true if a fatal signal was delivered during throttling. If this
6491	* happens, the page allocator should not consider triggering the OOM killer.
6492	*/
6493	static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
6494	nodemask_t *nodemask)
6495	{
6496	struct zoneref *z;
6497	struct zone *zone;
6498	pg_data_t *pgdat = NULL;
6499
6500	/*
6501	* Kernel threads should not be throttled as they may be indirectly
6502	* responsible for cleaning pages necessary for reclaim to make forward
6503	* progress. kjournald for example may enter direct reclaim while
6504	* committing a transaction where throttling it could forcing other
6505	* processes to block on log_wait_commit().
6506	*/
6507	if (current->flags & PF_KTHREAD)
6508	goto out;
6509
6510	/*
6511	* If a fatal signal is pending, this process should not throttle.
6512	* It should return quickly so it can exit and free its memory
6513	*/
6514	if (fatal_signal_pending(current))
6515	goto out;
6516
6517	/*
6518	* Check if the pfmemalloc reserves are ok by finding the first node
6519	* with a usable ZONE_NORMAL or lower zone. The expectation is that
6520	* GFP_KERNEL will be required for allocating network buffers when
6521	* swapping over the network so ZONE_HIGHMEM is unusable.
6522	*
6523	* Throttling is based on the first usable node and throttled processes
6524	* wait on a queue until kswapd makes progress and wakes them. There
6525	* is an affinity then between processes waking up and where reclaim
6526	* progress has been made assuming the process wakes on the same node.
6527	* More importantly, processes running on remote nodes will not compete
6528	* for remote pfmemalloc reserves and processes on different nodes
6529	* should make reasonable progress.
6530	*/
6531	for_each_zone_zonelist_nodemask(zone, z, zonelist,
6532	gfp_zone(gfp_mask), nodemask) {
6533	if (zone_idx(zone) > ZONE_NORMAL)
6534	continue;
6535
6536	/ Throttle based on the first usable node /
6537	pgdat = zone->zone_pgdat;
6538	if (allow_direct_reclaim(pgdat))
6539	goto out;
6540	break;
6541	}
6542
6543	/ If no zone was usable by the allocation flags then do not throttle /
6544	if (!pgdat)
6545	goto out;
6546
6547	/ Account for the throttling /
6548	count_vm_event(item: PGSCAN_DIRECT_THROTTLE);
6549
6550	/*
6551	* If the caller cannot enter the filesystem, it's possible that it
6552	* is due to the caller holding an FS lock or performing a journal
6553	* transaction in the case of a filesystem like ext[3\|4]. In this case,
6554	* it is not safe to block on pfmemalloc_wait as kswapd could be
6555	* blocked waiting on the same lock. Instead, throttle for up to a
6556	* second before continuing.
6557	*/
6558	if (!(gfp_mask & __GFP_FS))
6559	wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
6560	allow_direct_reclaim(pgdat), HZ);
6561	else
6562	/ Throttle until kswapd wakes the process /
6563	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
6564	allow_direct_reclaim(pgdat));
6565
6566	if (fatal_signal_pending(current))
6567	return true;
6568
6569	out:
6570	return false;
6571	}
6572
6573	unsigned long try_to_free_pages(struct zonelist zonelist, int* order,
6574	gfp_t gfp_mask, nodemask_t *nodemask)
6575	{
6576	unsigned long nr_reclaimed;
6577	struct scan_control sc = {
6578	.nr_to_reclaim = SWAP_CLUSTER_MAX,
6579	.gfp_mask = current_gfp_context(flags: gfp_mask),
6580	.reclaim_idx = gfp_zone(flags: gfp_mask),
6581	.order = order,
6582	.nodemask = nodemask,
6583	.priority = DEF_PRIORITY,
6584	.may_writepage = !laptop_mode,
6585	.may_unmap = `1`,
6586	.may_swap = `1`,
6587	};
6588
6589	/*
6590	* scan_control uses s8 fields for order, priority, and reclaim_idx.
6591	* Confirm they are large enough for max values.
6592	*/
6593	BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
6594	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
6595	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
6596
6597	/*
6598	* Do not enter reclaim if fatal signal was delivered while throttled.
6599	* 1 is returned so that the page allocator does not OOM kill at this
6600	* point.
6601	*/
6602	if (throttle_direct_reclaim(gfp_mask: sc.gfp_mask, zonelist, nodemask))
6603	return `1`;
6604
6605	set_task_reclaim_state(current, rs: &sc.reclaim_state);
6606	trace_mm_vmscan_direct_reclaim_begin(order, gfp_flags: sc.gfp_mask);
6607
6608	nr_reclaimed = do_try_to_free_pages(zonelist, sc: &sc);
6609
6610	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
6611	set_task_reclaim_state(current, NULL);
6612
6613	return nr_reclaimed;
6614	}
6615
6616	#ifdef CONFIG_MEMCG
6617
6618	/ Only used by soft limit reclaim. Do not reuse for anything else. /
6619	unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
6620	gfp_t gfp_mask, bool noswap,
6621	pg_data_t *pgdat,
6622	unsigned long *nr_scanned)
6623	{
6624	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
6625	struct scan_control sc = {
6626	.nr_to_reclaim = SWAP_CLUSTER_MAX,
6627	.target_mem_cgroup = memcg,
6628	.may_writepage = !laptop_mode,
6629	.may_unmap = `1`,
6630	.reclaim_idx = MAX_NR_ZONES - `1`,
6631	.may_swap = !noswap,
6632	};
6633
6634	WARN_ON_ONCE(!current->reclaim_state);
6635
6636	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) \|
6637	(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
6638
6639	trace_mm_vmscan_memcg_softlimit_reclaim_begin(order: sc.order,
6640	gfp_flags: sc.gfp_mask);
6641
6642	/*
6643	* NOTE: Although we can get the priority field, using it
6644	* here is not a good idea, since it limits the pages we can scan.
6645	* if we don't reclaim here, the shrink_node from balance_pgdat
6646	* will pick up pages from other mem cgroup's as well. We hack
6647	* the priority and make it zero.
6648	*/
6649	shrink_lruvec(lruvec, sc: &sc);
6650
6651	trace_mm_vmscan_memcg_softlimit_reclaim_end(nr_reclaimed: sc.nr_reclaimed);
6652
6653	*nr_scanned = sc.nr_scanned;
6654
6655	return sc.nr_reclaimed;
6656	}
6657
6658	unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
6659	unsigned long nr_pages,
6660	gfp_t gfp_mask,
6661	unsigned int reclaim_options,
6662	int *swappiness)
6663	{
6664	unsigned long nr_reclaimed;
6665	unsigned int noreclaim_flag;
6666	struct scan_control sc = {
6667	.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
6668	.proactive_swappiness = swappiness,
6669	.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) \|
6670	(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
6671	.reclaim_idx = MAX_NR_ZONES - `1`,
6672	.target_mem_cgroup = memcg,
6673	.priority = DEF_PRIORITY,
6674	.may_writepage = !laptop_mode,
6675	.may_unmap = `1`,
6676	.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
6677	.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
6678	};
6679	/*
6680	* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
6681	* equal pressure on all the nodes. This is based on the assumption that
6682	* the reclaim does not bail out early.
6683	*/
6684	struct zonelist *zonelist = node_zonelist(nid: numa_node_id(), flags: sc.gfp_mask);
6685
6686	set_task_reclaim_state(current, rs: &sc.reclaim_state);
6687	trace_mm_vmscan_memcg_reclaim_begin(order: `0`, gfp_flags: sc.gfp_mask);
6688	noreclaim_flag = memalloc_noreclaim_save();
6689
6690	nr_reclaimed = do_try_to_free_pages(zonelist, sc: &sc);
6691
6692	memalloc_noreclaim_restore(flags: noreclaim_flag);
6693	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
6694	set_task_reclaim_state(current, NULL);
6695
6696	return nr_reclaimed;
6697	}
6698	#else
6699	unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
6700	unsigned long nr_pages,
6701	gfp_t gfp_mask,
6702	unsigned int reclaim_options,
6703	int *swappiness)
6704	{
6705	return `0`;
6706	}
6707	#endif
6708
6709	static void kswapd_age_node(struct pglist_data pgdat, struct* scan_control *sc)
6710	{
6711	struct mem_cgroup *memcg;
6712	struct lruvec *lruvec;
6713
6714	if (lru_gen_enabled()) {
6715	lru_gen_age_node(pgdat, sc);
6716	return;
6717	}
6718
6719	lruvec = mem_cgroup_lruvec(NULL, pgdat);
6720	if (!can_age_anon_pages(lruvec, sc))
6721	return;
6722
6723	if (!inactive_is_low(lruvec, inactive_lru: LRU_INACTIVE_ANON))
6724	return;
6725
6726	memcg = mem_cgroup_iter(NULL, NULL, NULL);
6727	do {
6728	lruvec = mem_cgroup_lruvec(memcg, pgdat);
6729	shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
6730	sc, lru: LRU_ACTIVE_ANON);
6731	memcg = mem_cgroup_iter(NULL, memcg, NULL);
6732	} while (memcg);
6733	}
6734
6735	static bool pgdat_watermark_boosted(pg_data_t pgdat, int* highest_zoneidx)
6736	{
6737	int i;
6738	struct zone *zone;
6739
6740	/*
6741	* Check for watermark boosts top-down as the higher zones
6742	* are more likely to be boosted. Both watermarks and boosts
6743	* should not be checked at the same time as reclaim would
6744	* start prematurely when there is no boosting and a lower
6745	* zone is balanced.
6746	*/
6747	for (i = highest_zoneidx; i >= `0`; i--) {
6748	zone = pgdat->node_zones + i;
6749	if (!managed_zone(zone))
6750	continue;
6751
6752	if (zone->watermark_boost)
6753	return true;
6754	}
6755
6756	return false;
6757	}
6758
6759	/*
6760	* Returns true if there is an eligible zone balanced for the request order
6761	* and highest_zoneidx
6762	*/
6763	static bool pgdat_balanced(pg_data_t pgdat, int* order, int highest_zoneidx)
6764	{
6765	int i;
6766	unsigned long mark = -`1`;
6767	struct zone *zone;
6768
6769	/*
6770	* Check watermarks bottom-up as lower zones are more likely to
6771	* meet watermarks.
6772	*/
6773	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
6774	enum zone_stat_item item;
6775	unsigned long free_pages;
6776
6777	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
6778	mark = promo_wmark_pages(z: zone);
6779	else
6780	mark = high_wmark_pages(z: zone);
6781
6782	/*
6783	* In defrag_mode, watermarks must be met in whole
6784	* blocks to avoid polluting allocator fallbacks.
6785	*
6786	* However, kswapd usually cannot accomplish this on
6787	* its own and needs kcompactd support. Once it's
6788	* reclaimed a compaction gap, and kswapd_shrink_node
6789	* has dropped order, simply ensure there are enough
6790	* base pages for compaction, wake kcompactd & sleep.
6791	*/
6792	if (defrag_mode && order)
6793	item = NR_FREE_PAGES_BLOCKS;
6794	else
6795	item = NR_FREE_PAGES;
6796
6797	/*
6798	* When there is a high number of CPUs in the system,
6799	* the cumulative error from the vmstat per-cpu cache
6800	* can blur the line between the watermarks. In that
6801	* case, be safe and get an accurate snapshot.
6802	*
6803	* TODO: NR_FREE_PAGES_BLOCKS moves in steps of
6804	* pageblock_nr_pages, while the vmstat pcp threshold
6805	* is limited to 125. On many configurations that
6806	* counter won't actually be per-cpu cached. But keep
6807	* things simple for now; revisit when somebody cares.
6808	*/
6809	free_pages = zone_page_state(zone, item);
6810	if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
6811	free_pages = zone_page_state_snapshot(zone, item);
6812
6813	if (__zone_watermark_ok(z: zone, order, mark, highest_zoneidx,
6814	alloc_flags: `0`, free_pages))
6815	return true;
6816	}
6817
6818	/*
6819	* If a node has no managed zone within highest_zoneidx, it does not
6820	* need balancing by definition. This can happen if a zone-restricted
6821	* allocation tries to wake a remote kswapd.
6822	*/
6823	if (mark == -`1`)
6824	return true;
6825
6826	return false;
6827	}
6828
6829	/ Clear pgdat state for congested, dirty or under writeback. /
6830	static void clear_pgdat_congested(pg_data_t *pgdat)
6831	{
6832	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
6833
6834	clear_bit(nr: LRUVEC_NODE_CONGESTED, addr: &lruvec->flags);
6835	clear_bit(nr: LRUVEC_CGROUP_CONGESTED, addr: &lruvec->flags);
6836	clear_bit(nr: PGDAT_WRITEBACK, addr: &pgdat->flags);
6837	}
6838
6839	/*
6840	* Prepare kswapd for sleeping. This verifies that there are no processes
6841	* waiting in throttle_direct_reclaim() and that watermarks have been met.
6842	*
6843	* Returns true if kswapd is ready to sleep
6844	*/
6845	static bool prepare_kswapd_sleep(pg_data_t pgdat, int* order,
6846	int highest_zoneidx)
6847	{
6848	/*
6849	* The throttled processes are normally woken up in balance_pgdat() as
6850	* soon as allow_direct_reclaim() is true. But there is a potential
6851	* race between when kswapd checks the watermarks and a process gets
6852	* throttled. There is also a potential race if processes get
6853	* throttled, kswapd wakes, a large process exits thereby balancing the
6854	* zones, which causes kswapd to exit balance_pgdat() before reaching
6855	* the wake up checks. If kswapd is going to sleep, no process should
6856	* be sleeping on pfmemalloc_wait, so wake them now if necessary. If
6857	* the wake up is premature, processes will wake kswapd and get
6858	* throttled again. The difference from wake ups in balance_pgdat() is
6859	* that here we are under prepare_to_wait().
6860	*/
6861	if (waitqueue_active(wq_head: &pgdat->pfmemalloc_wait))
6862	wake_up_all(&pgdat->pfmemalloc_wait);
6863
6864	/ Hopeless node, leave it to direct reclaim /
6865	if (atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
6866	return true;
6867
6868	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
6869	clear_pgdat_congested(pgdat);
6870	return true;
6871	}
6872
6873	return false;
6874	}
6875
6876	/*
6877	* kswapd shrinks a node of pages that are at or below the highest usable
6878	* zone that is currently unbalanced.
6879	*
6880	* Returns true if kswapd scanned at least the requested number of pages to
6881	* reclaim or if the lack of progress was due to pages under writeback.
6882	* This is used to determine if the scanning priority needs to be raised.
6883	*/
6884	static bool kswapd_shrink_node(pg_data_t *pgdat,
6885	struct scan_control *sc)
6886	{
6887	struct zone *zone;
6888	int z;
6889	unsigned long nr_reclaimed = sc->nr_reclaimed;
6890
6891	/ Reclaim a number of pages proportional to the number of zones /
6892	sc->nr_to_reclaim = `0`;
6893	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
6894	sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
6895	}
6896
6897	/*
6898	* Historically care was taken to put equal pressure on all zones but
6899	* now pressure is applied based on node LRU order.
6900	*/
6901	shrink_node(pgdat, sc);
6902
6903	/*
6904	* Fragmentation may mean that the system cannot be rebalanced for
6905	* high-order allocations. If twice the allocation size has been
6906	* reclaimed then recheck watermarks only at order-0 to prevent
6907	* excessive reclaim. Assume that a process requested a high-order
6908	* can direct reclaim/compact.
6909	*/
6910	if (sc->order && sc->nr_reclaimed >= compact_gap(order: sc->order))
6911	sc->order = `0`;
6912
6913	/ account for progress from mm_account_reclaimed_pages() /
6914	return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
6915	}
6916
6917	/ Page allocator PCP high watermark is lowered if reclaim is active. /
6918	static inline void
6919	update_reclaim_active(pg_data_t pgdat, int* highest_zoneidx, bool active)
6920	{
6921	int i;
6922	struct zone *zone;
6923
6924	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
6925	if (active)
6926	set_bit(nr: ZONE_RECLAIM_ACTIVE, addr: &zone->flags);
6927	else
6928	clear_bit(nr: ZONE_RECLAIM_ACTIVE, addr: &zone->flags);
6929	}
6930	}
6931
6932	static inline void
6933	set_reclaim_active(pg_data_t pgdat, int* highest_zoneidx)
6934	{
6935	update_reclaim_active(pgdat, highest_zoneidx, active: true);
6936	}
6937
6938	static inline void
6939	clear_reclaim_active(pg_data_t pgdat, int* highest_zoneidx)
6940	{
6941	update_reclaim_active(pgdat, highest_zoneidx, active: false);
6942	}
6943
6944	/*
6945	* For kswapd, balance_pgdat() will reclaim pages across a node from zones
6946	* that are eligible for use by the caller until at least one zone is
6947	* balanced.
6948	*
6949	* Returns the order kswapd finished reclaiming at.
6950	*
6951	* kswapd scans the zones in the highmem->normal->dma direction. It skips
6952	* zones which have free_pages > high_wmark_pages(zone), but once a zone is
6953	* found to have free_pages <= high_wmark_pages(zone), any page in that zone
6954	* or lower is eligible for reclaim until at least one usable zone is
6955	* balanced.
6956	*/
6957	static int balance_pgdat(pg_data_t pgdat, int* order, int highest_zoneidx)
6958	{
6959	int i;
6960	unsigned long nr_soft_reclaimed;
6961	unsigned long nr_soft_scanned;
6962	unsigned long pflags;
6963	unsigned long nr_boost_reclaim;
6964	unsigned long zone_boosts[MAX_NR_ZONES] = { `0`, };
6965	bool boosted;
6966	struct zone *zone;
6967	struct scan_control sc = {
6968	.gfp_mask = GFP_KERNEL,
6969	.order = order,
6970	.may_unmap = `1`,
6971	};
6972
6973	set_task_reclaim_state(current, rs: &sc.reclaim_state);
6974	psi_memstall_enter(flags: &pflags);
6975	__fs_reclaim_acquire(_THIS_IP_);
6976
6977	count_vm_event(item: PAGEOUTRUN);
6978
6979	/*
6980	* Account for the reclaim boost. Note that the zone boost is left in
6981	* place so that parallel allocations that are near the watermark will
6982	* stall or direct reclaim until kswapd is finished.
6983	*/
6984	nr_boost_reclaim = `0`;
6985	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
6986	nr_boost_reclaim += zone->watermark_boost;
6987	zone_boosts[i] = zone->watermark_boost;
6988	}
6989	boosted = nr_boost_reclaim;
6990
6991	restart:
6992	set_reclaim_active(pgdat, highest_zoneidx);
6993	sc.priority = DEF_PRIORITY;
6994	do {
6995	unsigned long nr_reclaimed = sc.nr_reclaimed;
6996	bool raise_priority = true;
6997	bool balanced;
6998	bool ret;
6999	bool was_frozen;
7000
7001	sc.reclaim_idx = highest_zoneidx;
7002
7003	/*
7004	* If the number of buffer_heads exceeds the maximum allowed
7005	* then consider reclaiming from all zones. This has a dual
7006	* purpose -- on 64-bit systems it is expected that
7007	* buffer_heads are stripped during active rotation. On 32-bit
7008	* systems, highmem pages can pin lowmem memory and shrinking
7009	* buffers can relieve lowmem pressure. Reclaim may still not
7010	* go ahead if all eligible zones for the original allocation
7011	* request are balanced to avoid excessive reclaim from kswapd.
7012	*/
7013	if (buffer_heads_over_limit) {
7014	for (i = MAX_NR_ZONES - `1`; i >= `0`; i--) {
7015	zone = pgdat->node_zones + i;
7016	if (!managed_zone(zone))
7017	continue;
7018
7019	sc.reclaim_idx = i;
7020	break;
7021	}
7022	}
7023
7024	/*
7025	* If the pgdat is imbalanced then ignore boosting and preserve
7026	* the watermarks for a later time and restart. Note that the
7027	* zone watermarks will be still reset at the end of balancing
7028	* on the grounds that the normal reclaim should be enough to
7029	* re-evaluate if boosting is required when kswapd next wakes.
7030	*/
7031	balanced = pgdat_balanced(pgdat, order: sc.order, highest_zoneidx);
7032	if (!balanced && nr_boost_reclaim) {
7033	nr_boost_reclaim = `0`;
7034	goto restart;
7035	}
7036
7037	/*
7038	* If boosting is not active then only reclaim if there are no
7039	* eligible zones. Note that sc.reclaim_idx is not used as
7040	* buffer_heads_over_limit may have adjusted it.
7041	*/
7042	if (!nr_boost_reclaim && balanced)
7043	goto out;
7044
7045	/ Limit the priority of boosting to avoid reclaim writeback /
7046	if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - `2`)
7047	raise_priority = false;
7048
7049	/*
7050	* Do not writeback or swap pages for boosted reclaim. The
7051	* intent is to relieve pressure not issue sub-optimal IO
7052	* from reclaim context. If no pages are reclaimed, the
7053	* reclaim will be aborted.
7054	*/
7055	sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
7056	sc.may_swap = !nr_boost_reclaim;
7057
7058	/*
7059	* Do some background aging, to give pages a chance to be
7060	* referenced before reclaiming. All pages are rotated
7061	* regardless of classzone as this is about consistent aging.
7062	*/
7063	kswapd_age_node(pgdat, sc: &sc);
7064
7065	/*
7066	* If we're getting trouble reclaiming, start doing writepage
7067	* even in laptop mode.
7068	*/
7069	if (sc.priority < DEF_PRIORITY - `2`)
7070	sc.may_writepage = `1`;
7071
7072	/ Call soft limit reclaim before calling shrink_node. /
7073	sc.nr_scanned = `0`;
7074	nr_soft_scanned = `0`;
7075	nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, order: sc.order,
7076	gfp_mask: sc.gfp_mask, total_scanned: &nr_soft_scanned);
7077	sc.nr_reclaimed += nr_soft_reclaimed;
7078
7079	/*
7080	* There should be no need to raise the scanning priority if
7081	* enough pages are already being scanned that that high
7082	* watermark would be met at 100% efficiency.
7083	*/
7084	if (kswapd_shrink_node(pgdat, sc: &sc))
7085	raise_priority = false;
7086
7087	/*
7088	* If the low watermark is met there is no need for processes
7089	* to be throttled on pfmemalloc_wait as they should not be
7090	* able to safely make forward progress. Wake them
7091	*/
7092	if (waitqueue_active(wq_head: &pgdat->pfmemalloc_wait) &&
7093	allow_direct_reclaim(pgdat))
7094	wake_up_all(&pgdat->pfmemalloc_wait);
7095
7096	/ Check if kswapd should be suspending /
7097	__fs_reclaim_release(_THIS_IP_);
7098	ret = kthread_freezable_should_stop(was_frozen: &was_frozen);
7099	__fs_reclaim_acquire(_THIS_IP_);
7100	if (was_frozen \|\| ret)
7101	break;
7102
7103	/*
7104	* Raise priority if scanning rate is too low or there was no
7105	* progress in reclaiming pages
7106	*/
7107	nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
7108	nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
7109
7110	/*
7111	* If reclaim made no progress for a boost, stop reclaim as
7112	* IO cannot be queued and it could be an infinite loop in
7113	* extreme circumstances.
7114	*/
7115	if (nr_boost_reclaim && !nr_reclaimed)
7116	break;
7117
7118	if (raise_priority \|\| !nr_reclaimed)
7119	sc.priority--;
7120	} while (sc.priority >= `1`);
7121
7122	/*
7123	* Restart only if it went through the priority loop all the way,
7124	* but cache_trim_mode didn't work.
7125	*/
7126	if (!sc.nr_reclaimed && sc.priority < `1` &&
7127	!sc.no_cache_trim_mode && sc.cache_trim_mode_failed) {
7128	sc.no_cache_trim_mode = `1`;
7129	goto restart;
7130	}
7131
7132	/*
7133	* If the reclaim was boosted, we might still be far from the
7134	* watermark_high at this point. We need to avoid increasing the
7135	* failure count to prevent the kswapd thread from stopping.
7136	*/
7137	if (!sc.nr_reclaimed && !boosted)
7138	atomic_inc(v: &pgdat->kswapd_failures);
7139
7140	out:
7141	clear_reclaim_active(pgdat, highest_zoneidx);
7142
7143	/ If reclaim was boosted, account for the reclaim done in this pass /
7144	if (boosted) {
7145	unsigned long flags;
7146
7147	for (i = `0`; i <= highest_zoneidx; i++) {
7148	if (!zone_boosts[i])
7149	continue;
7150
7151	/ Increments are under the zone lock /
7152	zone = pgdat->node_zones + i;
7153	spin_lock_irqsave(&zone->lock, flags);
7154	zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
7155	spin_unlock_irqrestore(lock: &zone->lock, flags);
7156	}
7157
7158	/*
7159	* As there is now likely space, wakeup kcompact to defragment
7160	* pageblocks.
7161	*/
7162	wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
7163	}
7164
7165	snapshot_refaults(NULL, pgdat);
7166	__fs_reclaim_release(_THIS_IP_);
7167	psi_memstall_leave(flags: &pflags);
7168	set_task_reclaim_state(current, NULL);
7169
7170	/*
7171	* Return the order kswapd stopped reclaiming at as
7172	* prepare_kswapd_sleep() takes it into account. If another caller
7173	* entered the allocator slow path while kswapd was awake, order will
7174	* remain at the higher level.
7175	*/
7176	return sc.order;
7177	}
7178
7179	/*
7180	* The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
7181	* be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
7182	* not a valid index then either kswapd runs for first time or kswapd couldn't
7183	* sleep after previous reclaim attempt (node is still unbalanced). In that
7184	* case return the zone index of the previous kswapd reclaim cycle.
7185	*/
7186	static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
7187	enum zone_type prev_highest_zoneidx)
7188	{
7189	enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7190
7191	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
7192	}
7193
7194	static void kswapd_try_to_sleep(pg_data_t pgdat, int* alloc_order, int reclaim_order,
7195	unsigned int highest_zoneidx)
7196	{
7197	long remaining = `0`;
7198	DEFINE_WAIT(wait);
7199
7200	if (freezing(current) \|\| kthread_should_stop())
7201	return;
7202
7203	prepare_to_wait(wq_head: &pgdat->kswapd_wait, wq_entry: &wait, TASK_INTERRUPTIBLE);
7204
7205	/*
7206	* Try to sleep for a short interval. Note that kcompactd will only be
7207	* woken if it is possible to sleep for a short interval. This is
7208	* deliberate on the assumption that if reclaim cannot keep an
7209	* eligible zone balanced that it's also unlikely that compaction will
7210	* succeed.
7211	*/
7212	if (prepare_kswapd_sleep(pgdat, order: reclaim_order, highest_zoneidx)) {
7213	/*
7214	* Compaction records what page blocks it recently failed to
7215	* isolate pages from and skips them in the future scanning.
7216	* When kswapd is going to sleep, it is reasonable to assume
7217	* that pages and compaction may succeed so reset the cache.
7218	*/
7219	reset_isolation_suitable(pgdat);
7220
7221	/*
7222	* We have freed the memory, now we should compact it to make
7223	* allocation of the requested order possible.
7224	*/
7225	wakeup_kcompactd(pgdat, order: alloc_order, highest_zoneidx);
7226
7227	remaining = schedule_timeout(HZ/`10`);
7228
7229	/*
7230	* If woken prematurely then reset kswapd_highest_zoneidx and
7231	* order. The values will either be from a wakeup request or
7232	* the previous request that slept prematurely.
7233	*/
7234	if (remaining) {
7235	WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
7236	kswapd_highest_zoneidx(pgdat,
7237	highest_zoneidx));
7238
7239	if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
7240	WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
7241	}
7242
7243	finish_wait(wq_head: &pgdat->kswapd_wait, wq_entry: &wait);
7244	prepare_to_wait(wq_head: &pgdat->kswapd_wait, wq_entry: &wait, TASK_INTERRUPTIBLE);
7245	}
7246
7247	/*
7248	* After a short sleep, check if it was a premature sleep. If not, then
7249	* go fully to sleep until explicitly woken up.
7250	*/
7251	if (!remaining &&
7252	prepare_kswapd_sleep(pgdat, order: reclaim_order, highest_zoneidx)) {
7253	trace_mm_vmscan_kswapd_sleep(nid: pgdat->node_id);
7254
7255	/*
7256	* vmstat counters are not perfectly accurate and the estimated
7257	* value for counters such as NR_FREE_PAGES can deviate from the
7258	* true value by nr_online_cpus * threshold. To avoid the zone
7259	* watermarks being breached while under pressure, we reduce the
7260	* per-cpu vmstat threshold while kswapd is awake and restore
7261	* them before going back to sleep.
7262	*/
7263	set_pgdat_percpu_threshold(pgdat, calculate_pressure: calculate_normal_threshold);
7264
7265	if (!kthread_should_stop())
7266	schedule();
7267
7268	set_pgdat_percpu_threshold(pgdat, calculate_pressure: calculate_pressure_threshold);
7269	} else {
7270	if (remaining)
7271	count_vm_event(item: KSWAPD_LOW_WMARK_HIT_QUICKLY);
7272	else
7273	count_vm_event(item: KSWAPD_HIGH_WMARK_HIT_QUICKLY);
7274	}
7275	finish_wait(wq_head: &pgdat->kswapd_wait, wq_entry: &wait);
7276	}
7277
7278	/*
7279	* The background pageout daemon, started as a kernel thread
7280	* from the init process.
7281	*
7282	* This basically trickles out pages so that we have _some_
7283	* free memory available even if there is no other activity
7284	* that frees anything up. This is needed for things like routing
7285	* etc, where we otherwise might have all activity going on in
7286	* asynchronous contexts that cannot page things out.
7287	*
7288	* If there are applications that are active memory-allocators
7289	* (most normal use), this basically shouldn't matter.
7290	*/
7291	static int kswapd(void *p)
7292	{
7293	unsigned int alloc_order, reclaim_order;
7294	unsigned int highest_zoneidx = MAX_NR_ZONES - `1`;
7295	pg_data_t pgdat = (pg_data_t )p;
7296	struct task_struct *tsk = current;
7297
7298	/*
7299	* Tell the memory management that we're a "memory allocator",
7300	* and that if we need more memory we should get access to it
7301	* regardless (see "__alloc_pages()"). "kswapd" should
7302	* never get caught in the normal page freeing logic.
7303	*
7304	* (Kswapd normally doesn't need memory anyway, but sometimes
7305	* you need a small amount of memory in order to be able to
7306	* page out something else, and this flag essentially protects
7307	* us from recursively trying to free more memory as we're
7308	* trying to free the first piece of memory in the first place).
7309	*/
7310	tsk->flags \|= PF_MEMALLOC \| PF_KSWAPD;
7311	set_freezable();
7312
7313	WRITE_ONCE(pgdat->kswapd_order, `0`);
7314	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7315	atomic_set(v: &pgdat->nr_writeback_throttled, i: `0`);
7316	for ( ; ; ) {
7317	bool was_frozen;
7318
7319	alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
7320	highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7321	prev_highest_zoneidx: highest_zoneidx);
7322
7323	kswapd_try_sleep:
7324	kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
7325	highest_zoneidx);
7326
7327	/ Read the new order and highest_zoneidx /
7328	alloc_order = READ_ONCE(pgdat->kswapd_order);
7329	highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7330	prev_highest_zoneidx: highest_zoneidx);
7331	WRITE_ONCE(pgdat->kswapd_order, `0`);
7332	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7333
7334	if (kthread_freezable_should_stop(was_frozen: &was_frozen))
7335	break;
7336
7337	/*
7338	* We can speed up thawing tasks if we don't call balance_pgdat
7339	* after returning from the refrigerator
7340	*/
7341	if (was_frozen)
7342	continue;
7343
7344	/*
7345	* Reclaim begins at the requested order but if a high-order
7346	* reclaim fails then kswapd falls back to reclaiming for
7347	* order-0. If that happens, kswapd will consider sleeping
7348	* for the order it finished reclaiming at (reclaim_order)
7349	* but kcompactd is woken to compact for the original
7350	* request (alloc_order).
7351	*/
7352	trace_mm_vmscan_kswapd_wake(nid: pgdat->node_id, zid: highest_zoneidx,
7353	order: alloc_order);
7354	reclaim_order = balance_pgdat(pgdat, order: alloc_order,
7355	highest_zoneidx);
7356	if (reclaim_order < alloc_order)
7357	goto kswapd_try_sleep;
7358	}
7359
7360	tsk->flags &= ~(PF_MEMALLOC \| PF_KSWAPD);
7361
7362	return `0`;
7363	}
7364
7365	/*
7366	* A zone is low on free memory or too fragmented for high-order memory. If
7367	* kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
7368	* pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
7369	* has failed or is not needed, still wake up kcompactd if only compaction is
7370	* needed.
7371	*/
7372	void wakeup_kswapd(struct zone zone, gfp_t gfp_flags, int* order,
7373	enum zone_type highest_zoneidx)
7374	{
7375	pg_data_t *pgdat;
7376	enum zone_type curr_idx;
7377
7378	if (!managed_zone(zone))
7379	return;
7380
7381	if (!cpuset_zone_allowed(z: zone, gfp_mask: gfp_flags))
7382	return;
7383
7384	pgdat = zone->zone_pgdat;
7385	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7386
7387	if (curr_idx == MAX_NR_ZONES \|\| curr_idx < highest_zoneidx)
7388	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
7389
7390	if (READ_ONCE(pgdat->kswapd_order) < order)
7391	WRITE_ONCE(pgdat->kswapd_order, order);
7392
7393	if (!waitqueue_active(wq_head: &pgdat->kswapd_wait))
7394	return;
7395
7396	/ Hopeless node, leave it to direct reclaim if possible /
7397	if (atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES \|\|
7398	(pgdat_balanced(pgdat, order, highest_zoneidx) &&
7399	!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
7400	/*
7401	* There may be plenty of free memory available, but it's too
7402	* fragmented for high-order allocations. Wake up kcompactd
7403	* and rely on compaction_suitable() to determine if it's
7404	* needed. If it fails, it will defer subsequent attempts to
7405	* ratelimit its work.
7406	*/
7407	if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
7408	wakeup_kcompactd(pgdat, order, highest_zoneidx);
7409	return;
7410	}
7411
7412	trace_mm_vmscan_wakeup_kswapd(nid: pgdat->node_id, zid: highest_zoneidx, order,
7413	gfp_flags);
7414	wake_up_interruptible(&pgdat->kswapd_wait);
7415	}
7416
#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
	struct scan_control sc = {
		.nr_to_reclaim = nr_to_reclaim,
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.priority = DEF_PRIORITY,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
		.hibernation_mode = 1,
	};
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;

	/* Setup calls below are undone in reverse order after reclaim. */
	fs_reclaim_acquire(sc.gfp_mask);
	noreclaim_flag = memalloc_noreclaim_save();
	set_task_reclaim_state(current, &sc.reclaim_state);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	set_task_reclaim_state(current, NULL);
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(sc.gfp_mask);

	return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */
7455
7456	/*
7457	* This kswapd start function will be called by init and node-hot-add.
7458	*/
7459	void __meminit kswapd_run(int nid)
7460	{
7461	pg_data_t *pgdat = NODE_DATA(nid);
7462
7463	pgdat_kswapd_lock(pgdat);
7464	if (!pgdat->kswapd) {
7465	pgdat->kswapd = kthread_create_on_node(threadfn: kswapd, data: pgdat, node: nid, namefmt: "kswapd%d", nid);
7466	if (IS_ERR(ptr: pgdat->kswapd)) {
7467	/ failure at boot is fatal /
7468	pr_err("Failed to start kswapd on node %d，ret=%ld\n",
7469	nid, PTR_ERR(pgdat->kswapd));
7470	BUG_ON(system_state < SYSTEM_RUNNING);
7471	pgdat->kswapd = NULL;
7472	} else {
7473	wake_up_process(tsk: pgdat->kswapd);
7474	}
7475	}
7476	pgdat_kswapd_unlock(pgdat);
7477	}
7478
7479	/*
7480	* Called by memory hotplug when all memory in a node is offlined. Caller must
7481	* be holding mem_hotplug_begin/done().
7482	*/
7483	void __meminit kswapd_stop(int nid)
7484	{
7485	pg_data_t *pgdat = NODE_DATA(nid);
7486	struct task_struct *kswapd;
7487
7488	pgdat_kswapd_lock(pgdat);
7489	kswapd = pgdat->kswapd;
7490	if (kswapd) {
7491	kthread_stop(k: kswapd);
7492	pgdat->kswapd = NULL;
7493	}
7494	pgdat_kswapd_unlock(pgdat);
7495	}
7496
7497	static const struct ctl_table vmscan_sysctl_table[] = {
7498	{
7499	.procname = "swappiness",
7500	.data = &vm_swappiness,
7501	.maxlen = sizeof(vm_swappiness),
7502	.mode = `0644`,
7503	.proc_handler = proc_dointvec_minmax,
7504	.extra1 = SYSCTL_ZERO,
7505	.extra2 = SYSCTL_TWO_HUNDRED,
7506	},
7507	#ifdef CONFIG_NUMA
7508	{
7509	.procname = "zone_reclaim_mode",
7510	.data = &node_reclaim_mode,
7511	.maxlen = sizeof(node_reclaim_mode),
7512	.mode = `0644`,
7513	.proc_handler = proc_dointvec_minmax,
7514	.extra1 = SYSCTL_ZERO,
7515	}
7516	#endif
7517	};
7518
7519	static int __init kswapd_init(void)
7520	{
7521	int nid;
7522
7523	swap_setup();
7524	for_each_node_state(nid, N_MEMORY)
7525	kswapd_run(nid);
7526	register_sysctl_init("vm", vmscan_sysctl_table);
7527	return `0`;
7528	}
7529
7530	module_init(kswapd_init)
7531
7532	#ifdef CONFIG_NUMA
7533	/*
7534	* Node reclaim mode
7535	*
7536	* If non-zero call node_reclaim when the number of free pages falls below
7537	* the watermarks.
7538	*/
7539	int node_reclaim_mode __read_mostly;
7540
7541	/*
7542	* Priority for NODE_RECLAIM. This determines the fraction of pages
7543	* of a node considered for each zone_reclaim. 4 scans 1/16th of
7544	* a zone.
7545	*/
7546	#define NODE_RECLAIM_PRIORITY 4
7547
7548	/*
7549	* Percentage of pages in a zone that must be unmapped for node_reclaim to
7550	* occur.
7551	*/
7552	int sysctl_min_unmapped_ratio = `1`;
7553
7554	/*
7555	* If the number of slab pages in a zone grows beyond this percentage then
7556	* slab reclaim needs to occur.
7557	*/
7558	int sysctl_min_slab_ratio = `5`;
7559
7560	static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
7561	{
7562	unsigned long file_mapped = node_page_state(pgdat, item: NR_FILE_MAPPED);
7563	unsigned long file_lru = node_page_state(pgdat, item: NR_INACTIVE_FILE) +
7564	node_page_state(pgdat, item: NR_ACTIVE_FILE);
7565
7566	/*
7567	* It's possible for there to be more file mapped pages than
7568	* accounted for by the pages on the file LRU lists because
7569	* tmpfs pages accounted for as ANON can also be FILE_MAPPED
7570	*/
7571	return (file_lru > file_mapped) ? (file_lru - file_mapped) : `0`;
7572	}
7573
7574	/ Work out how many page cache pages we can reclaim in this reclaim_mode /
7575	static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
7576	{
7577	unsigned long nr_pagecache_reclaimable;
7578	unsigned long delta = `0`;
7579
7580	/*
7581	* If RECLAIM_UNMAP is set, then all file pages are considered
7582	* potentially reclaimable. Otherwise, we have to worry about
7583	* pages like swapcache and node_unmapped_file_pages() provides
7584	* a better estimate
7585	*/
7586	if (node_reclaim_mode & RECLAIM_UNMAP)
7587	nr_pagecache_reclaimable = node_page_state(pgdat, item: NR_FILE_PAGES);
7588	else
7589	nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
7590
7591	/*
7592	* Since we can't clean folios through reclaim, remove dirty file
7593	* folios from consideration.
7594	*/
7595	delta += node_page_state(pgdat, item: NR_FILE_DIRTY);
7596
7597	/ Watch for any possible underflows due to delta /
7598	if (unlikely(delta > nr_pagecache_reclaimable))
7599	delta = nr_pagecache_reclaimable;
7600
7601	return nr_pagecache_reclaimable - delta;
7602	}
7603
7604	/*
7605	* Try to free up some pages from this node through reclaim.
7606	*/
7607	static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
7608	unsigned long nr_pages,
7609	struct scan_control *sc)
7610	{
7611	struct task_struct *p = current;
7612	unsigned int noreclaim_flag;
7613	unsigned long pflags;
7614
7615	trace_mm_vmscan_node_reclaim_begin(nid: pgdat->node_id, order: sc->order,
7616	gfp_flags: sc->gfp_mask);
7617
7618	cond_resched();
7619	psi_memstall_enter(flags: &pflags);
7620	delayacct_freepages_start();
7621	fs_reclaim_acquire(gfp_mask: sc->gfp_mask);
7622	/*
7623	* We need to be able to allocate from the reserves for RECLAIM_UNMAP
7624	*/
7625	noreclaim_flag = memalloc_noreclaim_save();
7626	set_task_reclaim_state(task: p, rs: &sc->reclaim_state);
7627
7628	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages \|\|
7629	node_page_state_pages(pgdat, item: NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
7630	/*
7631	* Free memory by calling shrink node with increasing
7632	* priorities until we have enough memory freed.
7633	*/
7634	do {
7635	shrink_node(pgdat, sc);
7636	} while (sc->nr_reclaimed < nr_pages && --sc->priority >= `0`);
7637	}
7638
7639	set_task_reclaim_state(task: p, NULL);
7640	memalloc_noreclaim_restore(flags: noreclaim_flag);
7641	fs_reclaim_release(gfp_mask: sc->gfp_mask);
7642	delayacct_freepages_end();
7643	psi_memstall_leave(flags: &pflags);
7644
7645	trace_mm_vmscan_node_reclaim_end(nr_reclaimed: sc->nr_reclaimed);
7646
7647	return sc->nr_reclaimed;
7648	}
7649
7650	int node_reclaim(struct pglist_data pgdat, gfp_t gfp_mask, unsigned* int order)
7651	{
7652	int ret;
7653	/ Minimum pages needed in order to stay on node /
7654	const unsigned long nr_pages = `1` << order;
7655	struct scan_control sc = {
7656	.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
7657	.gfp_mask = current_gfp_context(flags: gfp_mask),
7658	.order = order,
7659	.priority = NODE_RECLAIM_PRIORITY,
7660	.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
7661	.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
7662	.may_swap = `1`,
7663	.reclaim_idx = gfp_zone(flags: gfp_mask),
7664	};
7665
7666	/*
7667	* Node reclaim reclaims unmapped file backed pages and
7668	* slab pages if we are over the defined limits.
7669	*
7670	* A small portion of unmapped file backed pages is needed for
7671	* file I/O otherwise pages read by file I/O will be immediately
7672	* thrown out if the node is overallocated. So we do not reclaim
7673	* if less than a specified percentage of the node is used by
7674	* unmapped file backed pages.
7675	*/
7676	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
7677	node_page_state_pages(pgdat, item: NR_SLAB_RECLAIMABLE_B) <=
7678	pgdat->min_slab_pages)
7679	return NODE_RECLAIM_FULL;
7680
7681	/*
7682	* Do not scan if the allocation should not be delayed.
7683	*/
7684	if (!gfpflags_allow_blocking(gfp_flags: gfp_mask) \|\| (current->flags & PF_MEMALLOC))
7685	return NODE_RECLAIM_NOSCAN;
7686
7687	/*
7688	* Only run node reclaim on the local node or on nodes that do not
7689	* have associated processors. This will favor the local processor
7690	* over remote processors and spread off node memory allocations
7691	* as wide as possible.
7692	*/
7693	if (node_state(node: pgdat->node_id, state: N_CPU) && pgdat->node_id != numa_node_id())
7694	return NODE_RECLAIM_NOSCAN;
7695
7696	if (test_and_set_bit_lock(nr: PGDAT_RECLAIM_LOCKED, addr: &pgdat->flags))
7697	return NODE_RECLAIM_NOSCAN;
7698
7699	ret = __node_reclaim(pgdat, gfp_mask, nr_pages, sc: &sc) >= nr_pages;
7700	clear_bit_unlock(nr: PGDAT_RECLAIM_LOCKED, addr: &pgdat->flags);
7701
7702	if (ret)
7703	count_vm_event(item: PGSCAN_ZONE_RECLAIM_SUCCESS);
7704	else
7705	count_vm_event(item: PGSCAN_ZONE_RECLAIM_FAILED);
7706
7707	return ret;
7708	}
7709
7710	#else
7711
7712	static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
7713	unsigned long nr_pages,
7714	struct scan_control *sc)
7715	{
7716	return `0`;
7717	}
7718
7719	#endif
7720
7721	enum {
7722	MEMORY_RECLAIM_SWAPPINESS = `0`,
7723	MEMORY_RECLAIM_SWAPPINESS_MAX,
7724	MEMORY_RECLAIM_NULL,
7725	};
7726	static const match_table_t tokens = {
7727	{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
7728	{ MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
7729	{ MEMORY_RECLAIM_NULL, NULL },
7730	};
7731
7732	int user_proactive_reclaim(char *buf,
7733	struct mem_cgroup memcg, pg_data_t pgdat)
7734	{
7735	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
7736	unsigned long nr_to_reclaim, nr_reclaimed = `0`;
7737	int swappiness = -`1`;
7738	char old_buf, start;
7739	substring_t args[MAX_OPT_ARGS];
7740	gfp_t gfp_mask = GFP_KERNEL;
7741
7742	if (!buf \|\| (!memcg && !pgdat) \|\| (memcg && pgdat))
7743	return -EINVAL;
7744
7745	buf = strstrip(str: buf);
7746
7747	old_buf = buf;
7748	nr_to_reclaim = memparse(ptr: buf, retptr: &buf) / PAGE_SIZE;
7749	if (buf == old_buf)
7750	return -EINVAL;
7751
7752	buf = strstrip(str: buf);
7753
7754	while ((start = strsep(&buf, " ")) != NULL) {
7755	if (!strlen(start))
7756	continue;
7757	switch (match_token(start, table: tokens, args)) {
7758	case MEMORY_RECLAIM_SWAPPINESS:
7759	if (match_int(&args[`0`], result: &swappiness))
7760	return -EINVAL;
7761	if (swappiness < MIN_SWAPPINESS \|\|
7762	swappiness > MAX_SWAPPINESS)
7763	return -EINVAL;
7764	break;
7765	case MEMORY_RECLAIM_SWAPPINESS_MAX:
7766	swappiness = SWAPPINESS_ANON_ONLY;
7767	break;
7768	default:
7769	return -EINVAL;
7770	}
7771	}
7772
7773	while (nr_reclaimed < nr_to_reclaim) {
7774	/ Will converge on zero, but reclaim enforces a minimum /
7775	unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / `4`;
7776	unsigned long reclaimed;
7777
7778	if (signal_pending(current))
7779	return -EINTR;
7780
7781	/*
7782	* This is the final attempt, drain percpu lru caches in the
7783	* hope of introducing more evictable pages.
7784	*/
7785	if (!nr_retries)
7786	lru_add_drain_all();
7787
7788	if (memcg) {
7789	unsigned int reclaim_options;
7790
7791	reclaim_options = MEMCG_RECLAIM_MAY_SWAP \|
7792	MEMCG_RECLAIM_PROACTIVE;
7793	reclaimed = try_to_free_mem_cgroup_pages(memcg,
7794	nr_pages: batch_size, gfp_mask,
7795	reclaim_options,
7796	swappiness: swappiness == -`1` ? NULL : &swappiness);
7797	} else {
7798	struct scan_control sc = {
7799	.gfp_mask = current_gfp_context(flags: gfp_mask),
7800	.reclaim_idx = gfp_zone(flags: gfp_mask),
7801	.proactive_swappiness = swappiness == -`1` ? NULL : &swappiness,
7802	.priority = DEF_PRIORITY,
7803	.may_writepage = !laptop_mode,
7804	.nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
7805	.may_unmap = `1`,
7806	.may_swap = `1`,
7807	.proactive = `1`,
7808	};
7809
7810	if (test_and_set_bit_lock(nr: PGDAT_RECLAIM_LOCKED,
7811	addr: &pgdat->flags))
7812	return -EBUSY;
7813
7814	reclaimed = __node_reclaim(pgdat, gfp_mask,
7815	nr_pages: batch_size, sc: &sc);
7816	clear_bit_unlock(nr: PGDAT_RECLAIM_LOCKED, addr: &pgdat->flags);
7817	}
7818
7819	if (!reclaimed && !nr_retries--)
7820	return -EAGAIN;
7821
7822	nr_reclaimed += reclaimed;
7823	}
7824
7825	return `0`;
7826	}
7827
7828	/**
7829	* check_move_unevictable_folios - Move evictable folios to appropriate zone
7830	* lru list
7831	* @fbatch: Batch of lru folios to check.
7832	*
7833	* Checks folios for evictability, if an evictable folio is in the unevictable
7834	* lru list, moves it to the appropriate evictable lru list. This function
7835	* should be only used for lru folios.
7836	*/
7837	void check_move_unevictable_folios(struct folio_batch *fbatch)
7838	{
7839	struct lruvec *lruvec = NULL;
7840	int pgscanned = `0`;
7841	int pgrescued = `0`;
7842	int i;
7843
7844	for (i = `0`; i < fbatch->nr; i++) {
7845	struct folio *folio = fbatch->folios[i];
7846	int nr_pages = folio_nr_pages(folio);
7847
7848	pgscanned += nr_pages;
7849
7850	/ block memcg migration while the folio moves between lrus /
7851	if (!folio_test_clear_lru(folio))
7852	continue;
7853
7854	lruvec = folio_lruvec_relock_irq(folio, locked_lruvec: lruvec);
7855	if (folio_evictable(folio) && folio_test_unevictable(folio)) {
7856	lruvec_del_folio(lruvec, folio);
7857	folio_clear_unevictable(folio);
7858	lruvec_add_folio(lruvec, folio);
7859	pgrescued += nr_pages;
7860	}
7861	folio_set_lru(folio);
7862	}
7863
7864	if (lruvec) {
7865	__count_vm_events(item: UNEVICTABLE_PGRESCUED, delta: pgrescued);
7866	__count_vm_events(item: UNEVICTABLE_PGSCANNED, delta: pgscanned);
7867	unlock_page_lruvec_irq(lruvec);
7868	} else if (pgscanned) {
7869	count_vm_events(item: UNEVICTABLE_PGSCANNED, delta: pgscanned);
7870	}
7871	}
7872	EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
7873
7874	#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
7875	static ssize_t reclaim_store(struct device *dev,
7876	struct device_attribute *attr,
7877	const char *buf, size_t count)
7878	{
7879	int ret, nid = dev->id;
7880
7881	ret = user_proactive_reclaim(buf: (char *)buf, NULL, NODE_DATA(nid));
7882	return ret ? -EAGAIN : count;
7883	}
7884
7885	static DEVICE_ATTR_WO(reclaim);
7886	int reclaim_register_node(struct node *node)
7887	{
7888	return device_create_file(device: &node->dev, entry: &dev_attr_reclaim);
7889	}
7890
7891	void reclaim_unregister_node(struct node *node)
7892	{
7893	return device_remove_file(dev: &node->dev, attr: &dev_attr_reclaim);
7894	}
7895	#endif
7896

/* source code of linux/mm/vmscan.c */