mirror of https://gitee.com/openkylin/linux.git
mm: vmscan: do not throttle based on pfmemalloc reserves if node has no ZONE_NORMAL
throttle_direct_reclaim() is meant to trigger during swap-over-network during which the min watermark is treated as a pfmemalloc reserve. It throttles on the first node in the zonelist but this is flawed. The user-visible impact is that a process running on a CPU whose local memory node has no ZONE_NORMAL will stall for prolonged periods of time, possibly indefinitely. This is due to throttle_direct_reclaim() thinking the pfmemalloc reserves are depleted when in fact they don't exist on that node. On a NUMA machine running a 32-bit kernel (I know) allocation requests from CPUs on node 1 would detect no pfmemalloc reserves and the process gets throttled. This patch adjusts throttling of direct reclaim to throttle based on the first node in the zonelist that has a usable ZONE_NORMAL or lower zone. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
f98bafa06a
commit
675becce15
43
mm/vmscan.c
43
mm/vmscan.c
|
@ -2537,10 +2537,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
|
||||||
|
|
||||||
for (i = 0; i <= ZONE_NORMAL; i++) {
|
for (i = 0; i <= ZONE_NORMAL; i++) {
|
||||||
zone = &pgdat->node_zones[i];
|
zone = &pgdat->node_zones[i];
|
||||||
|
if (!populated_zone(zone))
|
||||||
|
continue;
|
||||||
|
|
||||||
pfmemalloc_reserve += min_wmark_pages(zone);
|
pfmemalloc_reserve += min_wmark_pages(zone);
|
||||||
free_pages += zone_page_state(zone, NR_FREE_PAGES);
|
free_pages += zone_page_state(zone, NR_FREE_PAGES);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If there are no reserves (unexpected config) then do not throttle */
|
||||||
|
if (!pfmemalloc_reserve)
|
||||||
|
return true;
|
||||||
|
|
||||||
wmark_ok = free_pages > pfmemalloc_reserve / 2;
|
wmark_ok = free_pages > pfmemalloc_reserve / 2;
|
||||||
|
|
||||||
/* kswapd must be awake if processes are being throttled */
|
/* kswapd must be awake if processes are being throttled */
|
||||||
|
@ -2565,9 +2572,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
|
||||||
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
|
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
|
||||||
nodemask_t *nodemask)
|
nodemask_t *nodemask)
|
||||||
{
|
{
|
||||||
|
struct zoneref *z;
|
||||||
struct zone *zone;
|
struct zone *zone;
|
||||||
int high_zoneidx = gfp_zone(gfp_mask);
|
pg_data_t *pgdat = NULL;
|
||||||
pg_data_t *pgdat;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Kernel threads should not be throttled as they may be indirectly
|
* Kernel threads should not be throttled as they may be indirectly
|
||||||
|
@ -2586,10 +2593,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
|
||||||
if (fatal_signal_pending(current))
|
if (fatal_signal_pending(current))
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
/* Check if the pfmemalloc reserves are ok */
|
/*
|
||||||
first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
|
* Check if the pfmemalloc reserves are ok by finding the first node
|
||||||
pgdat = zone->zone_pgdat;
|
* with a usable ZONE_NORMAL or lower zone. The expectation is that
|
||||||
if (pfmemalloc_watermark_ok(pgdat))
|
* GFP_KERNEL will be required for allocating network buffers when
|
||||||
|
* swapping over the network so ZONE_HIGHMEM is unusable.
|
||||||
|
*
|
||||||
|
* Throttling is based on the first usable node and throttled processes
|
||||||
|
* wait on a queue until kswapd makes progress and wakes them. There
|
||||||
|
* is an affinity then between processes waking up and where reclaim
|
||||||
|
* progress has been made assuming the process wakes on the same node.
|
||||||
|
* More importantly, processes running on remote nodes will not compete
|
||||||
|
* for remote pfmemalloc reserves and processes on different nodes
|
||||||
|
* should make reasonable progress.
|
||||||
|
*/
|
||||||
|
for_each_zone_zonelist_nodemask(zone, z, zonelist,
|
||||||
|
gfp_mask, nodemask) {
|
||||||
|
if (zone_idx(zone) > ZONE_NORMAL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Throttle based on the first usable node */
|
||||||
|
pgdat = zone->zone_pgdat;
|
||||||
|
if (pfmemalloc_watermark_ok(pgdat))
|
||||||
|
goto out;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If no zone was usable by the allocation flags then do not throttle */
|
||||||
|
if (!pgdat)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
/* Account for the throttling */
|
/* Account for the throttling */
|
||||||
|
|
Loading…
Reference in New Issue