The following commit has been merged in the master branch:

commit 5b200f578960a9635918a0ed41be3d8dc90186bf
Merge: 3db1a3fa98808aa90f95ec3e0fa2fc7abf28f5c9 15b447361794271f4d03c04d82276a841fe06328
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue Dec 15 14:55:10 2020 -0800
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
 "More MM work: a memcg scalability improvement"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/lru: revise the comments of lru_lock
  mm/lru: introduce relock_page_lruvec()
  mm/lru: replace pgdat lru_lock with lruvec lock
  mm/swap.c: serialize memcg changes in pagevec_lru_move_fn
  mm/compaction: do page isolation first in compaction
  mm/lru: introduce TestClearPageLRU()
  mm/mlock: remove __munlock_isolate_lru_page()
  mm/mlock: remove lru_lock on TestClearPageMlocked
  mm/vmscan: remove lruvec reget in move_pages_to_lru
  mm/lru: move lock into lru_note_cost
  mm/swap.c: fold vm event PGROTATED into pagevec_move_tail_fn
  mm/memcg: add debug checking in lock_page_memcg
  mm: page_idle_get_page() does not need lru_lock
  mm/rmap: stop store reordering issue on page->mapping
  mm/vmscan: remove unnecessary lruvec adding
  mm/thp: narrow lru locking
  mm/thp: simplify lru_add_page_tail()
  mm/thp: use head for head page in lru_add_page_tail()
  mm/thp: move lru_add_page_tail() to huge_memory.c
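
The core of the series replaces the node-wide pgdat->lru_lock with a spinlock embedded in each lruvec, so LRU operations in different memory cgroups stop contending on a single per-node lock. A minimal sketch of the before/after locking pattern, using only the helpers added in the diff below (illustration only, not taken from the patches; the caller must still satisfy the stability conditions documented at lock_page_lruvec()):

    /*
     * Illustration only: converting a caller from the per-node lock
     * to the per-lruvec lock.
     */
    static void lru_del_sketch(struct page *page)
    {
            struct lruvec *lruvec;

            /* Old scheme: every memcg on the node serialized on one lock */
            /*   spin_lock_irq(&page_pgdat(page)->lru_lock);   */
            /*   list_del(&page->lru);                          */
            /*   spin_unlock_irq(&page_pgdat(page)->lru_lock);  */

            /* New scheme: lock only the lruvec the page belongs to */
            lruvec = lock_page_lruvec_irq(page);
            list_del(&page->lru);
            unlock_page_lruvec_irq(lruvec);
    }
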
diff --combined include/linux/memcontrol.h index f5b4d710f099,ff02f831e7e1..08ed57e02b73 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@@ -337,175 -337,6 +337,175 @@@ struct mem_cgroup
extern struct mem_cgroup *root_mem_cgroup;
+enum page_memcg_data_flags { + /* page->memcg_data is a pointer to an objcgs vector */ + MEMCG_DATA_OBJCGS = (1UL << 0), + /* page has been accounted as a non-slab kernel page */ + MEMCG_DATA_KMEM = (1UL << 1), + /* the next bit after the last actual flag */ + __NR_MEMCG_DATA_FLAGS = (1UL << 2), +}; + +#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) + +/* + * page_memcg - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * page_memcg_rcu - locklessly get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + */ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + WARN_ON_ONCE(!rcu_read_lock_held()); + + return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & + ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * page_memcg_check - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function unlike page_memcg() can take any page + * as an argument. It has to be used in cases when it's not known if a page + * has an associated memory cgroup pointer or an object cgroups vector. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + /* + * Because page->memcg_data might be changed asynchronously + * for slab pages, READ_ONCE() should be used here. + */ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + if (memcg_data & MEMCG_DATA_OBJCGS) + return NULL; + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * PageMemcgKmem - check if the page has MemcgKmem flag set + * @page: a pointer to the page struct + * + * Checks if the page has MemcgKmem flag set. The caller must ensure that + * the page has an associated memory cgroup. It's not safe to call this function + * against some types of pages, e.g. slab pages. + */ +static inline bool PageMemcgKmem(struct page *page) +{ + VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); + return page->memcg_data & MEMCG_DATA_KMEM; +} + +#ifdef CONFIG_MEMCG_KMEM +/* + * page_objcgs - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. 
This function assumes that the page is known to have an + * associated object cgroups vector. It's not safe to call this function + * against pages, which might have an associated memory cgroup: e.g. + * kernel stack pages. + */ +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * page_objcgs_check - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function is safe to use if the page can be directly associated + * with a memory cgroup. + */ +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) + return NULL; + + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * set_page_objcgs - associate a page with a object cgroups vector + * @page: a pointer to the page struct + * @objcgs: a pointer to the object cgroups vector + * + * Atomically associates a page with a vector of object cgroups. + */ +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | + MEMCG_DATA_OBJCGS); +} +#else +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + return NULL; +} + +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + return NULL; +} + +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return true; +} +#endif + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) @@@ -654,12 -485,41 +654,41 @@@ out
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
+ static inline bool lruvec_holds_page_lru_lock(struct page *page, + struct lruvec *lruvec) + { + pg_data_t *pgdat = page_pgdat(page); + const struct mem_cgroup *memcg; + struct mem_cgroup_per_node *mz; + + if (mem_cgroup_disabled()) + return lruvec == &pgdat->__lruvec; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = page_memcg(page) ? : root_mem_cgroup; + + return lruvec->pgdat == pgdat && mz->memcg == memcg; + } + struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
+ struct lruvec *lock_page_lruvec(struct page *page); + struct lruvec *lock_page_lruvec_irq(struct page *page); + struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flags); + + #ifdef CONFIG_DEBUG_VM + void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); + #else + static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) + { + } + #endif + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@@ -904,19 -764,15 +933,19 @@@ static inline void mod_memcg_state(stru static inline void __mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - __mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + __mod_memcg_state(memcg, idx, val); }
static inline void mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + mod_memcg_state(memcg, idx, val); }
static inline unsigned long lruvec_page_state(struct lruvec *lruvec, @@@ -1002,10 -858,8 +1031,10 @@@ static inline void count_memcg_events(s static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { - if (page->mem_cgroup) - count_memcg_events(page->mem_cgroup, idx, 1); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + count_memcg_events(memcg, idx, 1); }
static inline void count_memcg_event_mm(struct mm_struct *mm, @@@ -1074,27 -928,6 +1103,27 @@@ void mem_cgroup_split_huge_fixup(struc
struct mem_cgroup;
+static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; +} + +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + return NULL; +} + +static inline bool PageMemcgKmem(struct page *page) +{ + return false; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; @@@ -1167,6 -1000,14 +1196,14 @@@ static inline struct lruvec *mem_cgroup return &pgdat->__lruvec; }
+ static inline bool lruvec_holds_page_lru_lock(struct page *page, + struct lruvec *lruvec) + { + pg_data_t *pgdat = page_pgdat(page); + + return lruvec == &pgdat->__lruvec; + } + static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@@ -1192,6 -1033,31 +1229,31 @@@ static inline void mem_cgroup_put(struc { }
+ static inline struct lruvec *lock_page_lruvec(struct page *page) + { + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; + } + + static inline struct lruvec *lock_page_lruvec_irq(struct page *page) + { + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irq(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; + } + + static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flagsp) + { + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); + return &pgdat->__lruvec; + } + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@@ -1411,6 -1277,10 +1473,10 @@@ static inlin void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + + static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) + { + } #endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */ @@@ -1492,6 -1362,50 +1558,50 @@@ static inline struct lruvec *parent_lru return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); }
+ static inline void unlock_page_lruvec(struct lruvec *lruvec) + { + spin_unlock(&lruvec->lru_lock); + } + + static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) + { + spin_unlock_irq(&lruvec->lru_lock); + } + + static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, + unsigned long flags) + { + spin_unlock_irqrestore(&lruvec->lru_lock, flags); + } + + /* Don't lock again iff page's lruvec locked */ + static inline struct lruvec *relock_page_lruvec_irq(struct page *page, + struct lruvec *locked_lruvec) + { + if (locked_lruvec) { + if (lruvec_holds_page_lru_lock(page, locked_lruvec)) + return locked_lruvec; + + unlock_page_lruvec_irq(locked_lruvec); + } + + return lock_page_lruvec_irq(page); + } + + /* Don't lock again iff page's lruvec locked */ + static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page, + struct lruvec *locked_lruvec, unsigned long *flags) + { + if (locked_lruvec) { + if (lruvec_holds_page_lru_lock(page, locked_lruvec)) + return locked_lruvec; + + unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + } + + return lock_page_lruvec_irqsave(page, flags); + } + #ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); @@@ -1508,7 -1422,7 +1618,7 @@@ static inline void mem_cgroup_track_for if (mem_cgroup_disabled()) return;
- if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) + if (unlikely(&page_memcg(page)->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); }
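
The memcontrol.h hunks above also replace page->mem_cgroup with the tagged word page->memcg_data. A small decoder, written against the flag bits and accessors introduced above, shows how the low bits select the interpretation of the rest of the word (hypothetical helper, illustration only):

    /*
     * Illustration only: the low bits of page->memcg_data pick the
     * meaning of the remaining pointer bits (see the accessors above).
     */
    static const char *memcg_data_kind(struct page *page)
    {
            unsigned long data = READ_ONCE(page->memcg_data);

            if (!data)
                    return "uncharged";
            if (data & MEMCG_DATA_OBJCGS)
                    return "objcg vector: slab page, use page_objcgs()";
            if (data & MEMCG_DATA_KMEM)
                    return "memcg pointer: kernel page, PageMemcgKmem()";
            return "memcg pointer: user page, page_memcg()";
    }
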
diff --combined include/linux/mm_types.h index e7de072ade03,a9688cc55964..07d9acb5b19c --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@@ -79,7 -79,7 +79,7 @@@ struct page struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by - * pgdat->lru_lock. Sometimes used as a generic list + * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ struct list_head lru; @@@ -200,7 -200,10 +200,7 @@@ atomic_t _refcount;
#ifdef CONFIG_MEMCG - union { - struct mem_cgroup *mem_cgroup; - struct obj_cgroup **obj_cgroups; - }; + unsigned long memcg_data; #endif
/* diff --combined include/linux/page-flags.h index b5eb0fc15053,f8b4375ce1b3..ec5d0290e0ee --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@@ -334,6 -334,7 +334,7 @@@ PAGEFLAG(Referenced, referenced, PF_HEA PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) + TESTCLEARFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) PAGEFLAG(Workingset, workingset, PF_HEAD) @@@ -713,8 -714,9 +714,8 @@@ PAGEFLAG_FALSE(DoubleMap #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 -#define PG_kmemcg 0x00000200 -#define PG_table 0x00000400 -#define PG_guard 0x00000800 +#define PG_table 0x00000200 +#define PG_guard 0x00000400
#define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@@ -765,6 -767,12 +766,6 @@@ PAGE_TYPE_OPS(Buddy, buddy */ PAGE_TYPE_OPS(Offline, offline)
-/* - * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on - * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. - */ -PAGE_TYPE_OPS(Kmemcg, kmemcg) - /* * Marks pages in use as page tables. */ diff --combined mm/huge_memory.c index e425bbf6711a,3c4a8fc9102f..1efe2b5ad59a --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@@ -484,7 -484,7 +484,7 @@@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, stru #ifdef CONFIG_MEMCG static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
if (memcg) @@@ -2359,6 -2359,27 +2359,27 @@@ static void remap_page(struct page *pag } }
+ static void lru_add_page_tail(struct page *head, struct page *tail, + struct lruvec *lruvec, struct list_head *list) + { + VM_BUG_ON_PAGE(!PageHead(head), head); + VM_BUG_ON_PAGE(PageCompound(tail), head); + VM_BUG_ON_PAGE(PageLRU(tail), head); + lockdep_assert_held(&lruvec->lru_lock); + + if (list) { + /* page reclaim is reclaiming a huge page */ + VM_WARN_ON(PageLRU(head)); + get_page(tail); + list_add_tail(&tail->lru, list); + } else { + /* head is still on lru (and we have it frozen) */ + VM_WARN_ON(!PageLRU(head)); + SetPageLRU(tail); + list_add_tail(&tail->lru, &head->lru); + } + } + static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { @@@ -2425,18 -2446,15 +2446,15 @@@ }
static void __split_huge_page(struct page *page, struct list_head *list, - pgoff_t end, unsigned long flags) + pgoff_t end) { struct page *head = compound_head(page); - pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); int i;
- lruvec = mem_cgroup_page_lruvec(head, pgdat); - /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head);
@@@ -2448,6 -2466,9 +2466,9 @@@ xa_lock(&swap_cache->i_pages); }
+ /* lock lru list/PageCompound, ref freezed by page_ref_freeze */ + lruvec = lock_page_lruvec(head); + for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@@ -2467,6 -2488,8 +2488,8 @@@ }
ClearPageCompound(head); + unlock_page_lruvec(lruvec); + /* Caller disabled irqs, so they are still disabled here */
split_page_owner(head, nr);
@@@ -2484,8 -2507,7 +2507,7 @@@ page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } - - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + local_irq_enable();
remap_page(head, nr);
@@@ -2631,12 -2653,10 +2653,10 @@@ bool can_split_huge_page(struct page *p int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); - struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); struct deferred_split *ds_queue = get_deferred_split_queue(head); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; - unsigned long flags; pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head); @@@ -2697,9 -2717,8 +2717,8 @@@ unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head);
- /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irqsave(&pgdata->lru_lock, flags); - + /* block interrupt reentry in xa_lock and spinlock */ + local_irq_disable(); if (mapping) { XA_STATE(xas, &mapping->i_pages, page_index(head));
@@@ -2729,7 -2748,7 +2748,7 @@@ __dec_lruvec_page_state(head, NR_FILE_THPS); }
- __split_huge_page(page, list, end, flags); + __split_huge_page(page, list, end); ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { @@@ -2743,7 -2762,7 +2762,7 @@@ spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); - spin_unlock_irqrestore(&pgdata->lru_lock, flags); + local_irq_enable(); remap_page(head, thp_nr_pages(head)); ret = -EBUSY; } @@@ -2778,7 -2797,7 +2797,7 @@@ void deferred_split_huge_page(struct pa { struct deferred_split *ds_queue = get_deferred_split_queue(page); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); #endif unsigned long flags;
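
The TESTCLEARFLAG(LRU, lru, PF_HEAD) added in page-flags.h above supports the new isolation order from "mm/lru: introduce TestClearPageLRU()" and "mm/compaction: do page isolation first in compaction": clear PageLRU first, and only the winner of that race takes the lruvec lock. A sketch of that order (hypothetical helper, illustration only; LRU size accounting via del_page_from_lru_list() is omitted):

    /*
     * Illustration only: TestClearPageLRU() acts as the gatekeeper,
     * so the lruvec lock is taken only by the task that isolated the
     * page. Not the real isolate_lru_page() implementation.
     */
    static bool isolate_page_sketch(struct page *page, struct list_head *dst)
    {
            struct lruvec *lruvec;

            if (unlikely(!get_page_unless_zero(page)))
                    return false;

            if (!TestClearPageLRU(page)) {
                    /* Someone else isolated it, or it was never on an LRU */
                    put_page(page);
                    return false;
            }

            lruvec = lock_page_lruvec_irq(page);
            list_move(&page->lru, dst);
            unlock_page_lruvec_irq(lruvec);
            return true;
    }
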
diff --combined mm/memcontrol.c index 9c5b14fe360e,2f7824d0c897..e3c7ca7dc174 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -20,6 -20,9 +20,9 @@@ * Lockless page tracking & accounting * Unified hierarchy configuration model * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * + * Per memcg lru locking + * Copyright (C) 2020 Alibaba, Inc, Alex Shi */
#include <linux/page_counter.h> @@@ -533,7 -536,7 +536,7 @@@ struct cgroup_subsys_state *mem_cgroup_ { struct mem_cgroup *memcg;
- memcg = page->mem_cgroup; + memcg = page_memcg(page);
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; @@@ -560,7 -563,16 +563,7 @@@ ino_t page_cgroup_ino(struct page *page unsigned long ino = 0;
rcu_read_lock(); - memcg = page->mem_cgroup; - - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. - */ - if ((unsigned long) memcg & 0x1UL) - memcg = NULL; + memcg = page_memcg_check(page);
while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); @@@ -848,17 -860,16 +851,17 @@@ void __mod_lruvec_page_state(struct pag int val) { struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg = page_memcg(head); pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec;
/* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!head->mem_cgroup) { + if (!memcg) { __mod_node_page_state(pgdat, idx, val); return; }
- lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } EXPORT_SYMBOL(__mod_lruvec_page_state); @@@ -1049,7 -1060,7 +1052,7 @@@ EXPORT_SYMBOL(get_mem_cgroup_from_mm) */ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page);
if (mem_cgroup_disabled()) return NULL; @@@ -1322,6 -1333,23 +1325,23 @@@ int mem_cgroup_scan_tasks(struct mem_cg return ret; }
+ #ifdef CONFIG_DEBUG_VM + void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) + { + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + memcg = page_memcg(page); + + if (!memcg) + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + else + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page); + } + #endif + /** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page @@@ -1341,7 -1369,7 +1361,7 @@@ struct lruvec *mem_cgroup_page_lruvec(s goto out; }
- memcg = page->mem_cgroup; + memcg = page_memcg(page); /* * Swapcache readahead pages are added to the LRU - and * possibly migrated - before they are charged. @@@ -1362,6 -1390,60 +1382,60 @@@ out return lruvec; }
+ /** + * lock_page_lruvec - lock and return lruvec for a given page. + * @page: the page + * + * This series functions should be used in either conditions: + * PageLRU is cleared or unset + * or page->_refcount is zero + * or page is locked. + */ + struct lruvec *lock_page_lruvec(struct page *page) + { + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; + } + + struct lruvec *lock_page_lruvec_irq(struct page *page) + { + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irq(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; + } + + struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) + { + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irqsave(&lruvec->lru_lock, *flags); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; + } + /** * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector @@@ -2106,7 -2188,7 +2180,7 @@@ void mem_cgroup_print_oom_group(struct }
/** - * lock_page_memcg - lock a page->mem_cgroup binding + * lock_page_memcg - lock a page and memcg binding * @page: the page * * This function protects unlocked LRU pages from being moved to @@@ -2138,15 -2220,21 +2212,21 @@@ struct mem_cgroup *lock_page_memcg(stru if (mem_cgroup_disabled()) return NULL; again: - memcg = head->mem_cgroup; + memcg = page_memcg(head); if (unlikely(!memcg)) return NULL;
+ #ifdef CONFIG_PROVE_LOCKING + local_irq_save(flags); + might_lock(&memcg->move_lock); + local_irq_restore(flags); + #endif + if (atomic_read(&memcg->moving_account) <= 0) return memcg;
spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != head->mem_cgroup) { + if (memcg != page_memcg(head)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@@ -2184,14 -2272,14 +2264,14 @@@ void __unlock_page_memcg(struct mem_cgr }
/** - * unlock_page_memcg - unlock a page->mem_cgroup binding + * unlock_page_memcg - unlock a page and memcg binding * @page: the page */ void unlock_page_memcg(struct page *page) { struct page *head = compound_head(page);
- __unlock_page_memcg(head->mem_cgroup); + __unlock_page_memcg(page_memcg(head)); } EXPORT_SYMBOL(unlock_page_memcg);
@@@ -2881,7 -2969,7 +2961,7 @@@ static void cancel_charge(struct mem_cg
static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page->mem_cgroup, page); + VM_BUG_ON_PAGE(page_memcg(page), page); /* * Any of the following ensures page's memcg stability: * @@@ -2890,7 -2978,7 +2970,7 @@@ * - lock_page_memcg() * - exclusive reference */ - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; }
#ifdef CONFIG_MEMCG_KMEM @@@ -2905,7 -2993,8 +2985,7 @@@ int memcg_alloc_page_obj_cgroups(struc if (!vec) return -ENOMEM;
- if (cmpxchg(&page->obj_cgroups, NULL, - (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) + if (!set_page_objcgs(page, vec)) kfree(vec); else kmemleak_not_leak(vec); @@@ -2916,12 -3005,6 +2996,12 @@@ /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * + * A passed kernel object can be a slab object or a generic kernel page, so + * different mechanisms for getting the memory cgroup pointer should be used. + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller + * can not know for sure how the kernel object is implemented. + * mem_cgroup_from_obj() can be safely used in such cases. + * * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. */ @@@ -2934,31 -3017,36 +3014,31 @@@ struct mem_cgroup *mem_cgroup_from_obj(
page = virt_to_head_page(p);
- /* - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer - * or a pointer to obj_cgroup vector. In the latter case the lowest - * bit of the pointer is set. - * The page->mem_cgroup pointer can be asynchronously changed - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed - * from a valid memcg pointer to objcg vector or back. - */ - if (!page->mem_cgroup) - return NULL; - /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in * the page->obj_cgroups. */ - if (page_has_obj_cgroups(page)) { + if (page_objcgs_check(page)) { struct obj_cgroup *objcg; unsigned int off;
off = obj_to_index(page->slab_cache, page, p); - objcg = page_obj_cgroups(page)[off]; + objcg = page_objcgs(page)[off]; if (objcg) return obj_cgroup_memcg(objcg);
return NULL; }
- /* All other pages use page->mem_cgroup */ - return page->mem_cgroup; + /* + * page_memcg_check() is used here, because page_has_obj_cgroups() + * check above could fail because the object cgroups vector wasn't set + * at that moment, but it can be set concurrently. + * page_memcg_check(page) will guarantee that a proper memory + * cgroup pointer or NULL will be returned. + */ + return page_memcg_check(page); }
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) @@@ -3097,8 -3185,8 +3177,8 @@@ int __memcg_kmem_charge_page(struct pag if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { - page->mem_cgroup = memcg; - __SetPageKmemcg(page); + page->memcg_data = (unsigned long)memcg | + MEMCG_DATA_KMEM; return 0; } css_put(&memcg->css); @@@ -3113,7 -3201,7 +3193,7 @@@ */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); unsigned int nr_pages = 1 << order;
if (!memcg) @@@ -3121,8 -3209,12 +3201,8 @@@
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); __memcg_kmem_uncharge(memcg, nr_pages); - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&memcg->css); - - /* slab pages do not have PageKmemcg flag set */ - if (PageKmemcg(page)) - __ClearPageKmemcg(page); }
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@@ -3263,14 -3355,12 +3343,12 @@@ void obj_cgroup_uncharge(struct obj_cgr #endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * Because tail pages are not marked as "used", set it. We're under - * pgdat->lru_lock and migration entries setup in all page mappings. + * Because page_memcg(head) is not set on compound tails, set it now. */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct mem_cgroup *memcg = head->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(head); int i;
if (mem_cgroup_disabled()) @@@ -3278,7 -3368,7 +3356,7 @@@
for (i = 1; i < HPAGE_PMD_NR; i++) { css_get(&memcg->css); - head[i].mem_cgroup = memcg; + head[i].memcg_data = (unsigned long)memcg; } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@@ -4619,7 -4709,7 +4697,7 @@@ void mem_cgroup_wb_stats(struct bdi_wri void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; @@@ -5580,14 -5670,14 +5658,14 @@@ static int mem_cgroup_move_account(stru
/* * Prevent mem_cgroup_migrate() from looking at - * page->mem_cgroup of its source page while we change it. + * page's memory cgroup of its source page while we change it. */ ret = -EBUSY; if (!trylock_page(page)) goto out;
ret = -EINVAL; - if (page->mem_cgroup != from) + if (page_memcg(page) != from) goto out_unlock;
pgdat = page_pgdat(page); @@@ -5642,13 -5732,13 +5720,13 @@@ /* * All state has been migrated, let's switch to the new memcg. * - * It is safe to change page->mem_cgroup here because the page + * It is safe to change page's memcg here because the page * is referenced, charged, isolated, and locked: we can't race * with (un)charging, migration, LRU putback, or anything else - * that would rely on a stable page->mem_cgroup. + * that would rely on a stable page's memory cgroup. * * Note that lock_page_memcg is a memcg lock, not a page lock, - * to save space. As soon as we switch page->mem_cgroup to a + * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. */ @@@ -5657,7 -5747,7 +5735,7 @@@ css_get(&to->css); css_put(&from->css);
- page->mem_cgroup = to; + page->memcg_data = (unsigned long)to;
__unlock_page_memcg(from);
@@@ -5723,7 -5813,7 +5801,7 @@@ static enum mc_target_type get_mctgt_ty * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; @@@ -5767,7 -5857,7 +5845,7 @@@ static enum mc_target_type get_mctgt_ty VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@@ -6694,12 -6784,12 +6772,12 @@@ int mem_cgroup_charge(struct page *page /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. page->mem_cgroup is protected - * by the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. + * already charged pages, too. page and memcg binding is + * protected by the page lock, which serializes swap cache + * removal, which in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound_head(page)->mem_cgroup) + if (page_memcg(compound_head(page))) goto out;
id = lookup_swap_cgroup_id(ent); @@@ -6783,21 -6873,21 +6861,21 @@@ static void uncharge_page(struct page *
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (!page->mem_cgroup) + if (!page_memcg(page)) return;
/* * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully + * page_memcg(page) at this point, we have fully * exclusive access to the page. */
- if (ug->memcg != page->mem_cgroup) { + if (ug->memcg != page_memcg(page)) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page->mem_cgroup; + ug->memcg = page_memcg(page);
/* pairs with css_put in uncharge_batch */ css_get(&ug->memcg->css); @@@ -6806,13 -6896,15 +6884,13 @@@ nr_pages = compound_nr(page); ug->nr_pages += nr_pages;
- if (!PageKmemcg(page)) { - ug->pgpgout++; - } else { + if (PageMemcgKmem(page)) ug->nr_kmem += nr_pages; - __ClearPageKmemcg(page); - } + else + ug->pgpgout++;
ug->dummy_page = page; - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&ug->memcg->css); }
@@@ -6855,7 -6947,7 +6933,7 @@@ void mem_cgroup_uncharge(struct page *p return;
/* Don't touch page->lru of any random page, pre-check: */ - if (!page->mem_cgroup) + if (!page_memcg(page)) return;
uncharge_gather_clear(&ug); @@@ -6905,10 -6997,10 +6983,10 @@@ void mem_cgroup_migrate(struct page *ol return;
/* Page cache replacement: new page already charged? */ - if (newpage->mem_cgroup) + if (page_memcg(newpage)) return;
- memcg = oldpage->mem_cgroup; + memcg = page_memcg(oldpage); if (!memcg) return;
@@@ -7103,7 -7195,7 +7181,7 @@@ void mem_cgroup_swapout(struct page *pa if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return;
- memcg = page->mem_cgroup; + memcg = page_memcg(page);
/* Readahead page, never charged */ if (!memcg) @@@ -7124,7 -7216,7 +7202,7 @@@ VM_BUG_ON_PAGE(oldid, page); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- page->mem_cgroup = NULL; + page->memcg_data = 0;
if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); @@@ -7167,7 -7259,7 +7245,7 @@@ int mem_cgroup_try_charge_swap(struct p if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0;
- memcg = page->mem_cgroup; + memcg = page_memcg(page);
/* Readahead page, never charged */ if (!memcg) @@@ -7248,7 -7340,7 +7326,7 @@@ bool mem_cgroup_swap_full(struct page * if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false;
- memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg) return false;
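
relock_page_lruvec_irqsave() from the header exists for batched callers such as the pagevec walkers in mm/swap.c: the lock is dropped and retaken only when consecutive pages belong to different lruvecs. A sketch of that pattern (hypothetical walker, illustration only):

    /*
     * Illustration only: keep the lruvec lock across consecutive pages
     * that share a lruvec, switching locks only when the lruvec changes.
     */
    static void walk_pagevec_sketch(struct pagevec *pvec)
    {
            struct lruvec *lruvec = NULL;
            unsigned long flags;
            int i;

            for (i = 0; i < pagevec_count(pvec); i++) {
                    struct page *page = pvec->pages[i];

                    lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
                    /* ... operate on page under lruvec->lru_lock ... */
            }
            if (lruvec)
                    unlock_page_lruvec_irqrestore(lruvec, flags);
    }
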
diff --combined mm/page_alloc.c index db37bf231e8b,b1cc2b7483a1..3beeb8d722f3 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@@ -1101,7 -1101,7 +1101,7 @@@ static inline bool page_expected_state( if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page->mem_cgroup | + (unsigned long)page_memcg(page) | #endif (page->flags & check_flags))) return false; @@@ -1126,7 -1126,7 +1126,7 @@@ static const char *page_bad_reason(stru bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) + if (unlikely(page_memcg(page))) bad_reason = "page still charged to cgroup"; #endif return bad_reason; @@@ -1223,7 -1223,7 +1223,7 @@@ static __always_inline bool free_pages_ * Do not let hwpoison pages hit pcplists/buddy * Untie memcg state and reset page's owner */ - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); return false; @@@ -1253,7 -1253,7 +1253,7 @@@ } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) bad += check_free_page(page); @@@ -6870,7 -6870,6 +6870,6 @@@ static void __meminit pgdat_init_intern init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat); - spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); }
diff --combined mm/workingset.c index 9abe51d51aa7,94b512538d5a..10e96de945b3 --- a/mm/workingset.c +++ b/mm/workingset.c @@@ -257,7 -257,7 +257,7 @@@ void *workingset_eviction(struct page * struct lruvec *lruvec; int memcgid;
- /* Page is fully exclusive and pins page->mem_cgroup */ + /* Page is fully exclusive and pins page's memory cgroup pointer */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@@ -381,9 -381,7 +381,7 @@@ void workingset_refault(struct page *pa if (workingset) { SetPageWorkingset(page); /* XXX: Move to lru_cache_add() when it supports new vs putback */ - spin_lock_irq(&page_pgdat(page)->lru_lock); lru_note_cost_page(page); - spin_unlock_irq(&page_pgdat(page)->lru_lock); inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); } out: