The following commit has been merged in the master branch:
commit 5b200f578960a9635918a0ed41be3d8dc90186bf
Merge: 3db1a3fa98808aa90f95ec3e0fa2fc7abf28f5c9 15b447361794271f4d03c04d82276a841fe06328
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue Dec 15 14:55:10 2020 -0800
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"More MM work: a memcg scalability improvememt"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm/lru: revise the comments of lru_lock
mm/lru: introduce relock_page_lruvec()
mm/lru: replace pgdat lru_lock with lruvec lock
mm/swap.c: serialize memcg changes in pagevec_lru_move_fn
mm/compaction: do page isolation first in compaction
mm/lru: introduce TestClearPageLRU()
mm/mlock: remove __munlock_isolate_lru_page()
mm/mlock: remove lru_lock on TestClearPageMlocked
mm/vmscan: remove lruvec reget in move_pages_to_lru
mm/lru: move lock into lru_note_cost
mm/swap.c: fold vm event PGROTATED into pagevec_move_tail_fn
mm/memcg: add debug checking in lock_page_memcg
mm: page_idle_get_page() does not need lru_lock
mm/rmap: stop store reordering issue on page->mapping
mm/vmscan: remove unnecessary lruvec adding
mm/thp: narrow lru locking
mm/thp: simplify lru_add_page_tail()
mm/thp: use head for head page in lru_add_page_tail()
mm/thp: move lru_add_page_tail() to huge_memory.c
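The centerpiece of the series is replacing the node-wide pgdat->lru_lock with a per-memcg lruvec->lru_lock. Batched LRU operations then only drop and re-take the lock when consecutive pages belong to different lruvecs, using the relock_page_lruvec_*() helpers added below. A rough sketch of the intended caller pattern, modelled on mm/swap.c's pagevec_lru_move_fn() (simplified, not the verbatim merged code):

	static void pagevec_lru_move_fn(struct pagevec *pvec,
		void (*move_fn)(struct page *page, struct lruvec *lruvec))
	{
		int i;
		struct lruvec *lruvec = NULL;
		unsigned long flags = 0;

		for (i = 0; i < pagevec_count(pvec); i++) {
			struct page *page = pvec->pages[i];

			/* block memcg migration while the page moves between lrus */
			if (!TestClearPageLRU(page))
				continue;

			/* only drop/retake the lock when the lruvec changes */
			lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
			(*move_fn)(page, lruvec);

			SetPageLRU(page);
		}
		if (lruvec)
			unlock_page_lruvec_irqrestore(lruvec, flags);
		release_pages(pvec->pages, pvec->nr);
		pagevec_reinit(pvec);
	}

TestClearPageLRU(), also introduced by this series, is what keeps memcg migration away from a page while it is being moved between lru lists.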
diff --combined include/linux/memcontrol.h
index f5b4d710f099,ff02f831e7e1..08ed57e02b73
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@@ -337,175 -337,6 +337,175 @@@ struct mem_cgroup
extern struct mem_cgroup *root_mem_cgroup;
+enum page_memcg_data_flags {
+ /* page->memcg_data is a pointer to an objcgs vector */
+ MEMCG_DATA_OBJCGS = (1UL << 0),
+ /* page has been accounted as a non-slab kernel page */
+ MEMCG_DATA_KMEM = (1UL << 1),
+ /* the next bit after the last actual flag */
+ __NR_MEMCG_DATA_FLAGS = (1UL << 2),
+};
+
+#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
+
+/*
+ * page_memcg - get the memory cgroup associated with a page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the memory cgroup associated with the page,
+ * or NULL. This function assumes that the page is known to have a
+ * proper memory cgroup pointer. It's not safe to call this function
+ * against some types of pages, e.g. slab pages or ex-slab pages.
+ *
+ * Any of the following ensures page and memcg binding stability:
+ * - the page lock
+ * - LRU isolation
+ * - lock_page_memcg()
+ * - exclusive reference
+ */
+static inline struct mem_cgroup *page_memcg(struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ VM_BUG_ON_PAGE(PageSlab(page), page);
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+
+ return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
+/*
+ * page_memcg_rcu - locklessly get the memory cgroup associated with a page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the memory cgroup associated with the page,
+ * or NULL. This function assumes that the page is known to have a
+ * proper memory cgroup pointer. It's not safe to call this function
+ * against some types of pages, e.g. slab pages or ex-slab pages.
+ */
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageSlab(page), page);
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
+ ~MEMCG_DATA_FLAGS_MASK);
+}
+
+/*
+ * page_memcg_check - get the memory cgroup associated with a page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the memory cgroup associated with the page,
+ * or NULL. This function, unlike page_memcg(), can take any page
+ * as an argument. It has to be used in cases when it's not known if a page
+ * has an associated memory cgroup pointer or an object cgroups vector.
+ *
+ * Any of the following ensures page and memcg binding stability:
+ * - the page lock
+ * - LRU isolation
+ * - lock_page_memcg()
+ * - exclusive reference
+ */
+static inline struct mem_cgroup *page_memcg_check(struct page *page)
+{
+ /*
+ * Because page->memcg_data might be changed asynchronously
+ * for slab pages, READ_ONCE() should be used here.
+ */
+ unsigned long memcg_data = READ_ONCE(page->memcg_data);
+
+ if (memcg_data & MEMCG_DATA_OBJCGS)
+ return NULL;
+
+ return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
+/*
+ * PageMemcgKmem - check if the page has MemcgKmem flag set
+ * @page: a pointer to the page struct
+ *
+ * Checks if the page has MemcgKmem flag set. The caller must ensure that
+ * the page has an associated memory cgroup. It's not safe to call this function
+ * against some types of pages, e.g. slab pages.
+ */
+static inline bool PageMemcgKmem(struct page *page)
+{
+ VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page);
+ return page->memcg_data & MEMCG_DATA_KMEM;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * page_objcgs - get the object cgroups vector associated with a page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the object cgroups vector associated with the page,
+ * or NULL. This function assumes that the page is known to have an
+ * associated object cgroups vector. It's not safe to call this function
+ * against pages that might have an associated memory cgroup, e.g.
+ * kernel stack pages.
+ */
+static inline struct obj_cgroup **page_objcgs(struct page *page)
+{
+ unsigned long memcg_data = READ_ONCE(page->memcg_data);
+
+ VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page);
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
+
+ return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
+/*
+ * page_objcgs_check - get the object cgroups vector associated with a page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the object cgroups vector associated with the page,
+ * or NULL. This function is safe to use if the page can be directly associated
+ * with a memory cgroup.
+ */
+static inline struct obj_cgroup **page_objcgs_check(struct page *page)
+{
+ unsigned long memcg_data = READ_ONCE(page->memcg_data);
+
+ if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS))
+ return NULL;
+
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
+
+ return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
+/*
+ * set_page_objcgs - associate a page with an object cgroups vector
+ * @page: a pointer to the page struct
+ * @objcgs: a pointer to the object cgroups vector
+ *
+ * Atomically associates a page with a vector of object cgroups.
+ */
+static inline bool set_page_objcgs(struct page *page,
+ struct obj_cgroup **objcgs)
+{
+ return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs |
+ MEMCG_DATA_OBJCGS);
+}
+#else
+static inline struct obj_cgroup **page_objcgs(struct page *page)
+{
+ return NULL;
+}
+
+static inline struct obj_cgroup **page_objcgs_check(struct page *page)
+{
+ return NULL;
+}
+
+static inline bool set_page_objcgs(struct page *page,
+ struct obj_cgroup **objcgs)
+{
+ return true;
+}
+#endif
+
static __always_inline bool memcg_stat_item_in_bytes(int idx)
{
if (idx == MEMCG_PERCPU_B)
@@@ -654,12 -485,41 +654,41 @@@ out
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
+ static inline bool lruvec_holds_page_lru_lock(struct page *page,
+ struct lruvec *lruvec)
+ {
+ pg_data_t *pgdat = page_pgdat(page);
+ const struct mem_cgroup *memcg;
+ struct mem_cgroup_per_node *mz;
+
+ if (mem_cgroup_disabled())
+ return lruvec == &pgdat->__lruvec;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = page_memcg(page) ? : root_mem_cgroup;
+
+ return lruvec->pgdat == pgdat && mz->memcg == memcg;
+ }
+
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
+ struct lruvec *lock_page_lruvec(struct page *page);
+ struct lruvec *lock_page_lruvec_irq(struct page *page);
+ struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+ unsigned long *flags);
+
+ #ifdef CONFIG_DEBUG_VM
+ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+ #else
+ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+ {
+ }
+ #endif
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@@ -904,19 -764,15 +933,19 @@@ static inline void mod_memcg_state(stru
static inline void __mod_memcg_page_state(struct page *page,
int idx, int val)
{
- if (page->mem_cgroup)
- __mod_memcg_state(page->mem_cgroup, idx, val);
+ struct mem_cgroup *memcg = page_memcg(page);
+
+ if (memcg)
+ __mod_memcg_state(memcg, idx, val);
}
static inline void mod_memcg_page_state(struct page *page,
int idx, int val)
{
- if (page->mem_cgroup)
- mod_memcg_state(page->mem_cgroup, idx, val);
+ struct mem_cgroup *memcg = page_memcg(page);
+
+ if (memcg)
+ mod_memcg_state(memcg, idx, val);
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
@@@ -1002,10 -858,8 +1031,10 @@@ static inline void count_memcg_events(s
static inline void count_memcg_page_event(struct page *page,
enum vm_event_item idx)
{
- if (page->mem_cgroup)
- count_memcg_events(page->mem_cgroup, idx, 1);
+ struct mem_cgroup *memcg = page_memcg(page);
+
+ if (memcg)
+ count_memcg_events(memcg, idx, 1);
}
static inline void count_memcg_event_mm(struct mm_struct *mm,
@@@ -1074,27 -928,6 +1103,27 @@@ void mem_cgroup_split_huge_fixup(struc
struct mem_cgroup;
+static inline struct mem_cgroup *page_memcg(struct page *page)
+{
+ return NULL;
+}
+
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return NULL;
+}
+
+static inline struct mem_cgroup *page_memcg_check(struct page *page)
+{
+ return NULL;
+}
+
+static inline bool PageMemcgKmem(struct page *page)
+{
+ return false;
+}
+
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return true;
@@@ -1167,6 -1000,14 +1196,14 @@@ static inline struct lruvec *mem_cgroup
return &pgdat->__lruvec;
}
+ static inline bool lruvec_holds_page_lru_lock(struct page *page,
+ struct lruvec *lruvec)
+ {
+ pg_data_t *pgdat = page_pgdat(page);
+
+ return lruvec == &pgdat->__lruvec;
+ }
+
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
return NULL;
@@@ -1192,6 -1033,31 +1229,31 @@@ static inline void mem_cgroup_put(struc
{
}
+ static inline struct lruvec *lock_page_lruvec(struct page *page)
+ {
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock(&pgdat->__lruvec.lru_lock);
+ return &pgdat->__lruvec;
+ }
+
+ static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+ {
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock_irq(&pgdat->__lruvec.lru_lock);
+ return &pgdat->__lruvec;
+ }
+
+ static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+ unsigned long *flagsp)
+ {
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+ return &pgdat->__lruvec;
+ }
+
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@@ -1411,6 -1277,10 +1473,10 @@@ static inlin
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
+
+ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+ {
+ }
#endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */
@@@ -1492,6 -1362,50 +1558,50 @@@ static inline struct lruvec *parent_lru
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
}
+ static inline void unlock_page_lruvec(struct lruvec *lruvec)
+ {
+ spin_unlock(&lruvec->lru_lock);
+ }
+
+ static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+ {
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+ unsigned long flags)
+ {
+ spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+ }
+
+ /* Don't lock again iff page's lruvec locked */
+ static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
+ struct lruvec *locked_lruvec)
+ {
+ if (locked_lruvec) {
+ if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+ return locked_lruvec;
+
+ unlock_page_lruvec_irq(locked_lruvec);
+ }
+
+ return lock_page_lruvec_irq(page);
+ }
+
+ /* Don't lock again iff page's lruvec locked */
+ static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
+ struct lruvec *locked_lruvec, unsigned long *flags)
+ {
+ if (locked_lruvec) {
+ if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+ return locked_lruvec;
+
+ unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
+ }
+
+ return lock_page_lruvec_irqsave(page, flags);
+ }
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
@@@ -1508,7 -1422,7 +1618,7 @@@ static inline void mem_cgroup_track_for
if (mem_cgroup_disabled())
return;
- if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
+ if (unlikely(&page_memcg(page)->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}
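The memcontrol.h additions above turn the page/memcg association into a single tagged word, page->memcg_data (the struct page field itself is converted in the mm_types.h hunk below): the low two bits carry the MEMCG_DATA_* flags and the rest of the word is a pointer. A consolidated illustration of the scheme (decode_page_memcg() is a made-up name used only here; the real accessors are page_memcg(), page_memcg_check() and page_objcgs() above):

	/* Illustration only -- mirrors page_memcg_check() from the hunk above. */
	static struct mem_cgroup *decode_page_memcg(unsigned long memcg_data)
	{
		if (memcg_data & MEMCG_DATA_OBJCGS)
			return NULL;	/* slab page: the word holds an obj_cgroup vector */

		return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
	}

	/* Charging a non-slab kernel page stores the pointer together with a flag: */
	/*	page->memcg_data = (unsigned long)memcg | MEMCG_DATA_KMEM;	*/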
diff --combined include/linux/mm_types.h
index e7de072ade03,a9688cc55964..07d9acb5b19c
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@@ -79,7 -79,7 +79,7 @@@ struct page
struct { /* Page cache and anonymous pages */
/**
* @lru: Pageout list, eg. active_list protected by
- * pgdat->lru_lock. Sometimes used as a generic list
+ * lruvec->lru_lock. Sometimes used as a generic list
* by the page owner.
*/
struct list_head lru;
@@@ -200,7 -200,10 +200,7 @@@
atomic_t _refcount;
#ifdef CONFIG_MEMCG
- union {
- struct mem_cgroup *mem_cgroup;
- struct obj_cgroup **obj_cgroups;
- };
+ unsigned long memcg_data;
#endif
/*
diff --combined include/linux/page-flags.h
index b5eb0fc15053,f8b4375ce1b3..ec5d0290e0ee
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@@ -334,6 -334,7 +334,7 @@@ PAGEFLAG(Referenced, referenced, PF_HEA
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
__CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+ TESTCLEARFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
@@@ -713,8 -714,9 +714,8 @@@ PAGEFLAG_FALSE(DoubleMap
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
#define PG_offline 0x00000100
-#define PG_kmemcg 0x00000200
-#define PG_table 0x00000400
-#define PG_guard 0x00000800
+#define PG_table 0x00000200
+#define PG_guard 0x00000400
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@@ -765,6 -767,12 +766,6 @@@ PAGE_TYPE_OPS(Buddy, buddy
*/
PAGE_TYPE_OPS(Offline, offline)
-/*
- * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
- * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
- */
-PAGE_TYPE_OPS(Kmemcg, kmemcg)
-
/*
* Marks pages in use as page tables.
*/
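With the kmem flag now living in page->memcg_data (MEMCG_DATA_KMEM), the separate PG_kmemcg page type above becomes redundant and is removed, which is why PG_table and PG_guard are renumbered. Conceptually the uncharge check on the free path changes like this (the "after" line matches the mm/page_alloc.c hunk further down; shown here only for contrast):

	/* before: kmem charge tracked via the PG_kmemcg page type */
	if (memcg_kmem_enabled() && PageKmemcg(page))
		__memcg_kmem_uncharge_page(page, order);

	/* after: tracked via the MEMCG_DATA_KMEM bit in page->memcg_data */
	if (memcg_kmem_enabled() && PageMemcgKmem(page))
		__memcg_kmem_uncharge_page(page, order);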
diff --combined mm/huge_memory.c
index e425bbf6711a,3c4a8fc9102f..1efe2b5ad59a
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@@ -484,7 -484,7 +484,7 @@@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, stru
#ifdef CONFIG_MEMCG
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
- struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(compound_head(page));
struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
if (memcg)
@@@ -2359,6 -2359,27 +2359,27 @@@ static void remap_page(struct page *pag
}
}
+ static void lru_add_page_tail(struct page *head, struct page *tail,
+ struct lruvec *lruvec, struct list_head *list)
+ {
+ VM_BUG_ON_PAGE(!PageHead(head), head);
+ VM_BUG_ON_PAGE(PageCompound(tail), head);
+ VM_BUG_ON_PAGE(PageLRU(tail), head);
+ lockdep_assert_held(&lruvec->lru_lock);
+
+ if (list) {
+ /* page reclaim is reclaiming a huge page */
+ VM_WARN_ON(PageLRU(head));
+ get_page(tail);
+ list_add_tail(&tail->lru, list);
+ } else {
+ /* head is still on lru (and we have it frozen) */
+ VM_WARN_ON(!PageLRU(head));
+ SetPageLRU(tail);
+ list_add_tail(&tail->lru, &head->lru);
+ }
+ }
+
static void __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
@@@ -2425,18 -2446,15 +2446,15 @@@
}
static void __split_huge_page(struct page *page, struct list_head *list,
- pgoff_t end, unsigned long flags)
+ pgoff_t end)
{
struct page *head = compound_head(page);
- pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
unsigned int nr = thp_nr_pages(head);
int i;
- lruvec = mem_cgroup_page_lruvec(head, pgdat);
-
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
@@@ -2448,6 -2466,9 +2466,9 @@@
xa_lock(&swap_cache->i_pages);
}
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ lruvec = lock_page_lruvec(head);
+
for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
@@@ -2467,6 -2488,8 +2488,8 @@@
}
ClearPageCompound(head);
+ unlock_page_lruvec(lruvec);
+ /* Caller disabled irqs, so they are still disabled here */
split_page_owner(head, nr);
@@@ -2484,8 -2507,7 +2507,7 @@@
page_ref_add(head, 2);
xa_unlock(&head->mapping->i_pages);
}
-
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ local_irq_enable();
remap_page(head, nr);
@@@ -2631,12 -2653,10 +2653,10 @@@ bool can_split_huge_page(struct page *p
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
- unsigned long flags;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@@ -2697,9 -2717,8 +2717,8 @@@
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irqsave(&pgdata->lru_lock, flags);
-
+ /* block interrupt reentry in xa_lock and spinlock */
+ local_irq_disable();
if (mapping) {
XA_STATE(xas, &mapping->i_pages, page_index(head));
@@@ -2729,7 -2748,7 +2748,7 @@@
__dec_lruvec_page_state(head, NR_FILE_THPS);
}
- __split_huge_page(page, list, end, flags);
+ __split_huge_page(page, list, end);
ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
@@@ -2743,7 -2762,7 +2762,7 @@@
spin_unlock(&ds_queue->split_queue_lock);
fail: if (mapping)
xa_unlock(&mapping->i_pages);
- spin_unlock_irqrestore(&pgdata->lru_lock, flags);
+ local_irq_enable();
remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
}
@@@ -2778,7 -2797,7 +2797,7 @@@ void deferred_split_huge_page(struct pa
{
struct deferred_split *ds_queue = get_deferred_split_queue(page);
#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(compound_head(page));
#endif
unsigned long flags;
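The net effect of the huge_memory.c changes above is that a THP split no longer holds the node-wide lru_lock for its whole duration: interrupts are disabled around the critical section, and the per-memcg lruvec lock is taken only while the tail pages are spliced onto the LRU. A simplified outline of the resulting flow (not verbatim code):

	local_irq_disable();			/* replaces spin_lock_irqsave(&pgdat->lru_lock, flags) */
	if (mapping)
		xa_lock(&mapping->i_pages);	/* page cache / swap cache tree */

	spin_lock(&ds_queue->split_queue_lock);
	/* freeze the refcount, unqueue the page from the deferred-split list */
	spin_unlock(&ds_queue->split_queue_lock);

	/* __split_huge_page(): */
	lruvec = lock_page_lruvec(head);	/* per-memcg lru lock, irqs already off */
	/* __split_huge_page_tail() + lru_add_page_tail() for each tail page */
	unlock_page_lruvec(lruvec);

	/* unfreeze tail refcounts, drop the xarray lock */
	local_irq_enable();
	remap_page(head, nr);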
diff --combined mm/memcontrol.c
index 9c5b14fe360e,2f7824d0c897..e3c7ca7dc174
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -20,6 -20,9 +20,9 @@@
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
+ * Per memcg lru locking
+ * Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/
#include <linux/page_counter.h>
@@@ -533,7 -536,7 +536,7 @@@ struct cgroup_subsys_state *mem_cgroup_
{
struct mem_cgroup *memcg;
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
@@@ -560,7 -563,16 +563,7 @@@ ino_t page_cgroup_ino(struct page *page
unsigned long ino = 0;
rcu_read_lock();
- memcg = page->mem_cgroup;
-
- /*
- * The lowest bit set means that memcg isn't a valid
- * memcg pointer, but a obj_cgroups pointer.
- * In this case the page is shared and doesn't belong
- * to any specific memory cgroup.
- */
- if ((unsigned long) memcg & 0x1UL)
- memcg = NULL;
+ memcg = page_memcg_check(page);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
@@@ -848,17 -860,16 +851,17 @@@ void __mod_lruvec_page_state(struct pag
int val)
{
struct page *head = compound_head(page); /* rmap on tail pages */
+ struct mem_cgroup *memcg = page_memcg(head);
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
/* Untracked pages have no memcg, no lruvec. Update only the node */
- if (!head->mem_cgroup) {
+ if (!memcg) {
__mod_node_page_state(pgdat, idx, val);
return;
}
- lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
EXPORT_SYMBOL(__mod_lruvec_page_state);
@@@ -1049,7 -1060,7 +1052,7 @@@ EXPORT_SYMBOL(get_mem_cgroup_from_mm)
*/
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(page);
if (mem_cgroup_disabled())
return NULL;
@@@ -1322,6 -1333,23 +1325,23 @@@ int mem_cgroup_scan_tasks(struct mem_cg
return ret;
}
+ #ifdef CONFIG_DEBUG_VM
+ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+ {
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ memcg = page_memcg(page);
+
+ if (!memcg)
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+ else
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
+ }
+ #endif
+
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
@@@ -1341,7 -1369,7 +1361,7 @@@ struct lruvec *mem_cgroup_page_lruvec(s
goto out;
}
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
/*
* Swapcache readahead pages are added to the LRU - and
* possibly migrated - before they are charged.
@@@ -1362,6 -1390,60 +1382,60 @@@ out
return lruvec;
}
+ /**
+ * lock_page_lruvec - lock and return lruvec for a given page.
+ * @page: the page
+ *
+ * These functions should be used under one of the following conditions:
+ * PageLRU is cleared or not yet set,
+ * or page->_refcount is zero,
+ * or the page is locked.
+ */
+ struct lruvec *lock_page_lruvec(struct page *page)
+ {
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+ }
+
+ struct lruvec *lock_page_lruvec_irq(struct page *page)
+ {
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+ }
+
+ struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+ {
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irqsave(&lruvec->lru_lock, *flags);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+ }
+
/**
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
@@@ -2106,7 -2188,7 +2180,7 @@@ void mem_cgroup_print_oom_group(struct
}
/**
- * lock_page_memcg - lock a page->mem_cgroup binding
+ * lock_page_memcg - lock a page and memcg binding
* @page: the page
*
* This function protects unlocked LRU pages from being moved to
@@@ -2138,15 -2220,21 +2212,21 @@@ struct mem_cgroup *lock_page_memcg(stru
if (mem_cgroup_disabled())
return NULL;
again:
- memcg = head->mem_cgroup;
+ memcg = page_memcg(head);
if (unlikely(!memcg))
return NULL;
+ #ifdef CONFIG_PROVE_LOCKING
+ local_irq_save(flags);
+ might_lock(&memcg->move_lock);
+ local_irq_restore(flags);
+ #endif
+
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != head->mem_cgroup) {
+ if (memcg != page_memcg(head)) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@@@ -2184,14 -2272,14 +2264,14 @@@ void __unlock_page_memcg(struct mem_cgr
}
/**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * unlock_page_memcg - unlock a page and memcg binding
* @page: the page
*/
void unlock_page_memcg(struct page *page)
{
struct page *head = compound_head(page);
- __unlock_page_memcg(head->mem_cgroup);
+ __unlock_page_memcg(page_memcg(head));
}
EXPORT_SYMBOL(unlock_page_memcg);
@@@ -2881,7 -2969,7 +2961,7 @@@ static void cancel_charge(struct mem_cg
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
- VM_BUG_ON_PAGE(page->mem_cgroup, page);
+ VM_BUG_ON_PAGE(page_memcg(page), page);
/*
* Any of the following ensures page's memcg stability:
*
@@@ -2890,7 -2978,7 +2970,7 @@@
* - lock_page_memcg()
* - exclusive reference
*/
- page->mem_cgroup = memcg;
+ page->memcg_data = (unsigned long)memcg;
}
#ifdef CONFIG_MEMCG_KMEM
@@@ -2905,7 -2993,8 +2985,7 @@@ int memcg_alloc_page_obj_cgroups(struc
if (!vec)
return -ENOMEM;
- if (cmpxchg(&page->obj_cgroups, NULL,
- (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ if (!set_page_objcgs(page, vec))
kfree(vec);
else
kmemleak_not_leak(vec);
@@@ -2916,12 -3005,6 +2996,12 @@@
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
*
+ * A passed kernel object can be a slab object or a generic kernel page, so
+ * different mechanisms for getting the memory cgroup pointer should be used.
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
* The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
* cgroup_mutex, etc.
*/
@@@ -2934,31 -3017,36 +3014,31 @@@ struct mem_cgroup *mem_cgroup_from_obj(
page = virt_to_head_page(p);
- /*
- * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
- * or a pointer to obj_cgroup vector. In the latter case the lowest
- * bit of the pointer is set.
- * The page->mem_cgroup pointer can be asynchronously changed
- * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
- * from a valid memcg pointer to objcg vector or back.
- */
- if (!page->mem_cgroup)
- return NULL;
-
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* the page->obj_cgroups.
*/
- if (page_has_obj_cgroups(page)) {
+ if (page_objcgs_check(page)) {
struct obj_cgroup *objcg;
unsigned int off;
off = obj_to_index(page->slab_cache, page, p);
- objcg = page_obj_cgroups(page)[off];
+ objcg = page_objcgs(page)[off];
if (objcg)
return obj_cgroup_memcg(objcg);
return NULL;
}
- /* All other pages use page->mem_cgroup */
- return page->mem_cgroup;
+ /*
+ * page_memcg_check() is used here, because the page_objcgs_check()
+ * check above could fail: the object cgroups vector might not have
+ * been set at that moment, but it can be set concurrently.
+ * page_memcg_check(page) will guarantee that a proper memory
+ * cgroup pointer or NULL will be returned.
+ */
+ return page_memcg_check(page);
}
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
@@@ -3097,8 -3185,8 +3177,8 @@@ int __memcg_kmem_charge_page(struct pag
if (memcg && !mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
- page->mem_cgroup = memcg;
- __SetPageKmemcg(page);
+ page->memcg_data = (unsigned long)memcg |
+ MEMCG_DATA_KMEM;
return 0;
}
css_put(&memcg->css);
@@@ -3113,7 -3201,7 +3193,7 @@@
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(page);
unsigned int nr_pages = 1 << order;
if (!memcg)
@@@ -3121,8 -3209,12 +3201,8 @@@
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge(memcg, nr_pages);
- page->mem_cgroup = NULL;
+ page->memcg_data = 0;
css_put(&memcg->css);
-
- /* slab pages do not have PageKmemcg flag set */
- if (PageKmemcg(page))
- __ClearPageKmemcg(page);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@@ -3263,14 -3355,12 +3343,12 @@@ void obj_cgroup_uncharge(struct obj_cgr
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
/*
- * Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * Because page_memcg(head) is not set on compound tails, set it now.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
- struct mem_cgroup *memcg = head->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(head);
int i;
if (mem_cgroup_disabled())
@@@ -3278,7 -3368,7 +3356,7 @@@
for (i = 1; i < HPAGE_PMD_NR; i++) {
css_get(&memcg->css);
- head[i].mem_cgroup = memcg;
+ head[i].memcg_data = (unsigned long)memcg;
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@@ -4619,7 -4709,7 +4697,7 @@@ void mem_cgroup_wb_stats(struct bdi_wri
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
struct bdi_writeback *wb)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(page);
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
@@@ -5580,14 -5670,14 +5658,14 @@@ static int mem_cgroup_move_account(stru
/*
* Prevent mem_cgroup_migrate() from looking at
- * page->mem_cgroup of its source page while we change it.
+ * page's memory cgroup of its source page while we change it.
*/
ret = -EBUSY;
if (!trylock_page(page))
goto out;
ret = -EINVAL;
- if (page->mem_cgroup != from)
+ if (page_memcg(page) != from)
goto out_unlock;
pgdat = page_pgdat(page);
@@@ -5642,13 -5732,13 +5720,13 @@@
/*
* All state has been migrated, let's switch to the new memcg.
*
- * It is safe to change page->mem_cgroup here because the page
+ * It is safe to change page's memcg here because the page
* is referenced, charged, isolated, and locked: we can't race
* with (un)charging, migration, LRU putback, or anything else
- * that would rely on a stable page->mem_cgroup.
+ * that would rely on a stable page's memory cgroup.
*
* Note that lock_page_memcg is a memcg lock, not a page lock,
- * to save space. As soon as we switch page->mem_cgroup to a
+ * to save space. As soon as we switch page's memory cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
*/
@@@ -5657,7 -5747,7 +5735,7 @@@
css_get(&to->css);
css_put(&from->css);
- page->mem_cgroup = to;
+ page->memcg_data = (unsigned long)to;
__unlock_page_memcg(from);
@@@ -5723,7 -5813,7 +5801,7 @@@ static enum mc_target_type get_mctgt_ty
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (page->mem_cgroup == mc.from) {
+ if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
@@@ -5767,7 -5857,7 +5845,7 @@@ static enum mc_target_type get_mctgt_ty
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!(mc.flags & MOVE_ANON))
return ret;
- if (page->mem_cgroup == mc.from) {
+ if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
@@@ -6694,12 -6784,12 +6772,12 @@@ int mem_cgroup_charge(struct page *page
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. page->mem_cgroup is protected
- * by the page lock, which serializes swap cache removal, which
- * in turn serializes uncharging.
+ * already charged pages, too. page and memcg binding is
+ * protected by the page lock, which serializes swap cache
+ * removal, which in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (compound_head(page)->mem_cgroup)
+ if (page_memcg(compound_head(page)))
goto out;
id = lookup_swap_cgroup_id(ent);
@@@ -6783,21 -6873,21 +6861,21 @@@ static void uncharge_page(struct page *
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (!page->mem_cgroup)
+ if (!page_memcg(page))
return;
/*
* Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point, we have fully
+ * page_memcg(page) at this point, we have fully
* exclusive access to the page.
*/
- if (ug->memcg != page->mem_cgroup) {
+ if (ug->memcg != page_memcg(page)) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = page->mem_cgroup;
+ ug->memcg = page_memcg(page);
/* pairs with css_put in uncharge_batch */
css_get(&ug->memcg->css);
@@@ -6806,13 -6896,15 +6884,13 @@@
nr_pages = compound_nr(page);
ug->nr_pages += nr_pages;
- if (!PageKmemcg(page)) {
- ug->pgpgout++;
- } else {
+ if (PageMemcgKmem(page))
ug->nr_kmem += nr_pages;
- __ClearPageKmemcg(page);
- }
+ else
+ ug->pgpgout++;
ug->dummy_page = page;
- page->mem_cgroup = NULL;
+ page->memcg_data = 0;
css_put(&ug->memcg->css);
}
@@@ -6855,7 -6947,7 +6933,7 @@@ void mem_cgroup_uncharge(struct page *p
return;
/* Don't touch page->lru of any random page, pre-check: */
- if (!page->mem_cgroup)
+ if (!page_memcg(page))
return;
uncharge_gather_clear(&ug);
@@@ -6905,10 -6997,10 +6983,10 @@@ void mem_cgroup_migrate(struct page *ol
return;
/* Page cache replacement: new page already charged? */
- if (newpage->mem_cgroup)
+ if (page_memcg(newpage))
return;
- memcg = oldpage->mem_cgroup;
+ memcg = page_memcg(oldpage);
if (!memcg)
return;
@@@ -7103,7 -7195,7 +7181,7 @@@ void mem_cgroup_swapout(struct page *pa
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
/* Readahead page, never charged */
if (!memcg)
@@@ -7124,7 -7216,7 +7202,7 @@@
VM_BUG_ON_PAGE(oldid, page);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- page->mem_cgroup = NULL;
+ page->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
@@@ -7167,7 -7259,7 +7245,7 @@@ int mem_cgroup_try_charge_swap(struct p
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
/* Readahead page, never charged */
if (!memcg)
@@@ -7248,7 -7340,7 +7326,7 @@@ bool mem_cgroup_swap_full(struct page *
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
if (!memcg)
return false;
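Throughout memcontrol.c the direct page->mem_cgroup loads above become page_memcg()/page_memcg_check() calls, and the header comments spell out what keeps the page/memcg binding stable: the page lock, LRU isolation, lock_page_memcg(), or an exclusive reference. For a caller holding none of those, the pattern is roughly (an illustrative sketch; the event update is just an example, not something this series adds):

	struct mem_cgroup *memcg;

	memcg = lock_page_memcg(page);		/* pins page_memcg(page) */
	if (memcg)
		count_memcg_events(memcg, PGACTIVATE, 1);	/* example update */
	unlock_page_memcg(page);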
diff --combined mm/page_alloc.c
index db37bf231e8b,b1cc2b7483a1..3beeb8d722f3
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -1101,7 -1101,7 +1101,7 @@@ static inline bool page_expected_state(
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page->mem_cgroup |
+ (unsigned long)page_memcg(page) |
#endif
(page->flags & check_flags)))
return false;
@@@ -1126,7 -1126,7 +1126,7 @@@ static const char *page_bad_reason(stru
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page->mem_cgroup))
+ if (unlikely(page_memcg(page)))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
@@@ -1223,7 -1223,7 +1223,7 @@@ static __always_inline bool free_pages_
* Do not let hwpoison pages hit pcplists/buddy
* Untie memcg state and reset page's owner
*/
- if (memcg_kmem_enabled() && PageKmemcg(page))
+ if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
return false;
@@@ -1253,7 -1253,7 +1253,7 @@@
}
if (PageMappingFlags(page))
page->mapping = NULL;
- if (memcg_kmem_enabled() && PageKmemcg(page))
+ if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free)
bad += check_free_page(page);
@@@ -6870,7 -6870,6 +6870,6 @@@ static void __meminit pgdat_init_intern
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat);
- spin_lock_init(&pgdat->lru_lock);
lruvec_init(&pgdat->__lruvec);
}
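pgdat_init_internals() above no longer initializes pgdat->lru_lock; the lock now lives in struct lruvec and is set up in lruvec_init() (mm/mmzone.c, not part of this excerpt), roughly:

	void lruvec_init(struct lruvec *lruvec)
	{
		enum lru_list lru;

		memset(lruvec, 0, sizeof(struct lruvec));
		spin_lock_init(&lruvec->lru_lock);	/* each lruvec carries its own lru lock */

		for_each_lru(lru)
			INIT_LIST_HEAD(&lruvec->lists[lru]);
	}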
diff --combined mm/workingset.c
index 9abe51d51aa7,94b512538d5a..10e96de945b3
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@@ -257,7 -257,7 +257,7 @@@ void *workingset_eviction(struct page *
struct lruvec *lruvec;
int memcgid;
- /* Page is fully exclusive and pins page->mem_cgroup */
+ /* Page is fully exclusive and pins page's memory cgroup pointer */
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@@ -381,9 -381,7 +381,7 @@@ void workingset_refault(struct page *pa
if (workingset) {
SetPageWorkingset(page);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
- spin_lock_irq(&page_pgdat(page)->lru_lock);
lru_note_cost_page(page);
- spin_unlock_irq(&page_pgdat(page)->lru_lock);
inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
}
out:
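The explicit lru_lock around lru_note_cost_page() can be dropped above because "mm/lru: move lock into lru_note_cost" makes lru_note_cost() take the (now per-lruvec) lock itself while walking up the memcg hierarchy (mm/swap.c, not shown in this excerpt); a rough sketch:

	void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
	{
		do {
			spin_lock_irq(&lruvec->lru_lock);
			if (file)
				lruvec->file_cost += nr_pages;
			else
				lruvec->anon_cost += nr_pages;
			/* ...decay file_cost/anon_cost when they grow too large... */
			spin_unlock_irq(&lruvec->lru_lock);
		} while ((lruvec = parent_lruvec(lruvec)));
	}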