From 8b038d52a6cb7e735aaa5f100cecb70cba382981 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 13 Jul 2022 12:19:58 +0800 Subject: [PATCH 001/132] mm/hwpoison: do not lock page again when me_huge_page() successfully recovers mainline inclusion from mainline-v5.13 commit ea6d0630100b285f059d0a8d8e86f38a46407536 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit ea6d0630100b mm/hwpoison: do not lock page again when me_huge_page() successfully recovers. Backport for MCA recovery enhancing & bug fix. -------------------------------- Currently me_huge_page() temporary unlocks page to perform some actions then locks it again later. My testcase (which calls hard-offline on some tail page in a hugetlb, then accesses the address of the hugetlb range) showed that page allocation code detects this page lock on buddy page and printed out "BUG: Bad page state" message. check_new_page_bad() does not consider a page with __PG_HWPOISON as bad page, so this flag works as kind of filter, but this filtering doesn't work in this case because the "bad page" is not the actual hwpoisoned page. So stop locking page again. Actions to be taken depend on the page type of the error, so page unlocking should be done in ->action() callbacks. So let's make it assumed and change all existing callbacks that way. Link: https://lkml.kernel.org/r/20210609072029.74645-1-nao.horiguchi@gmail.com Fixes: commit 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error") Signed-off-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Michal Hocko Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- mm/memory-failure.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0519f20d2b57..74a21042845a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, */ static int me_kernel(struct page *p, unsigned long pfn) { + unlock_page(p); return MF_IGNORED; } @@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + unlock_page(p); return MF_FAILED; } @@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { + int ret; struct address_space *mapping; delete_from_lru_cache(p); @@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * For anonymous pages we're done the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) - return MF_RECOVERED; + if (PageAnon(p)) { + ret = MF_RECOVERED; + goto out; + } /* * Now truncate the page in the page cache. This is really @@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return MF_FAILED; + ret = MF_FAILED; + goto out; } /* @@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. 
*/ - return truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, pfn, mapping); +out: + unlock_page(p); + return ret; } /* @@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { + int ret; + ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!delete_from_lru_cache(p)) - return MF_DELAYED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + unlock_page(p); + return ret; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { + int ret; + delete_from_swap_cache(p); - if (!delete_from_lru_cache(p)) - return MF_RECOVERED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + unlock_page(p); + return ret; } /* @@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, pfn, mapping); + unlock_page(hpage); } else { res = MF_FAILED; unlock_page(hpage); @@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) page_ref_inc(p); res = MF_RECOVERED; } - lock_page(hpage); } return res; @@ -866,6 +877,8 @@ static struct page_state { unsigned long mask; unsigned long res; enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, @@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p, int result; int count; + /* page p should be unlocked after returning from ps->action(). */ result = ps->action(p, pfn); count = page_count(p) - 1; @@ -1246,7 +1260,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - res = identify_page_state(pfn, p, page_flags); + return identify_page_state(pfn, p, page_flags); out: unlock_page(head); return res; @@ -1529,6 +1543,8 @@ int memory_failure(unsigned long pfn, int flags) identify_page_state: res = identify_page_state(pfn, p, page_flags); + mutex_unlock(&mf_mutex); + return res; unlock_page: unlock_page(p); unlock_mutex: -- Gitee From 75a005c5418abcb879bdf3053d0853956cec7e73 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 28 Jun 2021 19:43:14 -0700 Subject: [PATCH 002/132] mm,hwpoison: send SIGBUS with error virutal address mainline inclusion from mainline-v5.14-rc1 commit a3f5d80ea401ac857f2910e28b15f35b2cf902f4 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit a3f5d80ea401 mm,hwpoison: send SIGBUS with error virutal address. Backport for MCA recovery enhancing & bug fix. -------------------------------- Now an action required MCE in already hwpoisoned address surely sends a SIGBUS to current process, but the SIGBUS doesn't convey error virtual address. That's not optimal for hwpoison-aware applications. To fix the issue, make memory_failure() call kill_accessing_process(), that does pagetable walk to find the error virtual address. It could find multiple virtual addresses for the same error page, and it seems hard to tell which virtual address is correct one. But that's rare and sending incorrect virtual address could be better than no address. So let's report the first found virtual address for now. 
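For context, a minimal userspace sketch (not part of this patch) showing how a
hwpoison-aware application can consume the virtual address that is now delivered
in the SIGBUS siginfo; the handler name and the recovery policy are illustrative
assumptions, only the siginfo fields (si_code == BUS_MCEERR_AR, si_addr,
si_addr_lsb) are established Linux ABI:

	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static void sigbus_handler(int sig, siginfo_t *info, void *ucontext)
	{
		if (info->si_code == BUS_MCEERR_AR) {
			/* Error virtual address reported by the kernel */
			fprintf(stderr, "hwpoison at %p, lsb %d\n",
				info->si_addr, info->si_addr_lsb);
			/* Application-specific recovery would go here */
		}
		_exit(1);
	}

	static void install_sigbus_handler(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = sigbus_handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGBUS, &sa, NULL);
	}

Without the error virtual address in siginfo, such a handler cannot tell which
mapping was poisoned; that is the gap this patch closes.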
[naoya.horiguchi@nec.com: fix walk_page_range() return] Link: https://lkml.kernel.org/r/20210603051055.GA244241@hori.linux.bs1.fc.nec.co.jp Link: https://lkml.kernel.org/r/20210521030156.2612074-4-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Cc: Tony Luck Cc: Aili Yao Cc: Oscar Salvador Cc: David Hildenbrand Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Jue Wang Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 13 ++- include/linux/swapops.h | 5 ++ mm/memory-failure.c | 150 ++++++++++++++++++++++++++++++++- 3 files changed, 165 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 0ced8b250090..cdda2198deae 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1271,6 +1271,7 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; p->mce_count = 0; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); @@ -1278,13 +1279,21 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. + */ + if (ret == -EHWPOISON) + return; + if (p->mce_vaddr != (void __user *)-1l) { force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); } else { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 0d429a102d41..708fbeb21dd3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -332,6 +332,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 74a21042845a..0035e8ca4e49 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -554,6 +555,148 @@ void collect_procs(struct page *page, struct list_head *tokill, } EXPORT_SYMBOL_GPL(collect_procs); +struct hwp_walk { + struct to_kill tk; + unsigned long pfn; + int flags; +}; + +static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) +{ + tk->addr = addr; + tk->size_shift = shift; +} + +static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, + unsigned long poisoned_pfn, struct to_kill *tk) +{ + unsigned long pfn = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + } else { + swp_entry_t swp = pte_to_swp_entry(pte); + + if (is_hwpoison_entry(swp)) + pfn = hwpoison_entry_to_pfn(swp); + } + + if (!pfn || pfn != poisoned_pfn) + return 0; + + set_to_kill(tk, addr, shift); + return 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + pmd_t pmd = *pmdp; + unsigned 
long pfn; + unsigned long hwpoison_vaddr; + + if (!pmd_present(pmd)) + return 0; + pfn = pmd_pfn(pmd); + if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { + hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); + set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); + return 1; + } + return 0; +} +#else +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + return 0; +} +#endif + +static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + int ret = 0; + pte_t *ptep; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmdp, walk->vma); + if (ptl) { + ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmdp)) + goto out; + + ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + hwp->pfn, &hwp->tk); + if (ret == 1) + break; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(walk->vma); + + return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); +} +#else +#define hwpoison_hugetlb_range NULL +#endif + +static struct mm_walk_ops hwp_walk_ops = { + .pmd_entry = hwpoison_pte_range, + .hugetlb_entry = hwpoison_hugetlb_range, +}; + +/* + * Sends SIGBUS to the current process with error info. + * + * This function is intended to handle "Action Required" MCEs on already + * hardware poisoned pages. They could happen, for example, when + * memory_failure() failed to unmap the error page at the first call, or + * when multiple local machine checks happened on different CPUs. + * + * MCE handler currently has no easy access to the error virtual address, + * so this function walks page table to find it. The returned virtual address + * is proper in most cases, but it could be wrong when the application + * process has multiple entries mapping the error page. + */ +static int kill_accessing_process(struct task_struct *p, unsigned long pfn, + int flags) +{ + int ret; + struct hwp_walk priv = { + .pfn = pfn, + }; + priv.tk.tsk = p; + + mmap_read_lock(p->mm); + ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, + (void *)&priv); + if (ret == 1 && priv.tk.addr) + kill_proc(&priv.tk, pfn, flags); + mmap_read_unlock(p->mm); + return ret ? 
-EFAULT : -EHWPOISON; +} + static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -1204,7 +1347,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return -EHWPOISON; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, page_to_pfn(head), flags); + return res; } num_poisoned_pages_inc(); @@ -1409,6 +1555,8 @@ int memory_failure(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, pfn, flags); goto unlock_mutex; } -- Gitee From d1b9413b3213dc72557fc4cdb01ee3ab18d716f4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 17 Aug 2021 17:29:41 -0700 Subject: [PATCH 003/132] x86/mce: Change to not send SIGBUS error during copy from user mainline inclusion from mainline-v5.16-rc1 commit a6e3cf70b772541c2388abdb86e5a562cfe18e63 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit a6e3cf70b772 x86/mce: Change to not send SIGBUS error during copy from user. Backport for MCA recovery enhance and bug fix. -------------------------------- Sending a SIGBUS for a copy from user is not the correct semantic. System calls should return -EFAULT (or a short count for write(2)). Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210818002942.1607544-3-tony.luck@intel.com Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 36 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index cdda2198deae..8aeb56f5e577 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1280,7 +1280,7 @@ static void kill_me_maybe(struct callback_head *cb) flags |= MF_MUST_KILL; ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); - if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; @@ -1294,15 +1294,21 @@ static void kill_me_maybe(struct callback_head *cb) if (ret == -EHWPOISON) return; - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + pr_err("Memory error not recovered"); + kill_me_now(cb); } -static void queue_task_work(struct mce *m, char *msg, int kill_current_task) +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; @@ -1312,11 +1318,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task) current->mce_kflags = m->kflags; current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); current->mce_whole_page = whole_page(m); - - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + current->mce_kill_me.func = func; } /* Ten is likely 
overkill. Don't expect more than two faults before task_work() */ @@ -1486,8 +1488,10 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, msg, kill_it); - + if (kill_it) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* * Handle an MCE which has happened in kernel space but from @@ -1504,7 +1508,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_it); + queue_task_work(&m, msg, kill_me_never); } instrumentation_end(); -- Gitee From 4bc1efa1413b9bdfaa10be0f049f19e2a2266253 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 May 2021 00:32:44 -0400 Subject: [PATCH 004/132] generic_perform_write()/iomap_write_actor(): saner logics for short copy mainline inclusion from mainline-v5.13 commit bc1bb416bbb9203e250f5c49aaf1d11b5d9c8adb category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit bc1bb416bbb9 generic_perform_write()/iomap_write_actor(): saner logics for short copy. Backport for MCA recovery enhance and bug fix. -------------------------------- if we run into a short copy and ->write_end() refuses to advance at all, use the amount we'd managed to copy for the next iteration to handle. Signed-off-by: Al Viro Signed-off-by: Youquan Song --- fs/iomap/buffered-io.c | 25 ++++++++++--------------- mm/filemap.c | 24 +++++++++--------------- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3ec494f5d7ee..d5246e277f17 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -771,10 +771,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -791,25 +787,24 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, + status = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made iomap_write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. 
*/ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; length -= copied; diff --git a/mm/filemap.c b/mm/filemap.c index 3958fc3280d8..80c5c418eba8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3510,10 +3510,6 @@ ssize_t generic_perform_write(struct file *file, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -3540,24 +3536,22 @@ ssize_t generic_perform_write(struct file *file, page, fsdata); if (unlikely(status < 0)) break; - copied = status; cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; -- Gitee From a1728f8de83520f93310fc7409c502358ca46fdd Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 17 Aug 2021 17:29:42 -0700 Subject: [PATCH 005/132] x86/mce: Drop copyin special case for #MC mainline inclusion from mainline-v5.16-rc1 commit 690658471b5f28d306e6492c4585d748cb5304e8 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 690658471b5f x86/mce: Drop copyin special case for #MC. Backport for MCA recovery enhancing & bug fix. -------------------------------- Fixes to the iterator code to handle faults that are not on page boundaries mean that the special case for machine check during copy from user is no longer needed. For a full list of those fixes, see the output of: git log --oneline v5.14 ^v5.13 -- lib/iov_iter.c Intel-SIG: commit 690658471b5f x86/mce: Drop copyin special case for #MC. backport for MCA recovery enhance and bug fix. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210818002942.1607544-4-tony.luck@intel.com Signed-off-by: Youquan Song --- arch/x86/lib/copy_user_64.S | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 77b9b2a3b5c8..e0e71ca023ce 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -234,24 +234,11 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) movl %edx,%ecx - cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ - je 3f 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret - /* - * Return zero to pretend that this copy succeeded. This - * is counter-intuitive, but needed to prevent the code - * in lib/iov_iter.c from retrying and running back into - * the poison cache line again. The machine check handler - * will ensure that a SIGBUS is sent to the task. 
- */ -3: xorl %eax,%eax - ASM_CLAC - ret - _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) -- Gitee From 3ea63cb74c8cf3446e0caf8c32682ae92d31a58f Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 23 Dec 2021 12:07:01 -0800 Subject: [PATCH 006/132] x86/mce: Reduce number of machine checks taken during recovery mainline inclusion from mainline-v5.17-rc1 commit 3376136300a00df9a864b88fa969177d6c3be8e5 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 3376136300a0 x86/mce: Reduce number of machine checks taken during recovery. Backport for MCA recovery enhancing & bug fix. -------------------------------- When any of the copy functions in arch/x86/lib/copy_user_64.S take a fault, the fixup code copies the remaining byte count from %ecx to %edx and unconditionally jumps to .Lcopy_user_handle_tail to continue the copy in case any more bytes can be copied. If the fault was #PF this may copy more bytes (because the page fault handler might have fixed the fault). But when the fault is a machine check the original copy code will have copied all the way to the poisoned cache line. So .Lcopy_user_handle_tail will just take another machine check for no good reason. Every code path to .Lcopy_user_handle_tail comes from an exception fixup path, so add a check there to check the trap type (in %eax) and simply return the count of remaining bytes if the trap was a machine check. Doing this reduces the number of machine checks taken during synthetic tests from four to three. As well as reducing the number of machine checks, this also allows Skylake generation Xeons to recover some cases that currently fail. The is because REP; MOVSB is only recoverable when source and destination are well aligned and the byte count is large. That useless call to .Lcopy_user_handle_tail may violate one or more of these conditions and generate a fatal machine check. [ Tony: Add more details to commit message. ] [ bp: Fixup comment. Also, another tip patchset which is adding straight-line speculation mitigation changes the "ret" instruction to an all-caps macro "RET". But, since gas is case-insensitive, use "RET" in the newly added asm block already in order to simplify tip branch merging on its way upstream. ] Signed-off-by: Youquan Song Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/YcTW5dh8yTGucDd+@agluck-desk2.amr.corp.intel.com Signed-off-by: Youquan Song --- arch/x86/lib/copy_user_64.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index e0e71ca023ce..403ba3eb4b84 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -225,6 +225,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Don't try to copy the tail if machine check happened * * Input: + * eax trap number written by ex_handler_copy() * rdi destination * rsi source * rdx count @@ -233,12 +234,20 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * eax uncopied bytes or 0 if successful. 
*/ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) + cmp $X86_TRAP_MC,%eax + je 3f + movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret +3: + movl %edx,%eax + ASM_CLAC + RET + _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) -- Gitee From 297f05904ec5352fa767b706f70b7bf9d88ab986 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sun, 6 Feb 2022 15:10:14 +0800 Subject: [PATCH 007/132] mm/hwpoison: fix error page recovered but reported "not recovered" mainline inclusion from mainline-v5.18-rc1 commit 046545a661af2beec21de7b90ca0e35f05088a81 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 046545a661af mm/hwpoison: fix error page recovered but reported "not recovered". Backport for MCA recovery enhancing & bug fix. -------------------------------- When an uncorrected memory error is consumed there is a race between the CMCI from the memory controller reporting an uncorrected error with a UCNA signature, and the core reporting and SRAR signature machine check when the data is about to be consumed. If the CMCI wins that race, the page is marked poisoned when uc_decode_notifier() calls memory_failure() and the machine check processing code finds the page already poisoned. It calls kill_accessing_process() to make sure a SIGBUS is sent. But returns the wrong error code. Console log looks like this: mce: Uncorrected hardware memory error in user-access at 3710b3400 Memory failure: 0x3710b3: recovery action for dirty LRU page: Recovered Memory failure: 0x3710b3: already hardware poisoned Memory failure: 0x3710b3: Sending SIGBUS to einj_mem_uc:361438 due to hardware memory corruption mce: Memory error not recovered kill_accessing_process() is supposed to return -EHWPOISON to notify that SIGBUS is already set to the process and kill_me_maybe() doesn't have to send it again. But current code simply fails to do this, so fix it to make sure to work as intended. This change avoids the noise message "Memory error not recovered" and skips duplicate SIGBUSs. [tony.luck@intel.com: reword some parts of commit message] Link: https://lkml.kernel.org/r/20220113231117.1021405-1-naoya.horiguchi@linux.dev Fixes: a3f5d80ea401 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Naoya Horiguchi Reported-by: Youquan Song Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- mm/memory-failure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0035e8ca4e49..7c7769ff0f1d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -693,8 +693,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { -- Gitee From 928722d827c692d23c11f40980c7edc536a58cd9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:52 +0800 Subject: [PATCH 008/132] EDAC: Add DDR5 new memory type mainline inclusion from mainline-v5.13 commit bc1c99a5971aa7571e8b9731c28fa32abe12cab8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit bc1c99a5971a EDAC: Add DDR5 new memory type. Backport for EDAC enhancing & bug fix. 
-------------------------------- Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for DDR5 new memory type. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Signed-off-by: Youquan Song --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index f4eb071327be..34514c638f19 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -161,6 +161,7 @@ const char * const edac_mem_types[] = { [MEM_DDR4] = "Unbuffered-DDR4", [MEM_RDDR4] = "Registered-DDR4", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", + [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index 15e8f3d8a895..6c4565cc6273 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -179,6 +179,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_RDDR4: Registered DDR4 RAM * This is a variant of the DDR4 memories. * @MEM_LRDDR4: Load-Reduced DDR4 memory. + * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM */ enum mem_type { @@ -203,6 +204,7 @@ enum mem_type { MEM_DDR4, MEM_RDDR4, MEM_LRDDR4, + MEM_DDR5, MEM_NVDIMM, }; @@ -226,6 +228,7 @@ enum mem_type { #define MEM_FLAG_DDR4 BIT(MEM_DDR4) #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) /** -- Gitee From 432fad27ed99567be07b4ebfbb7aa50b8856c83e Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:53 +0800 Subject: [PATCH 009/132] EDAC/i10nm: Add Intel Sapphire Rapids server support mainline inclusion from mainline-v5.11-rc1 commit 479f58dda25bb46daeb937f124718e8b4aea6781 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit 479f58dda25b EDAC/i10nm: Add Intel Sapphire Rapids server support. Backport for add EDAC SPR suppporting. -------------------------------- The Sapphire Rapids CPU model shares the same memory controller architecture with Ice Lake server. There are some configurations different from Ice Lake server as below: - The device ID for configuration agent. - The size for per channel memory-mapped I/O. - The DDR5 memory support. So add the above configurations and the Sapphire Rapids CPU model ID for EDAC support. 
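As an illustration of how the per-channel MMIO size difference is absorbed (a
simplified sketch of the address arithmetic behind the reworked I10NM_GET_*
macros in the diff below; the helper name is hypothetical):

	/*
	 * Byte offset of the DIMMMTR register for channel @chan, DIMM @dimm,
	 * relative to the memory controller MMIO base. 0x2080c is the DIMMMTR
	 * base inside a channel window.
	 */
	static unsigned int dimmmtr_offset(unsigned int chan_mmio_sz,
					   int chan, int dimm)
	{
		return 0x2080c + chan * chan_mmio_sz + dimm * 4;
	}

With chan_mmio_sz set to 0x4000 for Ice Lake server and 0x8000 for Sapphire
Rapids via the per-model res_config, the same driver code addresses both
register layouts.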
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 34 +++++++++++++++++++++++++--------- drivers/edac/skx_base.c | 6 +++--- drivers/edac/skx_common.c | 23 ++++++++++++++++++----- drivers/edac/skx_common.h | 16 ++++++++++++---- 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 3a7362f968c9..4f7f9970a901 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.3" +#define I10NM_REVISION "v0.0.4" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -25,11 +25,13 @@ #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * 0x4000 + (j) * 4) + readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * 0x4000) + readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * 0x4000) + readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_AMAP(m, i) \ + readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) @@ -129,12 +131,22 @@ static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xcc, + .ddr_chan_mmio_sz = 0x4000, }; static struct res_config i10nm_cfg1 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x4000, +}; + +static struct res_config spr_cfg = { + .type = SPR, + .decs_did = 0x3252, + .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x8000, + .support_ddr5 = true, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -143,6 +155,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); @@ -157,12 +170,13 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int i10nm_get_dimm_config(struct mem_ctl_info *mci) +static int i10nm_get_dimm_config(struct mem_ctl_info *mci, + struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; struct skx_imc *imc = pvt->imc; + u32 mtr, amap, mcddrtcfg; struct dimm_info *dimm; - u32 mtr, mcddrtcfg; int i, j, ndimms; for (i = 0; i < I10NM_NUM_CHANNELS; i++) { @@ -171,6 +185,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) ndimms = 0; mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); + amap = I10NM_GET_AMAP(imc, i); for (j = 0; j < I10NM_NUM_DIMMS; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); @@ -178,8 +193,8 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) mtr, mcddrtcfg, imc->mc, i, j); if (IS_DIMM_PRESENT(mtr)) - ndimms += skx_get_dimm_info(mtr, 0, 0, dimm, - imc, i, j); + ndimms += skx_get_dimm_info(mtr, 0, amap, dimm, + imc, i, j, cfg); else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) ndimms += skx_get_nvdimm_info(dimm, imc, i, j, 
EDAC_MOD_STR); @@ -306,10 +321,11 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, - i10nm_get_dimm_config); + i10nm_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index f887e3166651..4dbd46575bfb 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -174,7 +174,7 @@ static bool skx_check_ecc(u32 mcmtr) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int skx_get_dimm_config(struct mem_ctl_info *mci) +static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; u32 mtr, mcmtr, amap, mcddrtcfg; @@ -195,7 +195,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); if (IS_DIMM_PRESENT(mtr)) { - ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j); + ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j, cfg); } else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) { ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -705,7 +705,7 @@ static int __init skx_init(void) d->imc[i].node_id = node_id; rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, - skx_get_dimm_config); + skx_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 2b4ce8e5ac2f..81c3e2ec6f56 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -304,15 +304,25 @@ static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno) + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg) { - int banks = 16, ranks, rows, cols, npages; + int banks, ranks, rows, cols, npages; + enum mem_type mtype; u64 size; ranks = numrank(mtr); rows = numrow(mtr); cols = numcol(mtr); + if (cfg->support_ddr5 && (amap & 0x8)) { + banks = 32; + mtype = MEM_DDR5; + } else { + banks = 16; + mtype = MEM_DDR4; + } + /* * Compute size in 8-byte (2^3) words, then shift to MiB (2^20) */ @@ -332,7 +342,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->nr_pages = npages; dimm->grain = 32; dimm->dtype = get_width(mtr); - dimm->mtype = MEM_DDR4; + dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", imc->src_id, imc->lmc, chan, dimmno); @@ -390,7 +400,8 @@ int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config) + get_dimm_config_f get_dimm_config, + struct res_config *cfg) { struct mem_ctl_info *mci; struct edac_mc_layer layers[2]; @@ -425,13 +436,15 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, } mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM; + if (cfg->support_ddr5) + mci->mtype_cap |= MEM_FLAG_DDR5; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = mod_str; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; - rc = get_dimm_config(mci); + rc = 
get_dimm_config(mci, cfg); if (rc < 0) goto fail; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 78f8c1de0b71..bf56bebff138 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -59,6 +59,7 @@ struct skx_dev { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ + int chan_mmio_sz; /* for i10nm CPU */ u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; @@ -82,7 +83,8 @@ struct skx_pvt { enum type { SKX, - I10NM + I10NM, + SPR }; enum { @@ -118,9 +120,13 @@ struct res_config { unsigned int decs_did; /* Default bus number configuration register offset */ int busno_cfg_offset; + /* Per DDR channel memory-mapped I/O size */ + int ddr_chan_mmio_sz; + bool support_ddr5; }; -typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci); +typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, + struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); @@ -136,14 +142,16 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list); int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm); int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno); + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg); int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, const char *mod_str); int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config); + get_dimm_config_f get_dimm_config, + struct res_config *cfg); int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data); -- Gitee From 3dd707aa7c8e5d5cda72390ee0d22508944bdb77 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Wed, 18 Aug 2021 10:57:01 -0700 Subject: [PATCH 010/132] EDAC/i10nm: Retrieve and print retry_rd_err_log registers mainline inclusion from mainline-v5.15-rc1 commit cf4e6d52f58399c777276172ec250502e19d5e63 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit cf4e6d52f583 EDAC/i10nm: Retrieve and print retry_rd_err_log registers. Backport for EDAC retry_rd_err_log for ICX/SPR support. -------------------------------- Retrieve and print retry_rd_err_log registers like the earlier change: commit e80634a75aba ("EDAC, skx: Retrieve and print retry_rd_err_log registers") This is a little trickier than on Skylake because of potential interference with BIOS use of the same registers. The default behavior is to ignore these registers. A module parameter retry_rd_err_log(default=0) controls the mode of operation: - 0=off : Default. - 1=bios : Linux doesn't reset any control bits, but just reports values. This is "no harm" mode, but it may miss reporting some data. - 2=linux: Linux tries to take control and resets mode bits, clears valid/UC bits after reading. This should be more reliable (especially if BIOS interference is reduced by disabling eMCA reporting mode in BIOS setup). 
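Usage note (not part of the patch): because the parameter is registered with
module_param(retry_rd_err_log, int, 0444) it is read-only at runtime, so the
mode has to be chosen at load time, e.g. "modprobe i10nm_edac retry_rd_err_log=2",
or on the kernel command line as "i10nm_edac.retry_rd_err_log=2" when the
driver is built in.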
Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Youquan Song Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210818175701.1611513-3-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 146 ++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_base.c | 3 +- drivers/edac/skx_common.c | 4 +- drivers/edac/skx_common.h | 7 +- 4 files changed, 157 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 4f7f9970a901..7798de1e2aed 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -32,14 +32,137 @@ readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_REG32(m, i, offset) \ + readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_GET_REG64(m, i, offset) \ + readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_SET_REG32(m, i, offset, v) \ + writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define RETRY_RD_ERR_LOG_UC BIT(1) +#define RETRY_RD_ERR_LOG_NOOVER BIT(14) +#define RETRY_RD_ERR_LOG_EN BIT(15) +#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) +#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) + static struct list_head *i10nm_edac_list; +static struct res_config *res_cfg; +static int retry_rd_err_log; + +static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; + +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable) +{ + u32 s, d; + + if (!imc->mbase) + return; + + s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]); + + if (enable) { + /* Save default configurations */ + imc->chan[chan].retry_rd_err_log_s = s; + imc->chan[chan].retry_rd_err_log_d = d; + + s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + s |= RETRY_RD_ERR_LOG_EN; + d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + d |= RETRY_RD_ERR_LOG_EN; + } else { + /* Restore default configurations */ + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) + s |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER) + s |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN)) + s &= ~RETRY_RD_ERR_LOG_EN; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC) + d |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER) + d |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) + d &= ~RETRY_RD_ERR_LOG_EN; + } + + I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d); +} + +static void enable_retry_rd_err_log(bool enable) +{ + struct skx_dev *d; + int i, j; + + edac_dbg(2, "\n"); + + list_for_each_entry(d, i10nm_edac_list, list) + for (i = 0; i < I10NM_NUM_IMC; i++) + for (j = 0; j < I10NM_NUM_CHANNELS; j++) + 
__enable_retry_rd_err_log(&d->imc[i], j, enable); +} + +static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, + int len, bool scrub_err) +{ + struct skx_imc *imc = &res->dev->imc[res->imc]; + u32 log0, log1, log2, log3, log4; + u32 corr0, corr1, corr2, corr3; + u64 log2a, log5; + u32 *offsets; + int n; + + if (!imc->mbase) + return; + + offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand; + + log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); + log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); + log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]); + log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); + log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + + if (res_cfg->type == SPR) { + log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]", + log0, log1, log2a, log3, log4, log5); + } else { + log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", + log0, log1, log2, log3, log4, log5); + } + + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + + if (len - n > 0) + snprintf(msg + n, len - n, + " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", + corr0 & 0xffff, corr0 >> 16, + corr1 & 0xffff, corr1 >> 16, + corr2 & 0xffff, corr2 >> 16, + corr3 & 0xffff, corr3 >> 16); + + /* Clear status bits */ + if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } +} + static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, unsigned int dev, unsigned int fun) { @@ -132,6 +255,8 @@ static struct res_config i10nm_cfg0 = { .decs_did = 0x3452, .busno_cfg_offset = 0xcc, .ddr_chan_mmio_sz = 0x4000, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config i10nm_cfg1 = { @@ -139,6 +264,8 @@ static struct res_config i10nm_cfg1 = { .decs_did = 0x3452, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x4000, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config spr_cfg = { @@ -147,6 +274,8 @@ static struct res_config spr_cfg = { .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, .support_ddr5 = true, + .offsets_scrub = offsets_scrub_spr, + .offsets_demand = offsets_demand_spr, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -286,6 +415,7 @@ static int __init i10nm_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + res_cfg = cfg; rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm); if (rc) @@ -339,6 +469,12 @@ static int __init i10nm_init(void) mce_register_decode_chain(&i10nm_mce_dec); setup_i10nm_debug(); + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, show_retry_rd_err_log); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(true); + } + i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); return 0; @@ -350,6 +486,13 @@ static int __init i10nm_init(void) static void __exit i10nm_exit(void) { edac_dbg(2, "\n"); + + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, NULL); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(false); + } + teardown_i10nm_debug(); 
mce_unregister_decode_chain(&i10nm_mce_dec); skx_adxl_put(); @@ -359,5 +502,8 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +module_param(retry_rd_err_log, int, 0444); +MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)"); + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors"); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 4dbd46575bfb..1abc020d49ab 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -230,7 +230,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) #define SKX_ILV_TARGET(tgt) ((tgt) & 7) static void skx_show_retry_rd_err_log(struct decoded_addr *res, - char *msg, int len) + char *msg, int len, + bool scrub_err) { u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 81c3e2ec6f56..05ffd39d6de9 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -494,6 +494,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); + bool scrub_err = false; bool recoverable; int len; u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); @@ -545,6 +546,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; case 4: optype = "memory scrubbing error"; + scrub_err = true; break; default: optype = "reserved"; @@ -567,7 +569,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } if (skx_show_retry_rd_err_log) - skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); + skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err); edac_dbg(0, "%s\n", skx_msg); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index bf56bebff138..22e174c740be 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -66,6 +66,8 @@ struct skx_dev { struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; + u32 retry_rd_err_log_s; + u32 retry_rd_err_log_d; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -123,12 +125,15 @@ struct res_config { /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; bool support_ddr5; + /* Offsets of retry_rd_err_log registers */ + u32 *offsets_scrub; + u32 *offsets_demand; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); -typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); +typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err); int __init skx_adxl_get(void); void __exit skx_adxl_put(void); -- Gitee From f82dbd0e34a668a0fc38aa46107fb0f2866eb1d9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 11 Jun 2021 10:01:18 -0700 Subject: [PATCH 011/132] EDAC/skx_common: Add new ADXL components for 2-level memory mainline inclusion from mainline-v5.14-rc1 commit 2f4348e5a86198704368a699a7c4cdeb21d569f5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 2f4348e5a861 EDAC/skx_common: Add new ADXL components for 2-level memory. Backport to add EDAC 2LM support. 
-------------------------------- Some Intel servers may configure memory in 2 levels, using fast "near" memory (e.g. DDR) as a cache for larger, slower, "far" memory (e.g. 3D X-point). In these configurations the BIOS ADXL address translation for an address in a 2-level memory range will provide details of both the "near" and far components. Current exported ADXL components are only for 1-level memory system or for 2nd level memory of 2-level memory system. So add new ADXL components for 1st level memory of 2-level memory system to fully support 2-level memory system and the detection of memory error source(1st level memory or 2nd level memory). Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-2-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 67 ++++++++++++++++++++++++++++++++------- drivers/edac/skx_common.h | 11 +++++++ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 05ffd39d6de9..3d0b391a400d 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -23,10 +23,13 @@ #include "skx_common.h" static const char * const component_names[] = { - [INDEX_SOCKET] = "ProcessorSocketId", - [INDEX_MEMCTRL] = "MemoryControllerId", - [INDEX_CHANNEL] = "ChannelId", - [INDEX_DIMM] = "DimmSlotId", + [INDEX_SOCKET] = "ProcessorSocketId", + [INDEX_MEMCTRL] = "MemoryControllerId", + [INDEX_CHANNEL] = "ChannelId", + [INDEX_DIMM] = "DimmSlotId", + [INDEX_NM_MEMCTRL] = "NmMemoryControllerId", + [INDEX_NM_CHANNEL] = "NmChannelId", + [INDEX_NM_DIMM] = "NmDimmSlotId", }; static int component_indices[ARRAY_SIZE(component_names)]; @@ -34,12 +37,14 @@ static int adxl_component_count; static const char * const *adxl_component_names; static u64 *adxl_values; static char *adxl_msg; +static unsigned long adxl_nm_bitmap; static char skx_msg[MSG_SIZE]; static skx_decode_f skx_decode; static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); +static bool skx_mem_cfg_2lm; int __init skx_adxl_get(void) { @@ -56,14 +61,25 @@ int __init skx_adxl_get(void) for (j = 0; names[j]; j++) { if (!strcmp(component_names[i], names[j])) { component_indices[i] = j; + + if (i >= INDEX_NM_FIRST) + adxl_nm_bitmap |= 1 << i; + break; } } - if (!names[j]) + if (!names[j] && i < INDEX_NM_FIRST) goto err; } + if (skx_mem_cfg_2lm) { + if (!adxl_nm_bitmap) + skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n"); + else + edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap); + } + adxl_component_names = names; while (*names++) adxl_component_count++; @@ -99,7 +115,7 @@ void __exit skx_adxl_put(void) kfree(adxl_msg); } -static bool skx_adxl_decode(struct decoded_addr *res) +static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem) { struct skx_dev *d; int i, len = 0; @@ -116,11 +132,20 @@ static bool skx_adxl_decode(struct decoded_addr *res) } res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; - res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; - res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; - res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + if (error_in_1st_level_mem) { + res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? + (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1; + res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ? 
+ (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1; + res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ? + (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1; + } else { + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; + res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + } - if (res->imc > NUM_IMC - 1) { + if (res->imc > NUM_IMC - 1 || res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -151,6 +176,11 @@ static bool skx_adxl_decode(struct decoded_addr *res) return true; } +void skx_set_mem_cfg(bool mem_cfg_2lm) +{ + skx_mem_cfg_2lm = mem_cfg_2lm; +} + void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { skx_decode = decode; @@ -580,6 +610,21 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, skx_msg); } +static bool skx_error_in_1st_level_mem(const struct mce *m) +{ + u32 errcode; + + if (!skx_mem_cfg_2lm) + return false; + + errcode = GET_BITFIELD(m->status, 0, 15); + + if ((errcode & 0xef80) != 0x280) + return false; + + return true; +} + int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { @@ -599,7 +644,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, res.addr = mce->addr; if (adxl_component_count) { - if (!skx_adxl_decode(&res)) + if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))) return NOTIFY_DONE; } else if (!skx_decode || !skx_decode(&res)) { return NOTIFY_DONE; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 22e174c740be..de9f82977f0e 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -9,6 +9,8 @@ #ifndef _SKX_COMM_EDAC_H #define _SKX_COMM_EDAC_H +#include + #define MSG_SIZE 1024 /* @@ -94,9 +96,17 @@ enum { INDEX_MEMCTRL, INDEX_CHANNEL, INDEX_DIMM, + INDEX_NM_FIRST, + INDEX_NM_MEMCTRL = INDEX_NM_FIRST, + INDEX_NM_CHANNEL, + INDEX_NM_DIMM, INDEX_MAX }; +#define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) +#define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) +#define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) + struct decoded_addr { struct skx_dev *dev; u64 addr; @@ -138,6 +148,7 @@ typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int le int __init skx_adxl_get(void); void __exit skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); +void skx_set_mem_cfg(bool mem_cfg_2lm); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); -- Gitee From 6b81ee370e7b72e93ff4d3bef67c02ed4a35632b Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 11 Jun 2021 10:01:19 -0700 Subject: [PATCH 012/132] EDAC/i10nm: Add detection of memory levels for ICX/SPR servers mainline inclusion from mainline-v5.14-rc1 commit 4bd4d32e9a38d7ffb091b4109ab63c8f601e5678 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 4bd4d32e9a38 EDAC/i10nm: Add detection of memory levels for ICX/SPR servers. Backport to add EDAC 2LM support. -------------------------------- Current i10nm_edac driver is only for system configured in 1-level memory. If the system is configured in 2-level memory, the driver doesn't report the 1st level memory DIMM for the error address, even if the error occurs in the 1st level memory. Both Ice Lake servers and Sapphire Rapids servers can be configured in 2-level memory. 
Add detection of memory levels to i10nm_edac for the two kinds of servers so that the driver can report the 2nd level memory DIMM or the 1st level memory DIMM according to error source. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-3-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_common.h | 3 +++ 2 files changed, 42 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7798de1e2aed..4e2a4d8396da 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -24,6 +24,8 @@ pci_read_config_dword((d)->uracu, 0xd0, &(reg)) #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) +#define I10NM_GET_SAD(d, offset, i, reg)\ + pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ @@ -50,6 +52,10 @@ #define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) #define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) +#define I10NM_MAX_SAD 16 +#define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) +#define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) + static struct list_head *i10nm_edac_list; static struct res_config *res_cfg; @@ -186,6 +192,31 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, return pdev; } +static bool i10nm_check_2lm(struct res_config *cfg) +{ + struct skx_dev *d; + u32 reg; + int i; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1], + PCI_SLOT(cfg->sad_all_devfn), + PCI_FUNC(cfg->sad_all_devfn)); + if (!d->sad_all) + continue; + + for (i = 0; i < I10NM_MAX_SAD; i++) { + I10NM_GET_SAD(d, cfg->sad_all_offset, i, reg); + if (I10NM_SAD_ENABLE(reg) && I10NM_SAD_NM_CACHEABLE(reg)) { + edac_dbg(2, "2-level memory configuration.\n"); + return true; + } + } + } + + return false; +} + static int i10nm_get_all_munits(void) { struct pci_dev *mdev; @@ -255,6 +286,8 @@ static struct res_config i10nm_cfg0 = { .decs_did = 0x3452, .busno_cfg_offset = 0xcc, .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, }; @@ -264,6 +297,8 @@ static struct res_config i10nm_cfg1 = { .decs_did = 0x3452, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, }; @@ -274,6 +309,8 @@ static struct res_config spr_cfg = { .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, .support_ddr5 = true, + .sad_all_devfn = PCI_DEVFN(10, 0), + .sad_all_offset = 0x300, .offsets_scrub = offsets_scrub_spr, .offsets_demand = offsets_demand_spr, }; @@ -429,6 +466,8 @@ static int __init i10nm_init(void) return -ENODEV; } + skx_set_mem_cfg(i10nm_check_2lm(cfg)); + rc = i10nm_get_all_munits(); if (rc < 0) goto fail; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index de9f82977f0e..1fb7540a7092 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -135,6 +135,9 @@ struct res_config { /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; bool support_ddr5; + /* SAD device number and function number */ + unsigned int sad_all_devfn; + int sad_all_offset; /* Offsets of 
retry_rd_err_log registers */ u32 *offsets_scrub; u32 *offsets_demand; -- Gitee From 45875988308087e7fae3b15f217ae376765b126b Mon Sep 17 00:00:00 2001 From: root Date: Wed, 13 Jul 2022 15:28:12 +0800 Subject: [PATCH 013/132] EDAC/i10nm: Add support for high bandwidth memory mainline inclusion from mainline-v5.14-rc1 commit c945088384d00e6eb61535cc4ba25bc062090909 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit c945088384d0 EDAC/i10nm: Add support for high bandwidth memory. Backport to add EDAC HBM support. -------------------------------- A future Xeon processor will include in-package HBM (high bandwidth memory). The in-package HBM memory controller shares the same architecture with the regular DDR memory controller. Add the HBM memory controller devices for EDAC support. Tested-by: Hongyu Ning Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-4-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 132 ++++++++++++++++++++++++++++++++++---- drivers/edac/skx_common.c | 15 +++-- drivers/edac/skx_common.h | 20 +++++- 3 files changed, 148 insertions(+), 19 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 4e2a4d8396da..47d6d7282c58 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.4" +#define I10NM_REVISION "v0.0.5" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -26,14 +26,22 @@ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_SAD(d, offset, i, reg)\ pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) +#define I10NM_GET_HBM_IMC_BAR(d, reg) \ + pci_read_config_dword((d)->uracu, 0xd4, &(reg)) +#define I10NM_GET_CAPID3_CFG(d, reg) \ + pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) + readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ - readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 
0x814 : 0x20814) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_REG32(m, i, offset) \ readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_REG64(m, i, offset) \ @@ -45,6 +53,12 @@ #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg) \ + ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) + +#define I10NM_HBM_IMC_MMIO_SIZE 0x9000 +#define I10NM_IS_HBM_PRESENT(reg) GET_BITFIELD(reg, 27, 30) +#define I10NM_IS_HBM_IMC(reg) GET_BITFIELD(reg, 29, 29) #define RETRY_RD_ERR_LOG_UC BIT(1) #define RETRY_RD_ERR_LOG_NOOVER BIT(14) @@ -217,7 +231,7 @@ static bool i10nm_check_2lm(struct res_config *cfg) return false; } -static int i10nm_get_all_munits(void) +static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; void __iomem *mbase; @@ -245,7 +259,7 @@ static int i10nm_get_all_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < I10NM_NUM_IMC; i++) { + for (i = 0; i < I10NM_NUM_DDR_IMC; i++) { mdev = pci_get_dev_wrapper(d->seg, d->bus[0], 12 + i, 0); if (i == 0 && !mdev) { @@ -281,6 +295,90 @@ static int i10nm_get_all_munits(void) return 0; } +static bool i10nm_check_hbm_imc(struct skx_dev *d) +{ + u32 reg; + + if (I10NM_GET_CAPID3_CFG(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get capid3_cfg\n"); + return false; + } + + return I10NM_IS_HBM_PRESENT(reg) != 0; +} + +static int i10nm_get_hbm_munits(void) +{ + struct pci_dev *mdev; + void __iomem *mbase; + u32 reg, off, mcmtr; + struct skx_dev *d; + int i, lmc; + u64 base; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3); + if (!d->pcu_cr3) + return -ENODEV; + + if (!i10nm_check_hbm_imc(d)) { + i10nm_printk(KERN_DEBUG, "No hbm memory\n"); + return -ENODEV; + } + + if (I10NM_GET_SCK_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get socket bar\n"); + return -ENODEV; + } + base = I10NM_GET_SCK_MMIO_BASE(reg); + + if (I10NM_GET_HBM_IMC_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get hbm mc bar\n"); + return -ENODEV; + } + base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg); + + lmc = I10NM_NUM_DDR_IMC; + + for (i = 0; i < I10NM_NUM_HBM_IMC; i++) { + mdev = pci_get_dev_wrapper(d->seg, d->bus[0], + 12 + i / 4, 1 + i % 4); + if (i == 0 && !mdev) { + i10nm_printk(KERN_ERR, "No hbm mc found\n"); + return -ENODEV; + } + if (!mdev) + continue; + + d->imc[lmc].mdev = mdev; + off = i * I10NM_HBM_IMC_MMIO_SIZE; + + edac_dbg(2, "hbm mc%d mmio base 0x%llx size 0x%x\n", + lmc, base + off, I10NM_HBM_IMC_MMIO_SIZE); + + mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); + if (!mbase) { + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", + base + off); + return -ENOMEM; + } + + d->imc[lmc].mbase = mbase; + d->imc[lmc].hbm_mc = true; + + mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); + if (!I10NM_IS_HBM_IMC(mcmtr)) { + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); + return -ENODEV; + } + + lmc++; + } + } + + return 0; +} + static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, @@ -308,6 +406,7 @@ static struct res_config spr_cfg = { .decs_did = 0x3252, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, + .hbm_chan_mmio_sz = 0x4000, .support_ddr5 = true, .sad_all_devfn = PCI_DEVFN(10, 0), .sad_all_offset = 0x300, @@ -345,14 +444,14 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, struct dimm_info *dimm; int i, j, ndimms; 
- for (i = 0; i < I10NM_NUM_CHANNELS; i++) { + for (i = 0; i < imc->num_channels; i++) { if (!imc->mbase) continue; ndimms = 0; mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); amap = I10NM_GET_AMAP(imc, i); - for (j = 0; j < I10NM_NUM_DIMMS; j++) { + for (j = 0; j < imc->num_dimms; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n", @@ -468,8 +567,9 @@ static int __init i10nm_init(void) skx_set_mem_cfg(i10nm_check_2lm(cfg)); - rc = i10nm_get_all_munits(); - if (rc < 0) + rc = i10nm_get_ddr_munits(); + + if (i10nm_get_hbm_munits() && rc) goto fail; list_for_each_entry(d, i10nm_edac_list, list) { @@ -490,7 +590,15 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; - d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + if (d->imc[i].hbm_mc) { + d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_HBM_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_HBM_DIMMS; + } else { + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_DDR_DIMMS; + } rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 3d0b391a400d..0c133d32b777 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -343,9 +343,9 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, ranks = numrank(mtr); rows = numrow(mtr); - cols = numcol(mtr); + cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && (amap & 0x8)) { + if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { banks = 32; mtype = MEM_DDR5; } else { @@ -374,8 +374,13 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->dtype = get_width(mtr); dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ - snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", - imc->src_id, imc->lmc, chan, dimmno); + + if (imc->hbm_mc) + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u", + imc->src_id, imc->lmc, chan); + else + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", + imc->src_id, imc->lmc, chan, dimmno); return 1; } @@ -705,6 +710,8 @@ void skx_remove(void) } if (d->util_all) pci_dev_put(d->util_all); + if (d->pcu_cr3) + pci_dev_put(d->pcu_cr3); if (d->sad_all) pci_dev_put(d->sad_all); if (d->uracu) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 1fb7540a7092..03ac067a80b9 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -32,9 +32,17 @@ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_IMC 4 -#define I10NM_NUM_CHANNELS 2 -#define I10NM_NUM_DIMMS 2 +#define I10NM_NUM_DDR_IMC 4 +#define I10NM_NUM_DDR_CHANNELS 2 +#define I10NM_NUM_DDR_DIMMS 2 + +#define I10NM_NUM_HBM_IMC 16 +#define I10NM_NUM_HBM_CHANNELS 2 +#define I10NM_NUM_HBM_DIMMS 1 + +#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) +#define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) +#define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) #define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) @@ -56,12 +64,16 @@ struct skx_dev { struct pci_dev *sad_all; struct pci_dev *util_all; struct pci_dev *uracu; /* for i10nm CPU */ + struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ int chan_mmio_sz; /* for i10nm CPU */ + int num_channels; /* channels per memory controller */ + int num_dimms; /* dimms per channel */ + bool hbm_mc; u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; @@ -134,6 +146,8 @@ struct res_config { int busno_cfg_offset; /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; + /* Per HBM channel memory-mapped I/O size */ + int hbm_chan_mmio_sz; bool support_ddr5; /* SAD device number and function number */ unsigned int sad_all_devfn; -- Gitee From b0adb31550986f669e45c7f42f13876451a91d0e Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Wed, 30 Jun 2021 20:58:24 +0530 Subject: [PATCH 014/132] EDAC/mc: Add new HBM2 memory type mainline inclusion from mainline-v5.13 commit e1ca90b7cc5cb5d3a38321cbb65ad36a59fcb574 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit e1ca90b7cc5c EDAC/mc: Add new HBM2 memory type. Backport to add EDAC HBM support. -------------------------------- Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for HBM2 (High Bandwidth Memory Gen 2) new memory type. Reviewed-by: Yazen Ghannam Signed-off-by: Muralidhara M K Signed-off-by: Naveen Krishna Chatradhi Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210630152828.162659-4-nchatrad@amd.com Signed-off-by: Youquan Song --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 34514c638f19..bf4297075c22 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -163,6 +163,7 @@ const char * const edac_mem_types[] = { [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", + [MEM_HBM2] = "High-bandwidth-memory-Gen2", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index 6c4565cc6273..490134495008 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -181,6 +181,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_LRDDR4: Load-Reduced DDR4 memory. * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM + * @MEM_HBM2: High bandwidth Memory Gen 2. 
*/ enum mem_type { MEM_EMPTY = 0, @@ -206,6 +207,7 @@ enum mem_type { MEM_LRDDR4, MEM_DDR5, MEM_NVDIMM, + MEM_HBM2, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -230,6 +232,7 @@ enum mem_type { #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) #define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_HBM2 BIT(MEM_HBM2) /** * enum edac-type - Error Detection and Correction capabilities and mode -- Gitee From 4a111addf4c90fc79db41b634b5b5b8b788d0f09 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 20 Jul 2021 09:30:09 -0700 Subject: [PATCH 015/132] EDAC/skx_common: Set the memory type correctly for HBM memory mainline inclusion from mainline-v5.15-rc1 commit fd07a4a0d30b5468a1f4a0739e34f5f014df7d44 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit fd07a4a0d30b EDAC/skx_common: Set the memory type correctly for HBM memory. Backport to add EDAC HBM support. -------------------------------- Set the memory type to MEM_HBM2 if it's managed by the HBM2 memory controller. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210720163009.GA1417532@agluck-desk2.amr.corp.intel.com Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 0c133d32b777..19c17c5198c5 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -345,7 +345,10 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, rows = numrow(mtr); cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { + if (imc->hbm_mc) { + banks = 32; + mtype = MEM_HBM2; + } else if (cfg->support_ddr5 && (amap & 0x8)) { banks = 32; mtype = MEM_DDR5; } else { -- Gitee From 028e1264e604bafb90f67c5533b2ad86b0cc8ea1 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 24 Dec 2021 04:11:26 -0500 Subject: [PATCH 016/132] EDAC/i10nm: Release mdev/mbase when failing to detect HBM mainline inclusion from mainline-v5.16 commit c370baa328022cbd46c59c821d1b467a97f047be category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit c370baa32802 EDAC/i10nm: Release mdev/mbase when failing to detect HBM. Backport to add EDAC HBM support. -------------------------------- On systems without HBM (High Bandwidth Memory) mdev/mbase are not released/unmapped. Add the code to release mdev/mbase when failing to detect HBM. 
[Tony: re-word commit message] Cc: Fixes: c945088384d0 ("EDAC/i10nm: Add support for high bandwidth memory") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20211224091126.1246-1-qiuxu.zhuo@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 47d6d7282c58..d63ddc9c994d 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -358,6 +358,9 @@ static int i10nm_get_hbm_munits(void) mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); if (!mbase) { + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", base + off); return -ENOMEM; @@ -368,6 +371,12 @@ static int i10nm_get_hbm_munits(void) mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); if (!I10NM_IS_HBM_IMC(mcmtr)) { + iounmap(d->imc[lmc].mbase); + d->imc[lmc].mbase = NULL; + d->imc[lmc].hbm_mc = false; + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); return -ENODEV; } -- Gitee From 910f3dd810c9bab8332d53f2c32c55f5fe65a64c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 26 Jul 2022 17:38:14 +0800 Subject: [PATCH 017/132] etherdevice: Adjust ether_addr* prototypes to silence -Wstringop-overead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 08ad7a770efacfecf903143f8de88d1e351a1f2d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=08ad7a770efacfecf903143f8de88d1e351a1f2d -------------------------------- commit 2618a0dae09ef37728dab89ff60418cbe25ae6bd upstream. With GCC 12, -Wstringop-overread was warning about an implicit cast from char[6] to char[8]. However, the extra 2 bytes are always thrown away, alignment doesn't matter, and the risk of hitting the edge of unallocated memory has been accepted, so this prototype can just be converted to a regular char *. Silences: net/core/dev.c: In function ‘bpf_prog_run_generic_xdp’: net/core/dev.c:4618:21: warning: ‘ether_addr_equal_64bits’ reading 8 bytes from a region of size 6 [-Wstringop-overread] 4618 | orig_host = ether_addr_equal_64bits(eth->h_dest, > skb->dev->dev_addr); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/dev.c:4618:21: note: referencing argument 1 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} net/core/dev.c:4618:21: note: referencing argument 2 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} In file included from net/core/dev.c:91: include/linux/etherdevice.h:375:20: note: in a call to function ‘ether_addr_equal_64bits’ 375 | static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], | ^~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Marc Kleine-Budde Tested-by: Marc Kleine-Budde Link: https://lore.kernel.org/netdev/20220212090811.uuzk6d76agw2vv73@pengutronix.de Cc: Jakub Kicinski Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. 
Miller Cc: Khem Raj Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- include/linux/etherdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2e5debc0373c..99209f50915f 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -127,7 +127,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr) #endif } -static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN @@ -352,8 +352,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ -static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], - const u8 addr2[6+2]) +static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); -- Gitee From 0ccd441a7641338e1220f8a244da9159b653ff5b Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Tue, 26 Jul 2022 17:38:15 +0800 Subject: [PATCH 018/132] mm: page_alloc: fix building error on -Werror=array-compare stable inclusion from stable-v5.10.113 commit 69848f9488bc1088bcd9a73987dff8d2cb47a060 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=69848f9488bc1088bcd9a73987dff8d2cb47a060 -------------------------------- commit ca831f29f8f25c97182e726429b38c0802200c8f upstream. Arthur Marsh reported we would hit the error below when building kernel with gcc-12: CC mm/page_alloc.o mm/page_alloc.c: In function `mem_init_print_info': mm/page_alloc.c:8173:27: error: comparison between two arrays [-Werror=array-compare] 8173 | if (start <= pos && pos < end && size > adj) \ | In C++20, the comparision between arrays should be warned. 
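As a standalone illustration of the warning (user-space code with invented symbol names, not the adj_init_size() macro itself), GCC 12 objects when both operands of a relational comparison are arrays, while comparing the addresses of their first elements decays to ordinary pointers and compiles cleanly:

#include <stdio.h>

/* Stand-ins for the section-boundary symbols fed to adj_init_size(). */
char start_sym[16], pos_sym[16], end_sym[16];

static int in_range_old(void)
{
	/* GCC 12: "comparison between two arrays [-Warray-compare]" */
	return start_sym <= pos_sym && pos_sym < end_sym;
}

static int in_range_new(void)
{
	/* Element addresses are plain pointers, so no warning is emitted. */
	return &start_sym[0] <= &pos_sym[0] && &pos_sym[0] < &end_sym[0];
}

int main(void)
{
	printf("%d %d\n", in_range_old(), in_range_new());
	return 0;
}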
Link: https://lkml.kernel.org/r/20211125130928.32465-1-sxwjean@me.com Signed-off-by: Xiongwei Song Reported-by: Arthur Marsh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Khem Raj Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec73cca1726c..cf9c69d631f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7974,7 +7974,7 @@ void __init mem_init_print_info(const char *str) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) -- Gitee From d0005c0a490428fb4f12a085d77c1bb34bf1dbf5 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 26 Jul 2022 17:38:16 +0800 Subject: [PATCH 019/132] tracing: Dump stacktrace trigger to the corresponding instance mainline inclusion from mainline-v5.17-rc6 commit ce33c845b030c9cf768370c951bc699470b09fa7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ce33c845b030c9cf768370c951bc699470b09fa7 -------------------------------- The stacktrace event trigger is not dumping the stacktrace to the instance where it was enabled, but to the global "instance." Use the private_data, pointing to the trigger file, to figure out the corresponding trace instance, and use it in the trigger action, like snapshot_trigger does. Link: https://lkml.kernel.org/r/afbb0b4f18ba92c276865bc97204d438473f4ebc.1645396236.git.bristot@kernel.org Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) [zzk: __trace_stack() was changed due to the merge of "36590c50b2d0 tracing: Merge irqflags + preempt counter", cherry-pick from mainline ce33c845b030c9 instead of stable 5.10.y] Signed-off-by: Zheng Zengkai Reviewed-by: Yang Jihong Acked-by: Xie XiuQi --- kernel/trace/trace_events_trigger.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d0309de2f84f..3c6229f16e81 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1219,7 +1219,12 @@ static void stacktrace_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *event) { - trace_dump_stack(STACK_SKIP); + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + trace_dump_stack(STACK_SKIP); } static void -- Gitee From 32af01826a81a3f51b3e13d211351569f762f6c2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 26 Jul 2022 17:38:17 +0800 Subject: [PATCH 020/132] perf tools: Fix segfault accessing sample_id xyarray stable inclusion from stable-v5.10.113 commit 378061c9b886994fa045186390d61a5e7c696ae3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=378061c9b886994fa045186390d61a5e7c696ae3 -------------------------------- commit a668cc07f990d2ed19424d5c1a529521a9d1cee1 upstream. 
perf_evsel::sample_id is an xyarray which can cause a segfault when accessed beyond its size. e.g. # perf record -e intel_pt// -C 1 sleep 1 Segmentation fault (core dumped) # That is happening because a dummy event is opened to capture text poke events accross all CPUs, however the mmap logic is allocating according to the number of user_requested_cpus. In general, perf sometimes uses the evsel cpus to open events, and sometimes the evlist user_requested_cpus. However, it is not necessary to determine which case is which because the opened event file descriptors are also in an xyarray, the size of whch can be used to correctly allocate the size of the sample_id xyarray, because there is one ID per file descriptor. Note, in the affected code path, perf_evsel fd array is subsequently used to get the file descriptor for the mmap, so it makes sense for the xyarrays to be the same size there. Fixes: d1a177595b3a824c ("libperf: Adopt perf_evlist__mmap()/munmap() from tools/perf") Fixes: 246eba8e9041c477 ("perf tools: Add support for PERF_RECORD_TEXT_POKE") Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org # 5.5+ Link: https://lore.kernel.org/r/20220413114232.26914-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- tools/lib/perf/evlist.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 17465d454a0e..f76b1a9d5a6e 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -571,7 +571,6 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, { struct perf_evsel *evsel; const struct perf_cpu_map *cpus = evlist->cpus; - const struct perf_thread_map *threads = evlist->threads; if (!ops || !ops->get || !ops->mmap) return -EINVAL; @@ -583,7 +582,7 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, perf_evlist__for_each_entry(evlist, evsel) { if ((evsel->attr.read_format & PERF_FORMAT_ID) && evsel->sample_id == NULL && - perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0) + perf_evsel__alloc_id(evsel, evsel->fd->max_x, evsel->fd->max_y) < 0) return -ENOMEM; } -- Gitee From 9f91d21694fddc986c0beb30b2680d69457e66ec Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 26 Jul 2022 17:38:18 +0800 Subject: [PATCH 021/132] gfs2: assign rgrp glock before compute_bitstructs stable inclusion from stable-v5.10.113 commit 04dd45d9776eb4802df8b0a0a6fcdf27842fbdd1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=04dd45d9776eb4802df8b0a0a6fcdf27842fbdd1 -------------------------------- commit 428f651cb80b227af47fc302e4931791f2fb4741 upstream. Before this patch, function read_rindex_entry called compute_bitstructs before it allocated a glock for the rgrp. But if compute_bitstructs found a problem with the rgrp, it called gfs2_consist_rgrpd, and that called gfs2_dump_glock for rgd->rd_gl which had not yet been assigned. read_rindex_entry compute_bitstructs gfs2_consist_rgrpd gfs2_dump_glock <---------rgd->rd_gl was not set. This patch changes read_rindex_entry so it assigns an rgrp glock before calling compute_bitstructs so gfs2_dump_glock does not reference an unassigned pointer. If an error is discovered, the glock must also be put, so a new goto and label were added. 
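In outline, the corrected ordering looks like the hedged sketch below (a simplified shape with an illustrative signature, not the real read_rindex_entry() prototype): the glock that the error-reporting path dumps must exist before any check that can call gfs2_consist_rgrpd(), and the new fail_glock label drops it again on failure.

/* Simplified shape only; the real code is in fs/gfs2/rgrp.c. */
static int read_rindex_entry_shape(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
{
	int error;

	error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops,
			       CREATE, &rgd->rd_gl);
	if (error)
		goto fail;		/* no glock yet, nothing to undo */

	error = compute_bitstructs(rgd);	/* may call gfs2_consist_rgrpd() */
	if (error)
		goto fail_glock;	/* rd_gl now exists and must be put */

	return 0;

fail_glock:
	gfs2_glock_put(rgd->rd_gl);
fail:
	return error;
}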
Reported-by: syzbot+c6fd14145e2f62ca0784@syzkaller.appspotmail.com Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/gfs2/rgrp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index dc55b029afaa..c5bde789a16d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -906,15 +906,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -928,6 +928,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: -- Gitee From 302f6418e23bbcecc28eafb2b9b50b3fd5de7806 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 26 Jul 2022 17:38:19 +0800 Subject: [PATCH 022/132] ALSA: usb-audio: Clear MIDI port active flag after draining stable inclusion from stable-v5.10.113 commit 8ce3820fc9d496100fb39008a952eebfb695b060 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8ce3820fc9d496100fb39008a952eebfb695b060 -------------------------------- commit 0665886ad1392e6b5bae85d7a6ccbed48dca1522 upstream. When a rawmidi output stream is closed, it calls the drain at first, then does trigger-off only when the drain returns -ERESTARTSYS as a fallback. It implies that each driver should turn off the stream properly after the drain. Meanwhile, USB-audio MIDI interface didn't change the port->active flag after the drain. This may leave the output work picking up the port that is closed right now, which eventually leads to a use-after-free for the already released rawmidi object. This patch fixes the bug by properly clearing the port->active flag after the output drain. 
Reported-by: syzbot+70e777a39907d6d5fd0a@syzkaller.appspotmail.com Cc: Link: https://lore.kernel.org/r/00000000000011555605dceaff03@google.com Link: https://lore.kernel.org/r/20220420130247.22062-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/usb/midi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index fa91290ad89d..84676a8fb60d 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1210,6 +1210,7 @@ static void snd_usbmidi_output_drain(struct snd_rawmidi_substream *substream) } while (drain_urbs && timeout); finish_wait(&ep->drain_wait, &wait); } + port->active = 0; spin_unlock_irq(&ep->buffer_lock); } -- Gitee From 6942c4a9bb82dd73e524481428c69ffad9c9a445 Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Tue, 26 Jul 2022 17:38:20 +0800 Subject: [PATCH 023/132] ALSA: hda/realtek: Add quirk for Clevo NP70PNP stable inclusion from stable-v5.10.113 commit cf9b19546494b2e5df87486752946ae71c6775db category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf9b19546494b2e5df87486752946ae71c6775db -------------------------------- commit 86222af07abf1f5f07a5873cc399c29ab8a9b8b8 upstream. Fixes headset detection on Clevo NP70PNP. Signed-off-by: Tim Crawford Cc: Link: https://lore.kernel.org/r/20220421170412.3697-1-tcrawford@system76.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 11d653190e6e..b5168959fcf6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8897,6 +8897,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x8562, "Clevo NH[5|7][0-9]RZ[Q]", ALC269_FIXUP_DMIC), SND_PCI_QUIRK(0x1558, 0x8668, "Clevo NP50B[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x866d, "Clevo NP5[05]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x867c, "Clevo NP7[01]PNP", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x867d, "Clevo NP7[01]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8680, "Clevo NJ50LU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8686, "Clevo NH50[CZ]U", ALC256_FIXUP_MIC_NO_PRESENCE_AND_RESUME), -- Gitee From 83ab10a7a9ff529b76e8517e74380568f7b81797 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 26 Jul 2022 17:38:21 +0800 Subject: [PATCH 024/132] ASoC: atmel: Remove system clock tree configuration for at91sam9g20ek stable inclusion from stable-v5.10.113 commit 608fc58858bfa7552a9824c2f0e4a3ab8dd4efaa category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=608fc58858bfa7552a9824c2f0e4a3ab8dd4efaa -------------------------------- [ Upstream commit c775cbf62ed4911e4f0f23880f01815753123690 ] The MCLK of the WM8731 on the AT91SAM9G20-EK board is connected to the PCK0 output of the SoC, intended in the reference software to be supplied using PLLB and programmed to 12MHz. 
As originally written for use with a board file the audio driver was responsible for configuring the entire tree but in the conversion to the common clock framework the registration of the named pck0 and pllb clocks was removed so the driver has failed to instantiate ever since. Since the WM8731 driver has had support for managing a MCLK provided via the common clock framework for some time we can simply drop all the clock management code from the machine driver other than configuration of the sysclk rate, the CODEC driver still respects that configuration from the machine driver. Fixes: ff78a189b0ae55f ("ARM: at91: remove old at91-specific clock driver") Signed-off-by: Mark Brown Reviewed-by: Codrin Ciubotariu Link: https://lore.kernel.org/r/20220325154241.1600757-2-broonie@kernel.org Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/atmel/sam9g20_wm8731.c | 61 -------------------------------- 1 file changed, 61 deletions(-) diff --git a/sound/soc/atmel/sam9g20_wm8731.c b/sound/soc/atmel/sam9g20_wm8731.c index 8a55d59a6c2a..d243de5f23dc 100644 --- a/sound/soc/atmel/sam9g20_wm8731.c +++ b/sound/soc/atmel/sam9g20_wm8731.c @@ -46,35 +46,6 @@ */ #undef ENABLE_MIC_INPUT -static struct clk *mclk; - -static int at91sam9g20ek_set_bias_level(struct snd_soc_card *card, - struct snd_soc_dapm_context *dapm, - enum snd_soc_bias_level level) -{ - static int mclk_on; - int ret = 0; - - switch (level) { - case SND_SOC_BIAS_ON: - case SND_SOC_BIAS_PREPARE: - if (!mclk_on) - ret = clk_enable(mclk); - if (ret == 0) - mclk_on = 1; - break; - - case SND_SOC_BIAS_OFF: - case SND_SOC_BIAS_STANDBY: - if (mclk_on) - clk_disable(mclk); - mclk_on = 0; - break; - } - - return ret; -} - static const struct snd_soc_dapm_widget at91sam9g20ek_dapm_widgets[] = { SND_SOC_DAPM_MIC("Int Mic", NULL), SND_SOC_DAPM_SPK("Ext Spk", NULL), @@ -135,7 +106,6 @@ static struct snd_soc_card snd_soc_at91sam9g20ek = { .owner = THIS_MODULE, .dai_link = &at91sam9g20ek_dai, .num_links = 1, - .set_bias_level = at91sam9g20ek_set_bias_level, .dapm_widgets = at91sam9g20ek_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(at91sam9g20ek_dapm_widgets), @@ -148,7 +118,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct device_node *codec_np, *cpu_np; - struct clk *pllb; struct snd_soc_card *card = &snd_soc_at91sam9g20ek; int ret; @@ -162,31 +131,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return -EINVAL; } - /* - * Codec MCLK is supplied by PCK0 - set it up. 
- */ - mclk = clk_get(NULL, "pck0"); - if (IS_ERR(mclk)) { - dev_err(&pdev->dev, "Failed to get MCLK\n"); - ret = PTR_ERR(mclk); - goto err; - } - - pllb = clk_get(NULL, "pllb"); - if (IS_ERR(pllb)) { - dev_err(&pdev->dev, "Failed to get PLLB\n"); - ret = PTR_ERR(pllb); - goto err_mclk; - } - ret = clk_set_parent(mclk, pllb); - clk_put(pllb); - if (ret != 0) { - dev_err(&pdev->dev, "Failed to set MCLK parent\n"); - goto err_mclk; - } - - clk_set_rate(mclk, MCLK_RATE); - card->dev = &pdev->dev; /* Parse device node info */ @@ -230,9 +174,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return ret; -err_mclk: - clk_put(mclk); - mclk = NULL; err: atmel_ssc_put_audio(0); return ret; @@ -242,8 +183,6 @@ static int at91sam9g20ek_audio_remove(struct platform_device *pdev) { struct snd_soc_card *card = platform_get_drvdata(pdev); - clk_disable(mclk); - mclk = NULL; snd_soc_unregister_card(card); atmel_ssc_put_audio(0); -- Gitee From f7ec810a47d7f46e7b1739befe01d4850fd7ca31 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:38:22 +0800 Subject: [PATCH 025/132] ASoC: msm8916-wcd-digital: Check failure for devm_snd_soc_register_component stable inclusion from stable-v5.10.113 commit b6f474cd3097b9570d5c2c30075ca3506fbbcf33 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b6f474cd3097b9570d5c2c30075ca3506fbbcf33 -------------------------------- [ Upstream commit e927b05f3cc20de87f6b7d912a5bbe556931caca ] devm_snd_soc_register_component() may fails, we should check the error and do the corresponding error handling. Fixes: 150db8c5afa1 ("ASoC: codecs: Add msm8916-wcd digital codec") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220403115239.30140-1-linmq006@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/codecs/msm8916-wcd-digital.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c index 9ad7fc0baf07..20a07c92b2fc 100644 --- a/sound/soc/codecs/msm8916-wcd-digital.c +++ b/sound/soc/codecs/msm8916-wcd-digital.c @@ -1206,9 +1206,16 @@ static int msm8916_wcd_digital_probe(struct platform_device *pdev) dev_set_drvdata(dev, priv); - return devm_snd_soc_register_component(dev, &msm8916_wcd_digital, + ret = devm_snd_soc_register_component(dev, &msm8916_wcd_digital, msm8916_wcd_digital_dai, ARRAY_SIZE(msm8916_wcd_digital_dai)); + if (ret) + goto err_mclk; + + return 0; + +err_mclk: + clk_disable_unprepare(priv->mclk); err_clk: clk_disable_unprepare(priv->ahbclk); return ret; -- Gitee From ecf9eaf316dc293ccd09a729f924840b09fe60db Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 26 Jul 2022 17:38:23 +0800 Subject: [PATCH 026/132] ASoC: codecs: wcd934x: do not switch off SIDO Buck when codec is in use stable inclusion from stable-v5.10.113 commit 12aa8021c7a72811cd8096b1e456cd5d265896d9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=12aa8021c7a72811cd8096b1e456cd5d265896d9 -------------------------------- [ Upstream commit db6dd1bee63d1d88fbddfe07af800af5948ac28e ] SIDO(Single-Inductor Dual-Ouput) Buck powers up both analog and digital circuits along with internal memory, powering off this is the last thing that codec should do when going to 
very low power. Current code was powering off this Buck if there are no users of sysclk, which is not correct. Powering off this buck will result in no register access. This code path was never tested until recently after adding pm support in SoundWire controller. Fix this by removing the buck poweroff when the codec is active and also the code that is not used. Without this patch all the read/write transactions will never complete and results in SLIMBus Errors like: qcom,slim-ngd qcom,slim-ngd.1: Tx:MT:0x0, MC:0x60, LA:0xcf failed:-110 wcd934x-codec wcd934x-codec.1.auto: ASoC: error at soc_component_read_no_lock on wcd934x-codec.1.auto for register: [0x00000d05] -110 qcom,slim-ngd-ctrl 171c0000.slim: Error Interrupt received 0x82000000 Reported-by: Amit Pundir Fixes: a61f3b4f476e ("ASoC: wcd934x: add support to wcd9340/wcd9341 codec") Signed-off-by: Srinivas Kandagatla Tested-by: Amit Pundir Link: https://lore.kernel.org/r/20220407094313.2880-1-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/codecs/wcd934x.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 8540ac230d0e..fd704df9b175 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -1188,29 +1188,7 @@ static int wcd934x_set_sido_input_src(struct wcd934x_codec *wcd, int sido_src) if (sido_src == wcd->sido_input_src) return 0; - if (sido_src == SIDO_SOURCE_INTERNAL) { - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_PRE_ENX_MASK, 0x0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, - WCD934X_ANA_RCO_BG_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN1_MASK, - WCD934X_ANA_BUCK_PRE_EN1_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN2_MASK, - WCD934X_ANA_BUCK_PRE_EN2_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, - WCD934X_ANA_BUCK_HI_ACCU_ENABLE); - usleep_range(100, 110); - } else if (sido_src == SIDO_SOURCE_RCO_BG) { + if (sido_src == SIDO_SOURCE_RCO_BG) { regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, WCD934X_ANA_RCO_BG_EN_MASK, WCD934X_ANA_RCO_BG_ENABLE); @@ -1296,8 +1274,6 @@ static int wcd934x_disable_ana_bias_and_syclk(struct wcd934x_codec *wcd) regmap_update_bits(wcd->regmap, WCD934X_CLK_SYS_MCLK_PRG, WCD934X_EXT_CLK_BUF_EN_MASK | WCD934X_MCLK_EN_MASK, 0x0); - wcd934x_set_sido_input_src(wcd, SIDO_SOURCE_INTERNAL); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, WCD934X_ANA_BIAS_EN_MASK, 0); regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, -- Gitee From 4be6d449af6b349bdeeb388d9a40f814d8120112 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:38:24 +0800 Subject: [PATCH 027/132] dmaengine: imx-sdma: Fix error checking in sdma_event_remap stable inclusion from stable-v5.10.113 commit 9bc949a181ba805b76b81d1ae1519af093328b81 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9bc949a181ba805b76b81d1ae1519af093328b81 -------------------------------- [ Upstream commit 
7104b9cb35a33ad803a1adbbfa50569b008faf15 ] of_parse_phandle() returns NULL on errors, rather than error pointers. Using NULL check on grp_np to fix this. Fixes: d078cd1b4185 ("dmaengine: imx-sdma: Add imx6sx platform support") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220308064952.15743-1-linmq006@gmail.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/imx-sdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 306f93e4b26a..792c91cd1608 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1789,7 +1789,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) u32 reg, val, shift, num_map, i; int ret = 0; - if (IS_ERR(np) || IS_ERR(gpr_np)) + if (IS_ERR(np) || !gpr_np) goto out; event_remap = of_find_property(np, propname, NULL); @@ -1837,7 +1837,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) } out: - if (!IS_ERR(gpr_np)) + if (gpr_np) of_node_put(gpr_np); return ret; -- Gitee From dfc6a02d0e0f49da209a4e3fce9e1e52775ae7a8 Mon Sep 17 00:00:00 2001 From: zhangqilong Date: Tue, 26 Jul 2022 17:38:25 +0800 Subject: [PATCH 028/132] dmaengine: mediatek:Fix PM usage reference leak of mtk_uart_apdma_alloc_chan_resources stable inclusion from stable-v5.10.113 commit f714abf28f819849d2fd93a4a8db15cff3f8798e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f714abf28f819849d2fd93a4a8db15cff3f8798e -------------------------------- [ Upstream commit 545b2baac89b859180e51215468c05d85ea8465a ] pm_runtime_get_sync will increment pm usage counter even it failed. Forgetting to putting operation will result in reference leak here. We fix it: 1) Replacing it with pm_runtime_resume_and_get to keep usage counter balanced. 2) Add putting operation before returning error. 
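The counter rule being applied, shown as a hedged standalone sketch rather than the driver's own code (example_hw_init() is an invented placeholder): pm_runtime_get_sync() raises the usage counter even when the resume fails, so every error return after it needs a matching put, whereas pm_runtime_resume_and_get() leaves the counter raised only on success.

#include <linux/device.h>
#include <linux/pm_runtime.h>

static int example_hw_init(struct device *dev)
{
	return 0;	/* hypothetical hardware setup step */
}

static int example_alloc_resources(struct device *dev)
{
	int ret;

	ret = pm_runtime_resume_and_get(dev);
	if (ret < 0)
		return ret;	/* counter already dropped by the helper */

	ret = example_hw_init(dev);
	if (ret)
		goto err_pm;

	return 0;

err_pm:
	pm_runtime_put_noidle(dev);	/* balance the successful get above */
	return ret;
}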
Fixes:9135408c3ace4 ("dmaengine: mediatek: Add MediaTek UART APDMA support") Signed-off-by: Zhang Qilong Link: https://lore.kernel.org/r/20220319022142.142709-1-zhangqilong3@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/mediatek/mtk-uart-apdma.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/dma/mediatek/mtk-uart-apdma.c b/drivers/dma/mediatek/mtk-uart-apdma.c index 375e7e647df6..a1517ef1f4a0 100644 --- a/drivers/dma/mediatek/mtk-uart-apdma.c +++ b/drivers/dma/mediatek/mtk-uart-apdma.c @@ -274,7 +274,7 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) unsigned int status; int ret; - ret = pm_runtime_get_sync(mtkd->ddev.dev); + ret = pm_runtime_resume_and_get(mtkd->ddev.dev); if (ret < 0) { pm_runtime_put_noidle(chan->device->dev); return ret; @@ -288,18 +288,21 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) ret = readx_poll_timeout(readl, c->base + VFF_EN, status, !status, 10, 100); if (ret) - return ret; + goto err_pm; ret = request_irq(c->irq, mtk_uart_apdma_irq_handler, IRQF_TRIGGER_NONE, KBUILD_MODNAME, chan); if (ret < 0) { dev_err(chan->device->dev, "Can't request dma IRQ\n"); - return -EINVAL; + ret = -EINVAL; + goto err_pm; } if (mtkd->support_33bits) mtk_uart_apdma_write(c, VFF_4G_SUPPORT, VFF_4G_SUPPORT_CLR_B); +err_pm: + pm_runtime_put_noidle(mtkd->ddev.dev); return ret; } -- Gitee From 4ab150b4794d5ccbae344f03323fce6d73d66fcd Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Tue, 26 Jul 2022 17:38:26 +0800 Subject: [PATCH 029/132] spi: spi-mtk-nor: initialize spi controller after resume stable inclusion from stable-v5.10.113 commit 3f7914dbeacdffe4fa198dee7afefa74998b5d8d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3f7914dbeacdffe4fa198dee7afefa74998b5d8d -------------------------------- [ Upstream commit 317c2045618cc1f8d38beb8c93a7bdb6ad8638c6 ] After system resumes, the registers of nor controller are initialized with default values. The nor controller will not function properly. To handle both issues above, we add mtk_nor_init() in mtk_nor_resume after pm_runtime_force_resume(). 
Fixes: 3bfd9103c7af ("spi: spi-mtk-nor: Add power management support") Signed-off-by: Allen-KH Cheng Reviewed-by: Rex-BC Chen Link: https://lore.kernel.org/r/20220412115743.22641-1-allen-kh.cheng@mediatek.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/spi/spi-mtk-nor.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-mtk-nor.c b/drivers/spi/spi-mtk-nor.c index 288f6c2bbd57..106e3cacba4c 100644 --- a/drivers/spi/spi-mtk-nor.c +++ b/drivers/spi/spi-mtk-nor.c @@ -895,7 +895,17 @@ static int __maybe_unused mtk_nor_suspend(struct device *dev) static int __maybe_unused mtk_nor_resume(struct device *dev) { - return pm_runtime_force_resume(dev); + struct spi_controller *ctlr = dev_get_drvdata(dev); + struct mtk_nor *sp = spi_controller_get_devdata(ctlr); + int ret; + + ret = pm_runtime_force_resume(dev); + if (ret) + return ret; + + mtk_nor_init(sp); + + return 0; } static const struct dev_pm_ops mtk_nor_pm_ops = { -- Gitee From 79b7e6ed7a6170a8f469c8035db58f163600a410 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Jul 2022 17:38:27 +0800 Subject: [PATCH 030/132] esp: limit skb_page_frag_refill use to a single page stable inclusion from stable-v5.10.113 commit c075c3ea031757f8ea2d34567565b61a868c08d5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c075c3ea031757f8ea2d34567565b61a868c08d5 -------------------------------- [ Upstream commit 5bd8baab087dff657e05387aee802e70304cc813 ] Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP transformation") tried to fix skb_page_frag_refill usage in ESP by capping allocsize to 32k, but that doesn't completely solve the issue, as skb_page_frag_refill may return a single page. If that happens, we will write out of bounds, despite the check introduced in the previous patch. This patch forces COW in cases where we would end up calling skb_page_frag_refill with a size larger than a page (first in esp_output_head with tailen, then in esp_output_tail with skb->data_len). 
Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- include/net/esp.h | 2 -- net/ipv4/esp4.c | 5 ++--- net/ipv6/esp6.c | 5 ++--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/net/esp.h b/include/net/esp.h index 90cd02ff77ef..9c5637d41d95 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,8 +4,6 @@ #include -#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9aae82145bc1..20d738137841 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -448,7 +448,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -458,8 +457,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 20c7bef6829e..cb28f8928f9e 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,7 +483,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -492,8 +491,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { -- Gitee From f648582b76cf544acd8efbeed6f5fa0020acf160 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:38:28 +0800 Subject: [PATCH 031/132] igc: Fix infinite loop in release_swfw_sync stable inclusion from stable-v5.10.113 commit 46b0e4f998ceead45f273be4d282b7fbb70f6125 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=46b0e4f998ceead45f273be4d282b7fbb70f6125 -------------------------------- [ Upstream commit 907862e9aef75bf89e2b265efcc58870be06081e ] An infinite loop may occur if we fail to acquire the HW semaphore, which is needed for resource release. This will typically happen if the hardware is surprise-removed. At this stage there is nothing to do, except log an error and quit. 
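The same idea as a minimal before/after sketch (an illustrative wrapper, not the driver function itself; the real change is in igc_release_swfw_sync_i225() in the diff below): an unconditional retry loop on the semaphore can never terminate once the adapter is gone, so the fix makes one attempt, logs the failure, and returns.

/* Illustrative wrapper only. */
static void example_release_resource(struct igc_hw *hw)
{
	/*
	 * Unsafe pattern being removed:
	 *	while (igc_get_hw_semaphore_i225(hw))
	 *		;	(spins forever on a surprise-removed adapter)
	 */
	if (igc_get_hw_semaphore_i225(hw)) {
		hw_dbg("Failed to release SW_FW_SYNC.\n");
		return;
	}

	/* Clear the mask bits and drop the semaphore as shown in the diff. */
}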
Fixes: c0071c7aa5fe ("igc: Add HW initialization code") Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/igc/igc_i225.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_i225.c b/drivers/net/ethernet/intel/igc/igc_i225.c index 553d6bc78e6b..624236a4202e 100644 --- a/drivers/net/ethernet/intel/igc/igc_i225.c +++ b/drivers/net/ethernet/intel/igc/igc_i225.c @@ -156,8 +156,15 @@ void igc_release_swfw_sync_i225(struct igc_hw *hw, u16 mask) { u32 swfw_sync; - while (igc_get_hw_semaphore_i225(hw)) - ; /* Empty */ + /* Releasing the resource requires first getting the HW semaphore. + * If we fail to get the semaphore, there is nothing we can do, + * except log an error and quit. We are not allowed to hang here + * indefinitely, as it may cause denial of service or system crash. + */ + if (igc_get_hw_semaphore_i225(hw)) { + hw_dbg("Failed to release SW_FW_SYNC.\n"); + return; + } swfw_sync = rd32(IGC_SW_FW_SYNC); swfw_sync &= ~mask; -- Gitee From 39d9023b5d9c8d03091fb5c7c0781836854451bb Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:38:29 +0800 Subject: [PATCH 032/132] igc: Fix BUG: scheduling while atomic stable inclusion from stable-v5.10.113 commit fc7116a79a86500e15520658f279b6536ff9f700 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc7116a79a86500e15520658f279b6536ff9f700 -------------------------------- [ Upstream commit c80a29f0fe9b6f5457e0788e27d1110577eba99b ] Replace usleep_range() method with udelay() method to allow atomic contexts in low-level MDIO access functions. The following issue can be seen by doing the following: $ modprobe -r bonding $ modprobe -v bonding max_bonds=1 mode=1 miimon=100 use_carrier=0 $ ip link set bond0 up $ ifenslave bond0 eth0 eth1 [ 982.357308] BUG: scheduling while atomic: kworker/u64:0/9/0x00000002 [ 982.364431] INFO: lockdep is turned off. [ 982.368824] Modules linked in: bonding sctp ip6_udp_tunnel udp_tunnel mlx4_ib ib_uverbs ib_core mlx4_en mlx4_core nfp tls sunrpc intel_rapl_msr iTCO_wdt iTCO_vendor_support mxm_wmi dcdbas intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel rapl intel_cstate intel_uncore pcspkr lpc_ich mei_me ipmi_ssif mei ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter xfs libcrc32c sr_mod cdrom sd_mod t10_pi sg mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ahci libahci crc32c_intel libata i2c_algo_bit tg3 megaraid_sas igc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: bonding] [ 982.437941] CPU: 25 PID: 9 Comm: kworker/u64:0 Kdump: loaded Tainted: G W --------- - - 4.18.0-348.el8.x86_64+debug #1 [ 982.451333] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 2.7.0 12/005/2017 [ 982.459791] Workqueue: bond0 bond_mii_monitor [bonding] [ 982.465622] Call Trace: [ 982.468355] dump_stack+0x8e/0xd0 [ 982.472056] __schedule_bug.cold.60+0x3a/0x60 [ 982.476919] __schedule+0x147b/0x1bc0 [ 982.481007] ? firmware_map_remove+0x16b/0x16b [ 982.485967] ? hrtimer_fixup_init+0x40/0x40 [ 982.490625] schedule+0xd9/0x250 [ 982.494227] schedule_hrtimeout_range_clock+0x10d/0x2c0 [ 982.500058] ? hrtimer_nanosleep_restart+0x130/0x130 [ 982.505598] ? 
hrtimer_init_sleeper_on_stack+0x90/0x90 [ 982.511332] ? usleep_range+0x88/0x130 [ 982.515514] ? recalibrate_cpu_khz+0x10/0x10 [ 982.520279] ? ktime_get+0xab/0x1c0 [ 982.524175] ? usleep_range+0x88/0x130 [ 982.528355] usleep_range+0xdd/0x130 [ 982.532344] ? console_conditional_schedule+0x30/0x30 [ 982.537987] ? igc_put_hw_semaphore+0x17/0x60 [igc] [ 982.543432] igc_read_phy_reg_gpy+0x111/0x2b0 [igc] [ 982.548887] igc_phy_has_link+0xfa/0x260 [igc] [ 982.553847] ? igc_get_phy_id+0x210/0x210 [igc] [ 982.558894] ? lock_acquire+0x34d/0x890 [ 982.563187] ? lock_downgrade+0x710/0x710 [ 982.567659] ? rcu_read_unlock+0x50/0x50 [ 982.572039] igc_check_for_copper_link+0x106/0x210 [igc] [ 982.577970] ? igc_config_fc_after_link_up+0x840/0x840 [igc] [ 982.584286] ? rcu_read_unlock+0x50/0x50 [ 982.588661] ? lock_release+0x591/0xb80 [ 982.592939] ? lock_release+0x591/0xb80 [ 982.597220] igc_has_link+0x113/0x330 [igc] [ 982.601887] ? lock_downgrade+0x710/0x710 [ 982.606362] igc_ethtool_get_link+0x6d/0x90 [igc] [ 982.611614] bond_check_dev_link+0x131/0x2c0 [bonding] [ 982.617350] ? bond_time_in_interval+0xd0/0xd0 [bonding] [ 982.623277] ? rcu_read_lock_held+0x62/0xc0 [ 982.627944] ? rcu_read_lock_sched_held+0xe0/0xe0 [ 982.633198] bond_mii_monitor+0x314/0x2500 [bonding] [ 982.638738] ? lock_contended+0x880/0x880 [ 982.643214] ? bond_miimon_link_change+0xa0/0xa0 [bonding] [ 982.649336] ? lock_acquire+0x34d/0x890 [ 982.653615] ? lock_downgrade+0x710/0x710 [ 982.658089] ? debug_object_deactivate+0x221/0x340 [ 982.663436] ? rcu_read_unlock+0x50/0x50 [ 982.667811] ? debug_print_object+0x2b0/0x2b0 [ 982.672672] ? __switch_to_asm+0x41/0x70 [ 982.677049] ? __switch_to_asm+0x35/0x70 [ 982.681426] ? _raw_spin_unlock_irq+0x24/0x40 [ 982.686288] ? trace_hardirqs_on+0x20/0x195 [ 982.690956] ? _raw_spin_unlock_irq+0x24/0x40 [ 982.695818] process_one_work+0x8f0/0x1770 [ 982.700390] ? pwq_dec_nr_in_flight+0x320/0x320 [ 982.705443] ? debug_show_held_locks+0x50/0x50 [ 982.710403] worker_thread+0x87/0xb40 [ 982.714489] ? process_one_work+0x1770/0x1770 [ 982.719349] kthread+0x344/0x410 [ 982.722950] ? 
kthread_insert_work_sanity_check+0xd0/0xd0 [ 982.728975] ret_from_fork+0x3a/0x50 Fixes: 5586838fe9ce ("igc: Add code for PHY support") Reported-by: Corinna Vinschen Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Corinna Vinschen Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/igc/igc_phy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index e380b7a3ea63..8de4de2e5636 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -583,7 +583,7 @@ static s32 igc_read_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 *data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; @@ -640,7 +640,7 @@ static s32 igc_write_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; -- Gitee From ab18d18b75f5f270c8f87b9743007d904f37b66e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Jul 2022 17:38:30 +0800 Subject: [PATCH 033/132] rxrpc: Restore removed timer deletion stable inclusion from stable-v5.10.113 commit 60592f16a456d239df9060968dd9cb561d0a6fb0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=60592f16a456d239df9060968dd9cb561d0a6fb0 -------------------------------- [ Upstream commit ee3b0826b4764f6c13ad6db67495c5a1c38e9025 ] A recent patch[1] from Eric Dumazet flipped the order in which the keepalive timer and the keepalive worker were cancelled in order to fix a syzbot reported issue[2]. Unfortunately, this enables the mirror image bug whereby the timer races with rxrpc_exit_net(), restarting the worker after it has been cancelled: CPU 1 CPU 2 =============== ===================== if (rxnet->live) rxnet->live = false; cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_queue_work(&rxnet->peer_keepalive_work); del_timer_sync(&rxnet->peer_keepalive_timer); Fix this by restoring the removed del_timer_sync() so that we try to remove the timer twice. If the timer runs again, it should see ->live == false and not restart the worker. Fixes: 1946014ca3b1 ("rxrpc: fix a race in rxrpc_exit_net()") Signed-off-by: David Howells cc: Eric Dumazet cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/20220404183439.3537837-1-eric.dumazet@gmail.com/ [1] Link: https://syzkaller.appspot.com/bug?extid=724378c4bb58f703b09a [2] Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/rxrpc/net_ns.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f15d6942da45..cc7e30733feb 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -113,7 +113,9 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; + del_timer_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); + /* Remove the timer again as the worker may have restarted it. 
*/ del_timer_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); -- Gitee From 27b6e01f60e73dc8944d2d18ed50a60a32e68f08 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 26 Jul 2022 17:38:31 +0800 Subject: [PATCH 034/132] net/smc: Fix sock leak when release after smc_shutdown() stable inclusion from stable-v5.10.113 commit a499cb5f3ef9f976eac96c02adbcc505764e2b91 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a499cb5f3ef9f976eac96c02adbcc505764e2b91 -------------------------------- [ Upstream commit 1a74e99323746353bba11562a2f2d0aa8102f402 ] Since commit e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback"), for a fallback connection, __smc_release() does not call sock_put() if its state is already SMC_CLOSED. When calling smc_shutdown() after falling back, its state is set to SMC_CLOSED but does not call sock_put(), so this patch calls it. Reported-and-tested-by: syzbot+6e29a053eb165bd50de5@syzkaller.appspotmail.com Fixes: e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback") Signed-off-by: Tony Lu Acked-by: Karsten Graul Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4f16d406ad8e..1b98f3241150 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2144,8 +2144,10 @@ static int smc_shutdown(struct socket *sock, int how) if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sock_put(sk); + } goto out; } switch (how) { -- Gitee From 3059be689186bbdbc081b5cdc9f8b08e78487795 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 26 Jul 2022 17:38:32 +0800 Subject: [PATCH 035/132] net/packet: fix packet_sock xmit return value checking stable inclusion from stable-v5.10.113 commit 8fb76adb89f0d576c107e05fd922bf3d04c470d9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8fb76adb89f0d576c107e05fd922bf3d04c470d9 -------------------------------- [ Upstream commit 29e8e659f984be00d75ec5fef4e37c88def72712 ] packet_sock xmit could be dev_queue_xmit, which also returns negative errors. So only checking positive errors is not enough, or userspace sendmsg may return success while packet is not send out. Move the net_xmit_errno() assignment in the braces as checkpatch.pl said do not use assignment in if condition. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Flavio Leitner Signed-off-by: Hangbin Liu Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/packet/af_packet.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d0c95d7dd292..5ee600d108a0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2817,8 +2817,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) status = TP_STATUS_SEND_REQUEST; err = po->xmit(skb); - if (unlikely(err > 0)) { - err = net_xmit_errno(err); + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ @@ -3019,8 +3020,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->no_fcs = 1; err = po->xmit(skb); - if (err > 0 && (err = net_xmit_errno(err)) != 0) - goto out_unlock; + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto out_unlock; + } dev_put(dev); -- Gitee From beeab1369997e4479ef51ac7c3a97460393a01e0 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 26 Jul 2022 17:38:33 +0800 Subject: [PATCH 036/132] ip6_gre: Avoid updating tunnel->tun_hlen in __gre6_xmit() stable inclusion from stable-v5.10.113 commit 200f96ebb389788dea563e8a0b1105863a836039 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=200f96ebb389788dea563e8a0b1105863a836039 -------------------------------- [ Upstream commit f40c064e933d7787ca7411b699504d7a2664c1f5 ] Do not update tunnel->tun_hlen in data plane code. Use a local variable instead, just like "tunnel_hlen" in net/ipv4/ip_gre.c:gre_fb_xmit(). Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/ipv6/ip6_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 9a0263f25232..949d6fbc1ca0 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -743,6 +743,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; __be16 flags; + int tun_hlen; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || @@ -760,9 +761,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, dsfield = key->tos; flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); - tunnel->tun_hlen = gre_calc_hlen(flags); + tun_hlen = gre_calc_hlen(flags); - gre_build_header(skb, tunnel->tun_hlen, + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? 
htonl(tunnel->o_seqno++) -- Gitee From be914f9d3ae55f43b0733a64e2b769bcd02d7d0c Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 26 Jul 2022 17:38:34 +0800 Subject: [PATCH 037/132] ip6_gre: Fix skb_under_panic in __gre6_xmit() stable inclusion from stable-v5.10.113 commit 93366275be72bc55cada31a2a1273147553acab8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=93366275be72bc55cada31a2a1273147553acab8 -------------------------------- [ Upstream commit ab198e1d0dd8dc4bc7575fb50758e2cbd51e14e1 ] Feng reported an skb_under_panic BUG triggered by running test_ip6gretap() in tools/testing/selftests/bpf/test_tunnel.sh: [ 82.492551] skbuff: skb_under_panic: text:ffffffffb268bb8e len:403 put:12 head:ffff9997c5480000 data:ffff9997c547fff8 tail:0x18b end:0x2c0 dev:ip6gretap11 <...> [ 82.607380] Call Trace: [ 82.609389] [ 82.611136] skb_push.cold.109+0x10/0x10 [ 82.614289] __gre6_xmit+0x41e/0x590 [ 82.617169] ip6gre_tunnel_xmit+0x344/0x3f0 [ 82.620526] dev_hard_start_xmit+0xf1/0x330 [ 82.623882] sch_direct_xmit+0xe4/0x250 [ 82.626961] __dev_queue_xmit+0x720/0xfe0 <...> [ 82.633431] packet_sendmsg+0x96a/0x1cb0 [ 82.636568] sock_sendmsg+0x30/0x40 <...> The following sequence of events caused the BUG: 1. During ip6gretap device initialization, tunnel->tun_hlen (e.g. 4) is calculated based on old flags (see ip6gre_calc_hlen()); 2. packet_snd() reserves header room for skb A, assuming tunnel->tun_hlen is 4; 3. Later (in clsact Qdisc), the eBPF program sets a new tunnel key for skb A using bpf_skb_set_tunnel_key() (see _ip6gretap_set_tunnel()); 4. __gre6_xmit() detects the new tunnel key, and recalculates "tun_hlen" (e.g. 12) based on new flags (e.g. TUNNEL_KEY and TUNNEL_SEQ); 5. gre_build_header() calls skb_push() with insufficient reserved header room, triggering the BUG. As sugguested by Cong, fix it by moving the call to skb_cow_head() after the recalculation of tun_hlen. Reproducer: OBJ=$LINUX/tools/testing/selftests/bpf/test_tunnel_kern.o ip netns add at_ns0 ip link add veth0 type veth peer name veth1 ip link set veth0 netns at_ns0 ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip link set dev veth1 up mtu 1500 ip addr add dev veth1 172.16.1.200/24 ip netns exec at_ns0 ip addr add ::11/96 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip addr add dev veth1 ::22/96 ip link set dev veth1 up ip netns exec at_ns0 \ ip link add dev ip6gretap00 type ip6gretap seq flowlabel 0xbcdef key 2 \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev ip6gretap00 10.1.1.100/24 ip netns exec at_ns0 ip addr add dev ip6gretap00 fc80::100/96 ip netns exec at_ns0 ip link set dev ip6gretap00 up ip link add dev ip6gretap11 type ip6gretap external ip addr add dev ip6gretap11 10.1.1.200/24 ip addr add dev ip6gretap11 fc80::200/24 ip link set dev ip6gretap11 up tc qdisc add dev ip6gretap11 clsact tc filter add dev ip6gretap11 egress bpf da obj $OBJ sec ip6gretap_set_tunnel tc filter add dev ip6gretap11 ingress bpf da obj $OBJ sec ip6gretap_get_tunnel ping6 -c 3 -w 10 -q ::11 Fixes: 6712abc168eb ("ip6_gre: add ip6 gre and gretap collect_md mode") Reported-by: Feng Zhou Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/ipv6/ip6_gre.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 949d6fbc1ca0..1f6c752f13b4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -733,9 +733,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) - return -ENOMEM; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; @@ -763,6 +760,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tun_hlen = gre_calc_hlen(flags); + if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) + return -ENOMEM; + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), @@ -773,6 +773,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; + if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) + return -ENOMEM; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); -- Gitee From cf223164578f4184522fc77982b2d795305ba27e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:35 +0800 Subject: [PATCH 038/132] net/sched: cls_u32: fix possible leak in u32_init_knode() stable inclusion from stable-v5.10.113 commit 0ac8f83d8f6471e39b3cfe8ee48560df3bf9c451 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0ac8f83d8f6471e39b3cfe8ee48560df3bf9c451 -------------------------------- [ Upstream commit ec5b0f605b105457f257f2870acad4a5d463984b ] While investigating a related syzbot report, I found that whenever call to tcf_exts_init() from u32_init_knode() is failing, we end up with an elevated refcount on ht->refcnt To avoid that, only increase the refcount after all possible errors have been evaluated. Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/sched/cls_u32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b61db335c49d..da042bc8b239 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -814,10 +814,6 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); - /* bump reference count as long as we hold pointer to structure */ - if (ht) - ht->refcnt++; - #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update * so we must keep them in tact. 
When the node is later destroyed @@ -839,6 +835,10 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, return NULL; } + /* bump reference count as long as we hold pointer to structure */ + if (ht) + ht->refcnt++; + return new; } -- Gitee From aa8ab17792740416d50918eaf63bc38ed591fbc2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Jul 2022 17:38:36 +0800 Subject: [PATCH 039/132] l3mdev: l3mdev_master_upper_ifindex_by_index_rcu should be using netdev_master_upper_dev_get_rcu stable inclusion from stable-v5.10.113 commit 078d839f11acfa9bf5b549d2189abda9f2a943a0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=078d839f11acfa9bf5b549d2189abda9f2a943a0 -------------------------------- [ Upstream commit 83daab06252ee5d0e1f4373ff28b79304945fc19 ] Next patch uses l3mdev_master_upper_ifindex_by_index_rcu which throws a splat with debug kernels: [13783.087570] ------------[ cut here ]------------ [13783.093974] RTNL: assertion failed at net/core/dev.c (6702) [13783.100761] WARNING: CPU: 3 PID: 51132 at net/core/dev.c:6702 netdev_master_upper_dev_get+0x16a/0x1a0 [13783.184226] CPU: 3 PID: 51132 Comm: kworker/3:3 Not tainted 5.17.0-custom-100090-g6f963aafb1cc #682 [13783.194788] Hardware name: Mellanox Technologies Ltd. MSN2010/SA002610, BIOS 5.6.5 08/24/2017 [13783.204755] Workqueue: mld mld_ifc_work [ipv6] [13783.210338] RIP: 0010:netdev_master_upper_dev_get+0x16a/0x1a0 [13783.217209] Code: 0f 85 e3 fe ff ff e8 65 ac ec fe ba 2e 1a 00 00 48 c7 c6 60 6f 38 83 48 c7 c7 c0 70 38 83 c6 05 5e b5 d7 01 01 e8 c6 29 52 00 <0f> 0b e9 b8 fe ff ff e8 5a 6c 35 ff e9 1c ff ff ff 48 89 ef e8 7d [13783.238659] RSP: 0018:ffffc9000b37f5a8 EFLAGS: 00010286 [13783.244995] RAX: 0000000000000000 RBX: ffff88812ee5c000 RCX: 0000000000000000 [13783.253379] RDX: ffff88811ce09d40 RSI: ffffffff812d0fcd RDI: fffff5200166fea7 [13783.261769] RBP: 0000000000000000 R08: 0000000000000001 R09: ffff8882375f4287 [13783.270138] R10: ffffed1046ebe850 R11: 0000000000000001 R12: dffffc0000000000 [13783.278510] R13: 0000000000000275 R14: ffffc9000b37f688 R15: ffff8881273b4af8 [13783.286870] FS: 0000000000000000(0000) GS:ffff888237400000(0000) knlGS:0000000000000000 [13783.296352] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [13783.303177] CR2: 00007ff25fc9b2e8 CR3: 0000000174d23000 CR4: 00000000001006e0 [13783.311546] Call Trace: [13783.314660] [13783.317553] l3mdev_master_upper_ifindex_by_index_rcu+0x43/0xe0 ... Change l3mdev_master_upper_ifindex_by_index_rcu to use netdev_master_upper_dev_get_rcu. Fixes: 6a6d6681ac1a ("l3mdev: add function to retreive upper master") Signed-off-by: Ido Schimmel Signed-off-by: David Ahern Cc: Alexis Bauvin Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/l3mdev/l3mdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 864326f150e2..f2c3a61ad134 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -147,7 +147,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) dev = dev_get_by_index_rcu(net, ifindex); while (dev && !netif_is_l3_master(dev)) - dev = netdev_master_upper_dev_get(dev); + dev = netdev_master_upper_dev_get_rcu(dev); return dev ? 
dev->ifindex : 0; } -- Gitee From 3bd107cbe765f64dbde8b4282a01282e5a9be749 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:37 +0800 Subject: [PATCH 040/132] ipv6: make ip6_rt_gc_expire an atomic_t stable inclusion from stable-v5.10.113 commit 49516e6ed91434d022a800321a8bc7d8054f62ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=49516e6ed91434d022a800321a8bc7d8054f62ac -------------------------------- [ Upstream commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db ] Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1] There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field. Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway. [1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 value changed: 0x00000bb3 -> 0x00000ba9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Conflicts: include/net/netns/ipv6.h Acked-by: Xie XiuQi --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index b2a28201f4fd..35e0c4a6c71f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -79,8 +79,8 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a44ad9637e8a..4ef59dc515e5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3187,6 +3187,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = 
net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -3197,13 +3198,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -6358,7 +6359,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: -- Gitee From 6007bb2484ce25d53b3df5fb1dba2aea01cef119 Mon Sep 17 00:00:00 2001 From: Xu Jia Date: Tue, 26 Jul 2022 17:38:38 +0800 Subject: [PATCH 041/132] ipv6: fix kabi for ip6_rt_gc_expire in struct netns_ipv6 hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH -------------------------------- Making ip6_rt_gc_expire atomic breaks the KABI of struct netns_ipv6. This patch uses KABI_REPLACE to fix it. Signed-off-by: Xu Jia Signed-off-by: Zheng Zengkai Reviewed-by: Yue Haibing Acked-by: Xie XiuQi --- include/net/netns/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 35e0c4a6c71f..31d0cf3c7377 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -79,7 +79,7 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - atomic_t ip6_rt_gc_expire; + KABI_REPLACE(unsigned int ip6_rt_gc_expire, atomic_t ip6_rt_gc_expire) unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; -- Gitee From d7434e9cbc0f809add85594b97bcd8731050cfe9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:39 +0800 Subject: [PATCH 042/132] netlink: reset network and mac headers in netlink_dump() stable inclusion from stable-v5.10.113 commit 3d55b195747c1259c994b868a509beae947a7c6e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3d55b195747c1259c994b868a509beae947a7c6e -------------------------------- [ Upstream commit 99c07327ae11e24886d552dddbe4537bfca2765d ] netlink_dump() is allocating an skb, reserves space in it but forgets to reset network header. This allows a BPF program, invoked later from sk_filter() to access uninitialized kernel memory from the reserved space. Theorically mac header reset could be omitted, because it is set to a special initial value. bpf_internal_load_pointer_neg_helper calls skb_mac_header() without checking skb_mac_header_was_set(). Relying on skb->len not being too big seems fragile. We also could add a sanity check in bpf_internal_load_pointer_neg_helper() to avoid surprises in the future. 
syzbot report was: BUG: KMSAN: uninit-value in ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637 ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756 bpf_prog_run_save_cb include/linux/filter.h:770 [inline] sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150 sk_filter include/linux/filter.h:905 [inline] netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was stored to memory at: ___bpf_prog_run+0x96c/0xb420 kernel/bpf/core.c:1558 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756 bpf_prog_run_save_cb include/linux/filter.h:770 [inline] sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150 sk_filter include/linux/filter.h:905 [inline] netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_post_alloc_hook mm/slab.h:737 [inline] slab_alloc_node mm/slub.c:3244 [inline] __kmalloc_node_track_caller+0xde3/0x14f0 mm/slub.c:4972 kmalloc_reserve net/core/skbuff.c:354 [inline] __alloc_skb+0x545/0xf90 net/core/skbuff.c:426 alloc_skb include/linux/skbuff.h:1158 [inline] netlink_dump+0x30f/0x16c0 net/netlink/af_netlink.c:2242 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae CPU: 0 PID: 3470 Comm: syz-executor751 Not tainted 5.17.0-syzkaller #0 Hardware name: Google Google 
Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: db65a3aaf29e ("netlink: Trim skb to alloc size to avoid MSG_TRUNC") Fixes: 9063e21fb026 ("netlink: autosize skb lengthes") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220415181442.551228-1-eric.dumazet@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/netlink/af_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f37916156ca5..cbfb601c4ee9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2276,6 +2276,13 @@ static int netlink_dump(struct sock *sk) * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); + + /* Make sure malicious BPF programs can not read unitialized memory + * from skb->head -> skb->data + */ + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { -- Gitee From ceaf8ddd59c3450db33261efc14a28835bafb9ce Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 26 Jul 2022 17:38:40 +0800 Subject: [PATCH 043/132] net: stmmac: Use readl_poll_timeout_atomic() in atomic state stable inclusion from stable-v5.10.113 commit f593f49fcd174ebe0ebeebecf46fadc134593ddb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f593f49fcd174ebe0ebeebecf46fadc134593ddb -------------------------------- [ Upstream commit 234901de2bc6847eaa0aeb4aba62c31ffb8d3ad6 ] The init_systime() may be invoked in atomic state. We have observed the following call trace when running "phc_ctl /dev/ptp0 set" on a Intel Agilex board. BUG: sleeping function called from invalid context at drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c:74 in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 381, name: phc_ctl preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 Preemption disabled at: [] stmmac_set_time+0x34/0x8c CPU: 2 PID: 381 Comm: phc_ctl Not tainted 5.18.0-rc2-next-20220414-yocto-standard+ #567 Hardware name: SoCFPGA Agilex SoCDK (DT) Call trace: dump_backtrace.part.0+0xc4/0xd0 show_stack+0x24/0x40 dump_stack_lvl+0x7c/0xa0 dump_stack+0x18/0x34 __might_resched+0x154/0x1c0 __might_sleep+0x58/0x90 init_systime+0x78/0x120 stmmac_set_time+0x64/0x8c ptp_clock_settime+0x60/0x9c pc_clock_settime+0x6c/0xc0 __arm64_sys_clock_settime+0x88/0xf0 invoke_syscall+0x5c/0x130 el0_svc_common.constprop.0+0x4c/0x100 do_el0_svc+0x7c/0xa0 el0_svc+0x58/0xcc el0t_64_sync_handler+0xa4/0x130 el0t_64_sync+0x18c/0x190 So we should use readl_poll_timeout_atomic() here instead of readl_poll_timeout(). Also adjust the delay time to 10us to fix a "__bad_udelay" build error reported by "kernel test robot ". I have tested this on Intel Agilex and NXP S32G boards, there is no delay needed at all. So the 10us delay should be long enough for most cases. Fixes: ff8ed737860e ("net: stmmac: use readl_poll_timeout() function in init_systime()") Signed-off-by: Kevin Hao Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 07b1b8374cd2..53efcc9c40e2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -68,9 +68,9 @@ static int init_systime(void __iomem *ioaddr, u32 sec, u32 nsec) writel(value, ioaddr + PTP_TCR); /* wait for present system time initialize to complete */ - return readl_poll_timeout(ioaddr + PTP_TCR, value, + return readl_poll_timeout_atomic(ioaddr + PTP_TCR, value, !(value & PTP_TCR_TSINIT), - 10000, 100000); + 10, 100000); } static int config_addend(void __iomem *ioaddr, u32 addend) -- Gitee From 1d394f14128d12cf5b6313d895622aad5fd5a5ba Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Jul 2022 17:38:41 +0800 Subject: [PATCH 044/132] dmaengine: idxd: add RO check for wq max_batch_size write stable inclusion from stable-v5.10.113 commit 9a3c026dc3a59fa23c63d90752f07538a3c75646 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a3c026dc3a59fa23c63d90752f07538a3c75646 -------------------------------- [ Upstream commit 66903461ffed0b66fc3e0200082d4e09365aacdc ] Block wq_max_batch_size_store() when the device is configured as read-only and not configurable. Fixes: e7184b159dd3 ("dmaengine: idxd: add support for configurable max wq batch size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971493551.2201159.1942042593642155209.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 7b41cdff1a2c..5bf4b4be64e4 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1132,6 +1132,9 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu u64 batch_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; -- Gitee From c324bed6bcd3c12392271dddbbbd03c23acddd8d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Jul 2022 17:38:42 +0800 Subject: [PATCH 045/132] dmaengine: idxd: add RO check for wq max_transfer_size write stable inclusion from stable-v5.10.113 commit 520aab8b723cb2bffef957844b634af04e96428d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=520aab8b723cb2bffef957844b634af04e96428d -------------------------------- [ Upstream commit 505a2d1032ae656b0a8c736be110255503941cde ] Block wq_max_transfer_size_store() when the device is configured as read-only and not configurable. 
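A minimal sketch of the guard being added, using placeholder names rather than the idxd structures: the configurable check runs before any other state check, so a read-only device rejects the write outright.

#include <stdbool.h>
#include <stdio.h>

struct dev_state {
	bool configurable;      /* device may be reconfigured from sysfs */
	bool wq_disabled;       /* queue state */
};

static int max_xfer_store(const struct dev_state *d, unsigned long val)
{
	if (!d->configurable)
		return -1;      /* -EPERM in the driver: read-only device */
	if (!d->wq_disabled)
		return -1;      /* -EPERM: queue must be disabled first */
	printf("accepted max transfer size %lu\n", val);
	return 0;
}

int main(void)
{
	struct dev_state ro = { .configurable = false, .wq_disabled = true };

	printf("store on read-only device: %d\n", max_xfer_store(&ro, 2048UL));
	return 0;
}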
Fixes: d7aad5550eca ("dmaengine: idxd: add support for configurable max wq xfer size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971488154.2200913.10706665404118545941.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 5bf4b4be64e4..51af0dfc3c63 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1098,6 +1098,9 @@ static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attr u64 xfer_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; -- Gitee From aec8147faa831c7bd51628f95d992afe0942c107 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 26 Jul 2022 17:38:43 +0800 Subject: [PATCH 046/132] selftests: mlxsw: vxlan_flooding: Prevent flooding of unwanted packets stable inclusion from stable-v5.10.113 commit 3bf8ca35017024fa1cad55344f798cd5cd131c16 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3bf8ca35017024fa1cad55344f798cd5cd131c16 -------------------------------- [ Upstream commit 044011fdf162c5dd61c02841930c8f438a9adadb ] The test verifies that packets are correctly flooded by the bridge and the VXLAN device by matching on the encapsulated packets at the other end. However, if packets other than those generated by the test also ingress the bridge (e.g., MLD packets), they will be flooded as well and interfere with the expected count. Make the test more robust by making sure that only the packets generated by the test can ingress the bridge. Drop all the rest using tc filters on the egress of 'br0' and 'h1'. In the software data path, the problem can be solved by matching on the inner destination MAC or dropping unwanted packets at the egress of the VXLAN device, but this is not currently supported by mlxsw. Fixes: 94d302deae25 ("selftests: mlxsw: Add a test for VxLAN flooding") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../drivers/net/mlxsw/vxlan_flooding.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh index fedcb7b35af9..af5ea50ed5c0 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh @@ -172,6 +172,17 @@ flooding_filters_add() local lsb local i + # Prevent unwanted packets from entering the bridge and interfering + # with the test. 
+ tc qdisc add dev br0 clsact + tc filter add dev br0 egress protocol all pref 1 handle 1 \ + matchall skip_hw action drop + tc qdisc add dev $h1 clsact + tc filter add dev $h1 egress protocol all pref 1 handle 1 \ + flower skip_hw dst_mac de:ad:be:ef:13:37 action pass + tc filter add dev $h1 egress protocol all pref 2 handle 2 \ + matchall skip_hw action drop + tc qdisc add dev $rp2 clsact for i in $(eval echo {1..$num_remotes}); do @@ -194,6 +205,12 @@ flooding_filters_del() done tc qdisc del dev $rp2 clsact + + tc filter del dev $h1 egress protocol all pref 2 handle 2 matchall + tc filter del dev $h1 egress protocol all pref 1 handle 1 flower + tc qdisc del dev $h1 clsact + tc filter del dev br0 egress protocol all pref 1 handle 1 matchall + tc qdisc del dev br0 clsact } flooding_check_packets() -- Gitee From a260b077acedec14a79fb4dc4e06619ac344af7f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 26 Jul 2022 17:38:44 +0800 Subject: [PATCH 047/132] arm64/mm: Remove [PUD|PMD]_TABLE_BIT from [pud|pmd]_bad() stable inclusion from stable-v5.10.113 commit 18ff7a2efa4e14a6bd86aad31a6a8cebdf0dbabc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=18ff7a2efa4e14a6bd86aad31a6a8cebdf0dbabc -------------------------------- [ Upstream commit e377ab82311af95c99648c6424a6b888a0ccb102 ] Semantics wise, [pud|pmd]_bad() have always implied that a given [PUD|PMD] entry does not have a pointer to the next level page table. This had been made clear in the commit a1c76574f345 ("arm64: mm: use *_sect to check for section maps"). Hence explicitly check for a table entry rather than just testing a single bit. This basically redefines [pud|pmd]_bad() in terms of [pud|pmd]_table() making the semantics clear. 
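To see why testing the full two-bit descriptor type differs from testing a single bit, here is a standalone sketch; the constants mirror the arm64 encoding (0b11 table, 0b01 section), but the code is illustrative, not the kernel definitions.

#include <stdio.h>
#include <stdint.h>

#define TYPE_MASK  0x3ULL
#define TYPE_TABLE 0x3ULL
#define TABLE_BIT  0x2ULL

static int bad_old(uint64_t pmd) { return !(pmd & TABLE_BIT); }                 /* one bit */
static int bad_new(uint64_t pmd) { return (pmd & TYPE_MASK) != TYPE_TABLE; }    /* full type */

int main(void)
{
	uint64_t entry = TABLE_BIT;     /* bit 1 set, yet not a valid table descriptor */

	/* old test claims the entry is fine, new test flags it as bad */
	printf("old pmd_bad() = %d, new pmd_bad() = %d\n", bad_old(entry), bad_new(entry));
	return 0;
}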
Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1620644871-26280-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/include/asm/pgtable.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index daed33697d98..19d18bca6958 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -516,13 +516,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) - #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) #define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_bad(pmd) (!pmd_table(pmd)) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 static inline bool pud_sect(pud_t pud) { return false; } @@ -606,7 +605,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) +#define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) #define pud_leaf(pud) pud_sect(pud) #define pud_valid(pud) pte_valid(pud_pte(pud)) -- Gitee From 286861f72d40a9d40e51210f620f52c090483b8a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 26 Jul 2022 17:38:45 +0800 Subject: [PATCH 048/132] arm64: mm: fix p?d_leaf() stable inclusion from stable-v5.10.113 commit 052e4a661f90d27586b4364764dcfb2c022b00ee category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=052e4a661f90d27586b4364764dcfb2c022b00ee -------------------------------- [ Upstream commit 23bc8f69f0eceecbb87c3801d2e48827d2dca92b ] The pmd_leaf() is used to test a leaf mapped PMD, however, it misses the PROT_NONE mapped PMD on arm64. Fix it. A real world issue [1] caused by this was reported by Qian Cai. Also fix pud_leaf(). 
Link: https://patchwork.kernel.org/comment/24798260/ [1] Fixes: 8aa82df3c123 ("arm64: mm: add p?d_leaf() definitions") Reported-by: Qian Cai Signed-off-by: Muchun Song Link: https://lore.kernel.org/r/20220422060033.48711-1-songmuchun@bytedance.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 19d18bca6958..ab2443900f4e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -520,7 +520,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) -#define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) #define pmd_bad(pmd) (!pmd_table(pmd)) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 @@ -607,7 +607,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) -#define pud_leaf(pud) pud_sect(pud) +#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #define pud_valid(pud) pte_valid(pud_pte(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) -- Gitee From be31be301f9aa6457bb1cd1ac8ba374db3ecba79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 26 Jul 2022 17:38:46 +0800 Subject: [PATCH 049/132] ARM: vexpress/spc: Avoid negative array index when !SMP stable inclusion from stable-v5.10.113 commit d513ea9b7ef822b638857c7114d26af108e4bab1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d513ea9b7ef822b638857c7114d26af108e4bab1 -------------------------------- [ Upstream commit b3f1dd52c991d79118f35e6d1bf4d7cb09882e38 ] When building multi_v7_defconfig+CONFIG_SMP=n, -Warray-bounds exposes a couple negative array index accesses: arch/arm/mach-vexpress/spc.c: In function 've_spc_clk_init': arch/arm/mach-vexpress/spc.c:583:21: warning: array subscript -1 is below array bounds of 'bool[2]' {aka '_Bool[2]'} [-Warray-bounds] 583 | if (init_opp_table[cluster]) | ~~~~~~~~~~~~~~^~~~~~~~~ arch/arm/mach-vexpress/spc.c:556:7: note: while referencing 'init_opp_table' 556 | bool init_opp_table[MAX_CLUSTERS] = { false }; | ^~~~~~~~~~~~~~ arch/arm/mach-vexpress/spc.c:592:18: warning: array subscript -1 is below array bounds of 'bool[2]' {aka '_Bool[2]'} [-Warray-bounds] 592 | init_opp_table[cluster] = true; | ~~~~~~~~~~~~~~^~~~~~~~~ arch/arm/mach-vexpress/spc.c:556:7: note: while referencing 'init_opp_table' 556 | bool init_opp_table[MAX_CLUSTERS] = { false }; | ^~~~~~~~~~~~~~ Skip this logic when built !SMP. 
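A standalone sketch of the added guard, with get_cluster() standing in for topology_physical_package_id(): a lookup that may return a negative value has to be range-checked before it is used as an array index.

#include <stdbool.h>
#include <stdio.h>

#define MAX_CLUSTERS 2

static int get_cluster(int cpu)
{
	(void)cpu;
	return -1;              /* what the lookup can report when !SMP */
}

int main(void)
{
	bool init_opp_table[MAX_CLUSTERS] = { false };
	int cluster = get_cluster(0);

	/* Fixed pattern: reject negative indices before touching the array. */
	if (cluster < 0 || init_opp_table[cluster]) {
		puts("skipping: no valid cluster index");
		return 0;
	}

	init_opp_table[cluster] = true;
	return 0;
}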
Link: https://lore.kernel.org/r/20220331190443.851661-1-keescook@chromium.org Cc: Liviu Dudau Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Acked-by: Liviu Dudau Signed-off-by: Kees Cook Signed-off-by: Sudeep Holla Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm/mach-vexpress/spc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-vexpress/spc.c b/arch/arm/mach-vexpress/spc.c index 1da11bdb1dfb..1c6500c4e6a1 100644 --- a/arch/arm/mach-vexpress/spc.c +++ b/arch/arm/mach-vexpress/spc.c @@ -580,7 +580,7 @@ static int __init ve_spc_clk_init(void) } cluster = topology_physical_package_id(cpu_dev->id); - if (init_opp_table[cluster]) + if (cluster < 0 || init_opp_table[cluster]) continue; if (ve_init_opp_table(cpu_dev)) -- Gitee From c8a5122e572911c692240f7a894883779217a520 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Tue, 26 Jul 2022 17:38:47 +0800 Subject: [PATCH 050/132] reset: tegra-bpmp: Restore Handle errors in BPMP response stable inclusion from stable-v5.10.113 commit cb17b56a9b4de46c057691b524c692c10e8d9d51 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cb17b56a9b4de46c057691b524c692c10e8d9d51 -------------------------------- [ Upstream commit d1da1052ffad63aa5181b69f20a6952e31f339c2 ] This reverts following commit 69125b4b9440 ("reset: tegra-bpmp: Revert Handle errors in BPMP response"). The Tegra194 HDA reset failure is fixed by commit d278dc9151a0 ("ALSA: hda/tegra: Fix Tegra194 HDA reset failure"). The temporary revert of original commit c045ceb5a145 ("reset: tegra-bpmp: Handle errors in BPMP response") can be removed now. 
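The restored logic checks two failure levels: the transfer itself and the status code carried in the firmware reply. A minimal sketch with placeholder names (do_transfer() stands in for tegra_bpmp_transfer()):

#include <stdio.h>

struct reply {
	int ret;                /* status reported back by the firmware */
};

static int do_transfer(struct reply *r)
{
	r->ret = -5;            /* transport worked, firmware rejected the request */
	return 0;
}

static int reset_common(void)
{
	struct reply r;
	int err = do_transfer(&r);

	if (err)
		return err;     /* transport-level failure */
	if (r.ret)
		return -22;     /* firmware-level failure (-EINVAL in the driver) */
	return 0;
}

int main(void)
{
	printf("reset_common() = %d\n", reset_common());
	return 0;
}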
Signed-off-by: Sameer Pujar Tested-by: Jon Hunter Reviewed-by: Jon Hunter Acked-by: Thierry Reding Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/1641995806-15245-1-git-send-email-spujar@nvidia.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/reset/tegra/reset-bpmp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/reset/tegra/reset-bpmp.c b/drivers/reset/tegra/reset-bpmp.c index 24d3395964cc..4c5bba52b105 100644 --- a/drivers/reset/tegra/reset-bpmp.c +++ b/drivers/reset/tegra/reset-bpmp.c @@ -20,6 +20,7 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, struct tegra_bpmp *bpmp = to_tegra_bpmp(rstc); struct mrq_reset_request request; struct tegra_bpmp_message msg; + int err; memset(&request, 0, sizeof(request)); request.cmd = command; @@ -30,7 +31,13 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, msg.tx.data = &request; msg.tx.size = sizeof(request); - return tegra_bpmp_transfer(bpmp, &msg); + err = tegra_bpmp_transfer(bpmp, &msg); + if (err) + return err; + if (msg.rx.ret) + return -EINVAL; + + return 0; } static int tegra_bpmp_reset_module(struct reset_controller_dev *rstc, -- Gitee From f0894049e9ec662a3fbb60d84f3124663a1efb9b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 26 Jul 2022 17:38:48 +0800 Subject: [PATCH 051/132] platform/x86: samsung-laptop: Fix an unsigned comparison which can never be negative stable inclusion from stable-v5.10.113 commit 490815f0b50e42238a22ea4b2dc64105f5a59e6f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=490815f0b50e42238a22ea4b2dc64105f5a59e6f -------------------------------- [ Upstream commit 0284d4d1be753f648f28b77bdfbe6a959212af5c ] Eliminate the follow smatch warnings: drivers/platform/x86/samsung-laptop.c:1124 kbd_led_set() warn: unsigned 'value' is never less than zero. 
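The removed branch was dead code because the comparison can never be true for an unsigned type, as this standalone snippet shows (compilers with -Wtype-limits flag exactly this):

#include <stdio.h>

int main(void)
{
	unsigned int value = (unsigned int)-5;  /* wraps to a large positive value */

	if (value < 0)                          /* always false for unsigned types */
		puts("never reached");
	else
		printf("value = %u; the < 0 test is meaningless\n", value);
	return 0;
}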
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220322061830.105579-1-jiapeng.chong@linux.alibaba.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/platform/x86/samsung-laptop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c index d5cec6e35bb8..0e456c39a603 100644 --- a/drivers/platform/x86/samsung-laptop.c +++ b/drivers/platform/x86/samsung-laptop.c @@ -1121,8 +1121,6 @@ static void kbd_led_set(struct led_classdev *led_cdev, if (value > samsung->kbd_led.max_brightness) value = samsung->kbd_led.max_brightness; - else if (value < 0) - value = 0; samsung->kbd_led_wk = value; queue_work(samsung->led_workqueue, &samsung->kbd_led_work); -- Gitee From 45f7650be7ed083d437c26a46a5148ee50c5641b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:49 +0800 Subject: [PATCH 052/132] ALSA: usb-audio: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit cd227ac03f2aa918ee3672fe2a5960dd0e7be331 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cd227ac03f2aa918ee3672fe2a5960dd0e7be331 -------------------------------- [ Upstream commit 1ef8715975de8bd481abbd0839ed4f49d9e5b0ff ] Fix: sound/usb/midi.c: In function ‘snd_usbmidi_out_endpoint_create’: sound/usb/midi.c:1389:2: error: case label does not reduce to an integer constant case USB_ID(0xfc08, 0x0101): /* Unknown vendor Cable */ ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. [ A slight correction with parentheses around the argument by tiwai ] Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220405151517.29753-3-bp@alien8.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/usb/usbaudio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index e54a98f46549..d8e31ee03b9d 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -8,7 +8,7 @@ */ /* handling of USB vendor/product ID pairs as 32-bit numbers */ -#define USB_ID(vendor, product) (((vendor) << 16) | (product)) +#define USB_ID(vendor, product) (((unsigned int)(vendor) << 16) | (product)) #define USB_ID_VENDOR(id) ((id) >> 16) #define USB_ID_PRODUCT(id) ((u16)(id)) -- Gitee From bade8d4a2dbd0d43173f01e1a0ec525814c13f44 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 26 Jul 2022 17:38:50 +0800 Subject: [PATCH 053/132] arm64: dts: imx: Fix imx8*-var-som touchscreen property sizes stable inclusion from stable-v5.10.113 commit 8e7ea11364758d43e577b7835b8e98f27927d56c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8e7ea11364758d43e577b7835b8e98f27927d56c -------------------------------- [ Upstream commit 1bc12d301594eafde0a8529d28d459af81053b3a ] The common touchscreen properties are all 32-bit, not 16-bit. These properties must not be too important as they are all ignored in case of an error reading them. 
Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/Yk3moe6Hz8ELM0iS@robh.at.kernel.org' Signed-off-by: Arnd Bergmann Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi | 8 ++++---- arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi index 49082529764f..0fac1f3f7f47 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi @@ -89,12 +89,12 @@ touchscreen@0 { pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <125>; - touchscreen-size-x = /bits/ 16 <4008>; + touchscreen-size-x = <4008>; ti,y-min = /bits/ 16 <282>; - touchscreen-size-y = /bits/ 16 <3864>; + touchscreen-size-y = <3864>; ti,x-plate-ohms = /bits/ 16 <180>; - touchscreen-max-pressure = /bits/ 16 <255>; - touchscreen-average-samples = /bits/ 16 <10>; + touchscreen-max-pressure = <255>; + touchscreen-average-samples = <10>; ti,debounce-tol = /bits/ 16 <3>; ti,debounce-rep = /bits/ 16 <1>; ti,settle-delay-usec = /bits/ 16 <150>; diff --git a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi index 7f356edf9f91..f6287f174355 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi @@ -70,12 +70,12 @@ touchscreen@0 { pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <125>; - touchscreen-size-x = /bits/ 16 <4008>; + touchscreen-size-x = <4008>; ti,y-min = /bits/ 16 <282>; - touchscreen-size-y = /bits/ 16 <3864>; + touchscreen-size-y = <3864>; ti,x-plate-ohms = /bits/ 16 <180>; - touchscreen-max-pressure = /bits/ 16 <255>; - touchscreen-average-samples = /bits/ 16 <10>; + touchscreen-max-pressure = <255>; + touchscreen-average-samples = <10>; ti,debounce-tol = /bits/ 16 <3>; ti,debounce-rep = /bits/ 16 <1>; ti,settle-delay-usec = /bits/ 16 <150>; -- Gitee From b9a0a6d062e89d410cd33f03b17ed7840e096128 Mon Sep 17 00:00:00 2001 From: Hongbin Wang Date: Tue, 26 Jul 2022 17:38:51 +0800 Subject: [PATCH 054/132] vxlan: fix error return code in vxlan_fdb_append stable inclusion from stable-v5.10.113 commit e129c55153c8aba74e4c73451399f12dacf6347d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e129c55153c8aba74e4c73451399f12dacf6347d -------------------------------- [ Upstream commit 7cea5560bf656b84f9ed01c0cc829d4eecd0640b ] When kmalloc and dst_cache_init failed, should return ENOMEM rather than ENOBUFS. Signed-off-by: Hongbin Wang Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c362a6ac94c4..73953b0141ba 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -710,11 +710,11 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) - return -ENOBUFS; + return -ENOMEM; if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { kfree(rd); - return -ENOBUFS; + return -ENOMEM; } rd->remote_ip = *ip; -- Gitee From 62f154bc19641ca00a8b926c5d28d3d1271eea1c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Jul 2022 17:38:52 +0800 Subject: [PATCH 055/132] cifs: Check the IOCB_DIRECT flag, not O_DIRECT stable inclusion from stable-v5.10.113 commit 5bef9fc38ffa3e0a712fcd1e07f4f123ba160242 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5bef9fc38ffa3e0a712fcd1e07f4f123ba160242 -------------------------------- [ Upstream commit 994fd530a512597ffcd713b0f6d5bc916c5698f0 ] Use the IOCB_DIRECT indicator flag on the I/O context rather than checking to see if the file was opened O_DIRECT. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index aa5a4d759ca2..370188b2a55d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -898,7 +898,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); -- Gitee From 652cce7aeaa6bf232f67c210d878f48733c34045 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 26 Jul 2022 17:38:53 +0800 Subject: [PATCH 056/132] net: atlantic: Avoid out-of-bounds indexing stable inclusion from stable-v5.10.113 commit 0de9c104d04aed5e464bd788abb423ed333980c7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0de9c104d04aed5e464bd788abb423ed333980c7 -------------------------------- [ Upstream commit 8d3a6c37d50d5a0504c126c932cc749e6dd9c78f ] UBSAN warnings are observed on atlantic driver: [ 294.432996] UBSAN: array-index-out-of-bounds in /build/linux-Qow4fL/linux-5.15.0/drivers/net/ethernet/aquantia/atlantic/aq_nic.c:484:48 [ 294.433695] index 8 is out of range for type 'aq_vec_s *[8]' The ring is dereferenced right before breaking out the loop, to prevent that from happening, only use the index in the loop to fix the issue. 
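[ Illustrative sketch, not part of the applied diff, with names
  simplified from the driver. The old idiom evaluates the third for()
  clause before re-checking the bound, so the element one past the end
  of the array is read on the final pass; the fix indexes inside the
  loop body instead:

	/* before: vec[count] is dereferenced once i reaches count */
	for (i = 0U, v = vec[0]; count > i; ++i, v = vec[i])
		use(v);

	/* after: check the bound first, then dereference */
	for (i = 0U; count > i; ++i) {
		v = vec[i];
		use(v);
	}
]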
BugLink: https://bugs.launchpad.net/bugs/1958770 Tested-by: Mario Limonciello Signed-off-by: Kai-Heng Feng Reviewed-by: Igor Russkikh Link: https://lore.kernel.org/r/20220408022204.16815-1-kai.heng.feng@canonical.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../net/ethernet/aquantia/atlantic/aq_nic.c | 8 +++---- .../net/ethernet/aquantia/atlantic/aq_vec.c | 24 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 0cf8ae8aeac8..2fb4126ae8d8 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -480,8 +480,8 @@ int aq_nic_start(struct aq_nic_s *self) if (err < 0) goto err_exit; - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { + for (i = 0U; self->aq_vecs > i; ++i) { + aq_vec = self->aq_vec[i]; err = aq_vec_start(aq_vec); if (err < 0) goto err_exit; @@ -511,8 +511,8 @@ int aq_nic_start(struct aq_nic_s *self) mod_timer(&self->polling_timer, jiffies + AQ_CFG_POLLING_TIMER_INTERVAL); } else { - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { + for (i = 0U; self->aq_vecs > i; ++i) { + aq_vec = self->aq_vec[i]; err = aq_pci_func_alloc_irq(self, i, self->ndev->name, aq_vec_isr, aq_vec, aq_vec_get_affinity_mask(aq_vec)); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index f4774cf051c9..6ab1f3212d24 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -43,8 +43,8 @@ static int aq_vec_poll(struct napi_struct *napi, int budget) if (!self) { err = -EINVAL; } else { - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; u64_stats_update_begin(&ring[AQ_VEC_RX_ID].stats.rx.syncp); ring[AQ_VEC_RX_ID].stats.rx.polls++; u64_stats_update_end(&ring[AQ_VEC_RX_ID].stats.rx.syncp); @@ -182,8 +182,8 @@ int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops, self->aq_hw_ops = aq_hw_ops; self->aq_hw = aq_hw; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; err = aq_ring_init(&ring[AQ_VEC_TX_ID], ATL_RING_TX); if (err < 0) goto err_exit; @@ -224,8 +224,8 @@ int aq_vec_start(struct aq_vec_s *self) unsigned int i = 0U; int err = 0; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; err = self->aq_hw_ops->hw_ring_tx_start(self->aq_hw, &ring[AQ_VEC_TX_ID]); if (err < 0) @@ -248,8 +248,8 @@ void aq_vec_stop(struct aq_vec_s *self) struct aq_ring_s *ring = NULL; unsigned int i = 0U; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; self->aq_hw_ops->hw_ring_tx_stop(self->aq_hw, &ring[AQ_VEC_TX_ID]); @@ -268,8 +268,8 @@ void aq_vec_deinit(struct aq_vec_s *self) if (!self) goto err_exit; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]); aq_ring_rx_deinit(&ring[AQ_VEC_RX_ID]); } @@ -297,8 
+297,8 @@ void aq_vec_ring_free(struct aq_vec_s *self) if (!self) goto err_exit; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; aq_ring_free(&ring[AQ_VEC_TX_ID]); if (i < self->rx_rings) aq_ring_free(&ring[AQ_VEC_RX_ID]); -- Gitee From e10c83e7cbb797c3b2d7dbe6c704fa84cebf6fc6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:54 +0800 Subject: [PATCH 057/132] mt76: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 202748f441488e08c730736bf5def1ae08f16811 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=202748f441488e08c730736bf5def1ae08f16811 -------------------------------- [ Upstream commit dbc2b1764734857d68425468ffa8486e97ab89df ] Fix: drivers/net/wireless/mediatek/mt76/mt76x2/pci.c: In function ‘mt76x2e_probe’: ././include/linux/compiler_types.h:352:38: error: call to ‘__compiletime_assert_946’ \ declared with attribute error: FIELD_PREP: mask is not constant _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Signed-off-by: Borislav Petkov Cc: Felix Fietkau Cc: Lorenzo Bianconi Cc: Ryder Lee Cc: Shayne Chen Cc: Sean Wang Cc: Kalle Valo Cc: "David S. Miller" Cc: Jakub Kicinski Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220405151517.29753-9-bp@alien8.de Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/wireless/mediatek/mt76/mt76x2/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c index ecaf85b483ac..e57e49a722dc 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c @@ -80,7 +80,7 @@ mt76x2e_probe(struct pci_dev *pdev, const struct pci_device_id *id) mt76_rmw_field(dev, 0x15a10, 0x1f << 16, 0x9); /* RG_SSUSB_G1_CDR_BIC_LTR = 0xf */ - mt76_rmw_field(dev, 0x15a0c, 0xf << 28, 0xf); + mt76_rmw_field(dev, 0x15a0c, 0xfU << 28, 0xf); /* RG_SSUSB_CDR_BR_PE1D = 0x3 */ mt76_rmw_field(dev, 0x15c58, 0x3 << 6, 0x3); -- Gitee From d89078f497c0d04cb4ec777eca7bc4a451c68b00 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:55 +0800 Subject: [PATCH 058/132] brcmfmac: sdio: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit b3afe5a7fd7548fa3a818f8dd20e22db4b8db019 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b3afe5a7fd7548fa3a818f8dd20e22db4b8db019 -------------------------------- [ Upstream commit 6fb3a5868b2117611f41e421e10e6a8c2a13039a ] Fix: drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c: In function ‘brcmf_sdio_drivestrengthinit’: drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:3798:2: error: case label does not reduce to an integer constant case 
SDIOD_DRVSTR_KEY(BRCM_CC_43143_CHIP_ID, 17): ^~~~ drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:3809:2: error: case label does not reduce to an integer constant case SDIOD_DRVSTR_KEY(BRCM_CC_43362_CHIP_ID, 13): ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Signed-off-by: Borislav Petkov Cc: Arend van Spriel Cc: Franky Lin Cc: Hante Meuleman Cc: Kalle Valo Cc: "David S. Miller" Cc: Jakub Kicinski Cc: brcm80211-dev-list.pdl@broadcom.com Cc: netdev@vger.kernel.org Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/Ykx0iRlvtBnKqtbG@zn.tnic Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 6d5d5c39c635..9929e90866f0 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -557,7 +557,7 @@ enum brcmf_sdio_frmtype { BRCMF_SDIO_FT_SUB, }; -#define SDIOD_DRVSTR_KEY(chip, pmu) (((chip) << 16) | (pmu)) +#define SDIOD_DRVSTR_KEY(chip, pmu) (((unsigned int)(chip) << 16) | (pmu)) /* SDIO Pad drive strength to select value mappings */ struct sdiod_drive_str { -- Gitee From d1c00ee05775fa5b9a20a1392fe710e785fc68c9 Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Tue, 26 Jul 2022 17:38:56 +0800 Subject: [PATCH 059/132] dpaa_eth: Fix missing of_node_put in dpaa_get_ts_info() stable inclusion from stable-v5.10.113 commit 8d71edabb0abe6c8e3e1601e73f63f81783b6cc0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8d71edabb0abe6c8e3e1601e73f63f81783b6cc0 -------------------------------- [ Upstream commit 1a7eb80d170c28be2928433702256fe2a0bd1e0f ] Both of of_get_parent() and of_parse_phandle() return node pointer with refcount incremented, use of_node_put() on it to decrease refcount when done. Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index 763d2c7b5fb1..5750f9a56393 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -489,11 +489,15 @@ static int dpaa_get_ts_info(struct net_device *net_dev, info->phc_index = -1; fman_node = of_get_parent(mac_node); - if (fman_node) + if (fman_node) { ptp_node = of_parse_phandle(fman_node, "ptimer-handle", 0); + of_node_put(fman_node); + } - if (ptp_node) + if (ptp_node) { ptp_dev = of_find_device_by_node(ptp_node); + of_node_put(ptp_node); + } if (ptp_dev) ptp = platform_get_drvdata(ptp_dev); -- Gitee From 63fc6600e0de84218fd9587b1037e42025fb27c9 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 26 Jul 2022 17:38:57 +0800 Subject: [PATCH 060/132] drm/msm/mdp5: check the return of kzalloc() stable inclusion from stable-v5.10.113 commit 9581e07b549bd911ce3af12b57ea2646b4995ebf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9581e07b549bd911ce3af12b57ea2646b4995ebf -------------------------------- [ Upstream commit 047ae665577776b7feb11bd4f81f46627cff95e7 ] kzalloc() is a memory allocation function which can return NULL when some internal memory errors happen. So it is better to check it to prevent potential wrong memory access. Besides, since mdp5_plane_reset() is void type, so we should better set `plane-state` to NULL after releasing it. Signed-off-by: Xiaoke Wang Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/481055/ Link: https://lore.kernel.org/r/tencent_8E2A1C78140EE1784AB2FF4B2088CC0AB908@qq.com Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c index 83423092de2f..da0799333970 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c @@ -179,7 +179,10 @@ static void mdp5_plane_reset(struct drm_plane *plane) drm_framebuffer_put(plane->state->fb); kfree(to_mdp5_plane_state(plane->state)); + plane->state = NULL; mdp5_state = kzalloc(sizeof(*mdp5_state), GFP_KERNEL); + if (!mdp5_state) + return; /* assign default blend parameters */ mdp5_state->alpha = 255; -- Gitee From 6f2cb02318c126edf72a06c12938b3bc93a7de1a Mon Sep 17 00:00:00 2001 From: Tomas Melin Date: Tue, 26 Jul 2022 17:38:58 +0800 Subject: [PATCH 061/132] net: macb: Restart tx only if queue pointer is lagging stable inclusion from stable-v5.10.113 commit a284cca3d81ae5e5f5f23c09b8298339188953a5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a284cca3d81ae5e5f5f23c09b8298339188953a5 -------------------------------- [ Upstream commit 5ad7f18cd82cee8e773d40cc7a1465a526f2615c ] commit 4298388574da ("net: macb: restart tx after tx used bit read") added support for restarting transmission. 
Restarting tx does not work in case controller asserts TXUBR interrupt and TQBP is already at the end of the tx queue. In that situation, restarting tx will immediately cause assertion of another TXUBR interrupt. The driver will end up in an infinite interrupt loop which it cannot break out of. For cases where TQBP is at the end of the tx queue, instead only clear TX_USED interrupt. As more data gets pushed to the queue, transmission will resume. This issue was observed on a Xilinx Zynq-7000 based board. During stress test of the network interface, driver would get stuck on interrupt loop within seconds or minutes causing CPU to stall. Signed-off-by: Tomas Melin Tested-by: Claudiu Beznea Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220407161659.14532-1-tomas.melin@vaisala.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/cadence/macb_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 78c6d133f54f..3244f69555f7 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1531,6 +1531,7 @@ static void macb_tx_restart(struct macb_queue *queue) unsigned int head = queue->tx_head; unsigned int tail = queue->tx_tail; struct macb *bp = queue->bp; + unsigned int head_idx, tbqp; if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) queue_writel(queue, ISR, MACB_BIT(TXUBR)); @@ -1538,6 +1539,13 @@ static void macb_tx_restart(struct macb_queue *queue) if (head == tail) return; + tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp); + tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp)); + head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head)); + + if (tbqp == head_idx) + return; + macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); } -- Gitee From 0389f1ff99ad7375ff5980b964ab00267a72af25 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 26 Jul 2022 17:38:59 +0800 Subject: [PATCH 062/132] scsi: qedi: Fix failed disconnect handling stable inclusion from stable-v5.10.113 commit bf28bba3041055759ce6afe5e61bc4fd709ea6e4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bf28bba3041055759ce6afe5e61bc4fd709ea6e4 -------------------------------- [ Upstream commit 857b06527f707f5df634b854898a191b5c1d0272 ] We set the qedi_ep state to EP_STATE_OFLDCONN_START when the ep is created. Then in qedi_set_path we kick off the offload work. If userspace times out the connection and calls ep_disconnect, qedi will only flush the offload work if the qedi_ep state has transitioned away from EP_STATE_OFLDCONN_START. If we can't connect we will not have transitioned state and will leave the offload work running, and we will free the qedi_ep from under it. This patch just has us init the work when we create the ep, then always flush it. Link: https://lore.kernel.org/r/20220408001314.5014-10-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Acked-by: Manish Rangankar Signed-off-by: Mike Christie Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/qedi/qedi_iscsi.c | 69 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index ba9a22e55e32..8003b3519b95 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -828,6 +828,37 @@ static int qedi_task_xmit(struct iscsi_task *task) return qedi_iscsi_send_ioreq(task); } +static void qedi_offload_work(struct work_struct *work) +{ + struct qedi_endpoint *qedi_ep = + container_of(work, struct qedi_endpoint, offload_work); + struct qedi_ctx *qedi; + int wait_delay = 5 * HZ; + int ret; + + qedi = qedi_ep->qedi; + + ret = qedi_iscsi_offload_conn(qedi_ep); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", + qedi_ep->iscsi_cid, qedi_ep, ret); + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + return; + } + + ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, + (qedi_ep->state == + EP_STATE_OFLDCONN_COMPL), + wait_delay); + if (ret <= 0 || qedi_ep->state != EP_STATE_OFLDCONN_COMPL) { + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + QEDI_ERR(&qedi->dbg_ctx, + "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", + qedi_ep->iscsi_cid, qedi_ep); + } +} + static struct iscsi_endpoint * qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, int non_blocking) @@ -876,6 +907,7 @@ qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, } qedi_ep = ep->dd_data; memset(qedi_ep, 0, sizeof(struct qedi_endpoint)); + INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); qedi_ep->state = EP_STATE_IDLE; qedi_ep->iscsi_cid = (u32)-1; qedi_ep->qedi = qedi; @@ -1026,12 +1058,11 @@ static void qedi_ep_disconnect(struct iscsi_endpoint *ep) qedi_ep = ep->dd_data; qedi = qedi_ep->qedi; + flush_work(&qedi_ep->offload_work); + if (qedi_ep->state == EP_STATE_OFLDCONN_START) goto ep_exit_recover; - if (qedi_ep->state != EP_STATE_OFLDCONN_NONE) - flush_work(&qedi_ep->offload_work); - if (qedi_ep->conn) { qedi_conn = qedi_ep->conn; conn = qedi_conn->cls_conn->dd_data; @@ -1196,37 +1227,6 @@ static int qedi_data_avail(struct qedi_ctx *qedi, u16 vlanid) return rc; } -static void qedi_offload_work(struct work_struct *work) -{ - struct qedi_endpoint *qedi_ep = - container_of(work, struct qedi_endpoint, offload_work); - struct qedi_ctx *qedi; - int wait_delay = 5 * HZ; - int ret; - - qedi = qedi_ep->qedi; - - ret = qedi_iscsi_offload_conn(qedi_ep); - if (ret) { - QEDI_ERR(&qedi->dbg_ctx, - "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", - qedi_ep->iscsi_cid, qedi_ep, ret); - qedi_ep->state = EP_STATE_OFLDCONN_FAILED; - return; - } - - ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, - (qedi_ep->state == - EP_STATE_OFLDCONN_COMPL), - wait_delay); - if ((ret <= 0) || (qedi_ep->state != EP_STATE_OFLDCONN_COMPL)) { - qedi_ep->state = EP_STATE_OFLDCONN_FAILED; - QEDI_ERR(&qedi->dbg_ctx, - "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", - qedi_ep->iscsi_cid, qedi_ep); - } -} - static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) { struct qedi_ctx *qedi; @@ -1342,7 +1342,6 @@ static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) qedi_ep->dst_addr, qedi_ep->dst_port); } - INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); queue_work(qedi->offload_thread, &qedi_ep->offload_work); ret = 0; -- Gitee From 7bec28503270e64c6bd754f2d882beeef7c3278a Mon Sep 17 
00:00:00 2001 From: Mikulas Patocka Date: Tue, 26 Jul 2022 17:39:00 +0800 Subject: [PATCH 063/132] stat: fix inconsistency between struct stat and struct compat_stat stable inclusion from stable-v5.10.113 commit 76101c8e0c31938e59bbcb4beed47c8d8b89d39a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=76101c8e0c31938e59bbcb4beed47c8d8b89d39a -------------------------------- [ Upstream commit 932aba1e169090357a77af18850a10c256b50819 ] struct stat (defined in arch/x86/include/uapi/asm/stat.h) has 32-bit st_dev and st_rdev; struct compat_stat (defined in arch/x86/include/asm/compat.h) has 16-bit st_dev and st_rdev followed by a 16-bit padding. This patch fixes struct compat_stat to match struct stat. [ Historical note: the old x86 'struct stat' did have that 16-bit field that the compat layer had kept around, but it was changes back in 2003 by "struct stat - support larger dev_t": https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=e95b2065677fe32512a597a79db94b77b90c968d and back in those days, the x86_64 port was still new, and separate from the i386 code, and had already picked up the old version with a 16-bit st_dev field ] Note that we can't change compat_dev_t because it is used by compat_loop_info. Also, if the st_dev and st_rdev values are 32-bit, we don't have to use old_valid_dev to test if the value fits into them. This fixes -EOVERFLOW on filesystems that are on NVMe because NVMe uses the major number 259. Signed-off-by: Mikulas Patocka Cc: Andreas Schwab Cc: Matthew Wilcox Cc: Christoph Hellwig Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/x86/include/asm/compat.h | 6 ++---- fs/stat.c | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 0e327a01f50f..46a067bd7e0b 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -29,15 +29,13 @@ typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { - compat_dev_t st_dev; - u16 __pad1; + u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; - compat_dev_t st_rdev; - u16 __pad2; + u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; diff --git a/fs/stat.c b/fs/stat.c index 1196af4d1ea0..04550c0ba540 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -306,9 +306,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -317,7 +314,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -325,7 +324,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; 
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -335,7 +334,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -616,11 +615,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -630,7 +631,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; -- Gitee From 47590400345a095df3f6c386c59924849fc0fe28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jul 2022 17:39:01 +0800 Subject: [PATCH 064/132] nvme: add a quirk to disable namespace identifiers stable inclusion from stable-v5.10.113 commit 316bd86c2261fffa37577c3d35721a9337248e54 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=316bd86c2261fffa37577c3d35721a9337248e54 -------------------------------- [ Upstream commit 00ff400e6deee00f7b15e200205b2708b63b8cf6 ] Add a quirk to disable using and exporting namespace identifiers for controllers where they are broken beyond repair. The most directly visible problem with non-unique namespace identifiers is that they break the /dev/disk/by-id/ links, with the link for a supposedly unique identifier now pointing to one of multiple possible namespaces that share the same ID, and a somewhat random selection of which one actually shows up. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/nvme/host/core.c | 24 ++++++++++++++++++------ drivers/nvme/host/nvme.h | 5 +++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dcc047f01a07..274635c0c02a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1250,6 +1250,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_EUI64_LEN; memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); return NVME_NIDT_EUI64_LEN; case NVME_NIDT_NGUID: @@ -1258,6 +1260,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_NGUID_LEN; memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); return NVME_NIDT_NGUID_LEN; case NVME_NIDT_UUID: @@ -1266,6 +1270,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_UUID_LEN; uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; case NVME_NIDT_CSI: @@ -1361,12 +1367,18 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, if ((*id)->ncap == 0) /* namespace not allocated or attached */ goto out_free_id; - if (ctrl->vs >= NVME_VS(1, 1, 0) && - !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); - if (ctrl->vs >= NVME_VS(1, 2, 0) && - !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) { + dev_info(ctrl->device, + "Ignoring bogus Namespace Identifiers\n"); + } else { + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + } return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 94cee2c566d3..11d3cc2890f9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -151,6 +151,11 @@ enum nvme_quirks { * encoding the generation sequence number. */ NVME_QUIRK_SKIP_CID_GEN = (1 << 17), + + /* + * Reports garbage in the namespace identifiers (eui64, nguid, uuid). + */ + NVME_QUIRK_BOGUS_NID = (1 << 18), }; /* -- Gitee From 983009806cf6c01a28ac7890c08a83cc38e75bc8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jul 2022 17:39:02 +0800 Subject: [PATCH 065/132] nvme-pci: disable namespace identifiers for Qemu controllers stable inclusion from stable-v5.10.113 commit 7ec6e06ee405756f90226b946790a5a7a65795b7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7ec6e06ee405756f90226b946790a5a7a65795b7 -------------------------------- [ Upstream commit 66dd346b84d79fde20832ed691a54f4881eac20d ] Qemu unconditionally reports a UUID, which depending on the qemu version is either all-null (which is incorrect but harmless) or contains a single bit set for all controllers. 
In addition it can also optionally report a eui64 which needs to be manually set. Disable namespace identifiers for Qemu controlles entirely even if in some cases they could be set correctly through manual intervention. Reported-by: Luis Chamberlain Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/nvme/host/pci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f435ab0809fb..e0a3d03198a2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3212,7 +3212,10 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS | - NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + NVME_QUIRK_DISABLE_WRITE_ZEROES | + NVME_QUIRK_BOGUS_NID, }, + { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ -- Gitee From 3d0cf64034acd4253f9d7e4e54c0ece407faa42c Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Tue, 26 Jul 2022 17:39:03 +0800 Subject: [PATCH 066/132] EDAC/synopsys: Read the error count from the correct register stable inclusion from stable-v5.10.113 commit 50cbc583fa838a63f8447251b88da569c3d36ba6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=50cbc583fa838a63f8447251b88da569c3d36ba6 -------------------------------- commit e2932d1f6f055b2af2114c7e64a26dc1b5593d0c upstream. Currently, the error count is read wrongly from the status register. Read the count from the proper error count register (ERRCNT). [ bp: Massage. 
] Fixes: b500b4a029d5 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller") Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov Acked-by: Michal Simek Cc: Link: https://lore.kernel.org/r/20220414102813.4468-1-shubhrajyoti.datta@xilinx.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/edac/synopsys_edac.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 92906b56b1a2..fea44dc0484b 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -163,6 +163,11 @@ #define ECC_STAT_CECNT_SHIFT 8 #define ECC_STAT_BITNUM_MASK 0x7F +/* ECC error count register definitions */ +#define ECC_ERRCNT_UECNT_MASK 0xFFFF0000 +#define ECC_ERRCNT_UECNT_SHIFT 16 +#define ECC_ERRCNT_CECNT_MASK 0xFFFF + /* DDR QOS Interrupt register definitions */ #define DDR_QOS_IRQ_STAT_OFST 0x20200 #define DDR_QOSUE_MASK 0x4 @@ -418,15 +423,16 @@ static int zynqmp_get_error_info(struct synps_edac_priv *priv) base = priv->baseaddr; p = &priv->stat; + regval = readl(base + ECC_ERRCNT_OFST); + p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK; + p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT; + if (!p->ce_cnt) + goto ue_err; + regval = readl(base + ECC_STAT_OFST); if (!regval) return 1; - p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT; - p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT; - if (!p->ce_cnt) - goto ue_err; - p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK); regval = readl(base + ECC_CEADDR0_OFST); -- Gitee From eb179e5cfb436d79cb432f855a26e7596e24531e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 26 Jul 2022 17:39:04 +0800 Subject: [PATCH 067/132] mm, hugetlb: allow for "high" userspace addresses stable inclusion from stable-v5.10.113 commit 6b932920b96fc3002352fe8225ec63a1cd1717ec category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6b932920b96fc3002352fe8225ec63a1cd1717ec -------------------------------- commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream. This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") for hugetlb. This patch adds support for "high" userspace addresses that are optionally supported on the system and have to be requested via a hint mechanism ("high" addr parameter to mmap). Architectures such as powerpc and x86 achieve this by making changes to their architectural versions of hugetlb_get_unmapped_area() function. However, arm64 uses the generic version of that function. So take into account arch_get_mmap_base() and arch_get_mmap_end() in hugetlb_get_unmapped_area(). To allow that, move those two macros out of mm/mmap.c into include/linux/sched/mm.h If these macros are not defined in architectural code then they default to (TASK_SIZE) and (base) so should not introduce any behavioural changes to architectures that do not define them. For the time being, only ARM64 is affected by this change. Catalin (ARM64) said "We should have fixed hugetlb_get_unmapped_area() as well when we added support for 52-bit VA. The reason for commit f6795053dac8 was to prevent normal mmap() from returning addresses above 48-bit by default as some user-space had hard assumptions about this. It's a slight ABI change if you do this for hugetlb_get_unmapped_area() but I doubt anyone would notice. 
It's more likely that the current behaviour would cause issues, so I'd rather have them consistent. Basically when arm64 gained support for 52-bit addresses we did not want user-space calling mmap() to suddenly get such high addresses, otherwise we could have inadvertently broken some programs (similar behaviour to x86 here). Hence we added commit f6795053dac8. But we missed hugetlbfs which could still get such high mmap() addresses. So in theory that's a potential regression that should have bee addressed at the same time as commit f6795053dac8 (and before arm64 enabled 52-bit addresses)" Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") Signed-off-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Cc: [5.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Conflicts: fs/hugetlbfs/inode.c Acked-by: Xie XiuQi --- fs/hugetlbfs/inode.c | 9 +++++---- include/linux/sched/mm.h | 8 ++++++++ mm/mmap.c | 8 -------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6f2943465bff..8a87d1b43387 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -252,7 +252,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -272,7 +272,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -291,7 +291,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags); @@ -309,6 +309,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -328,7 +329,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index dc1f4dcd9a82..e3e5e149b00e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -106,6 +106,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/mm/mmap.c b/mm/mmap.c index 
5ad32537604a..5489d70db84e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2404,14 +2404,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * -- Gitee From ccc63440f2c0b4d4473bfecbaf1a19e3c14afce7 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 26 Jul 2022 17:39:05 +0800 Subject: [PATCH 068/132] mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 9ca66d79143980260be615b964b8dc1504a5d0c6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9ca66d79143980260be615b964b8dc1504a5d0c6 -------------------------------- commit 319561669a59d8e9206ab311ae5433ef92fd79d1 upstream. In some cases it is possible for mmu_interval_notifier_remove() to race with mn_tree_inv_end() allowing it to return while the notifier data structure is still in use. Consider the following sequence: CPU0 - mn_tree_inv_end() CPU1 - mmu_interval_notifier_remove() ----------------------------------- ------------------------------------ spin_lock(subscriptions->lock); seq = subscriptions->invalidate_seq; spin_lock(subscriptions->lock); spin_unlock(subscriptions->lock); subscriptions->invalidate_seq++; wait_event(invalidate_seq != seq); return; interval_tree_remove(interval_sub); kfree(interval_sub); spin_unlock(subscriptions->lock); wake_up_all(); As the wait_event() condition is true it will return immediately. This can lead to use-after-free type errors if the caller frees the data structure containing the interval notifier subscription while it is still on a deferred list. Fix this by taking the appropriate lock when reading invalidate_seq to ensure proper synchronisation. I observed this whilst running stress testing during some development. You do have to be pretty unlucky, but it leads to the usual problems of use-after-free (memory corruption, kernel crash, difficult to diagnose WARN_ON, etc). 
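[ Illustrative sketch, not part of the applied diff, using shortened
  names; the real helper is mmu_interval_seq_released() in the hunk
  below. The waiter may only return after the writer has finished the
  deferred tree updates it performs under the same lock, so the
  wait_event() condition itself has to take that lock rather than rely
  on a plain READ_ONCE():

	static bool seq_released(struct subscriptions *s, unsigned long seq)
	{
		bool ret;

		spin_lock(&s->lock);
		ret = s->invalidate_seq != seq;
		spin_unlock(&s->lock);
		return ret;
	}

	/* in the remove path */
	wait_event(s->wq, seq_released(s, seq));
]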
Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier") Signed-off-by: Alistair Popple Signed-off-by: Jason Gunthorpe Cc: Christian König Cc: John Hubbard Cc: Ralph Campbell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/mmu_notifier.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 07f42a7a6065..9165ca619c8c 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1043,6 +1043,18 @@ int mmu_interval_notifier_insert_locked( } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + /** * mmu_interval_notifier_remove - Remove a interval notifier * @interval_sub: Interval subscription to unregister @@ -1090,7 +1102,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) wait_event(subscriptions->wq, - READ_ONCE(subscriptions->invalidate_seq) != seq); + mmu_interval_seq_released(subscriptions, seq)); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); -- Gitee From 0affb0e98c917096fe752a041443b815bb2b61ba Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Tue, 26 Jul 2022 17:39:06 +0800 Subject: [PATCH 069/132] ata: pata_marvell: Check the 'bmdma_addr' beforing reading stable inclusion from stable-v5.10.113 commit cf23a960c5c62ddd8283a855a742977ee983a62a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf23a960c5c62ddd8283a855a742977ee983a62a -------------------------------- commit aafa9f958342db36c17ac2a7f1b841032c96feb4 upstream. 
Before detecting the cable type on the dma bar, the driver should check whether the 'bmdma_addr' is zero, which means the adapter does not support DMA, otherwise we will get the following error: [ 5.146634] Bad IO access at port 0x1 (return inb(port)) [ 5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60 [ 5.150856] RIP: 0010:ioread8+0x4a/0x60 [ 5.160238] Call Trace: [ 5.160470] [ 5.160674] marvell_cable_detect+0x6e/0xc0 [pata_marvell] [ 5.161728] ata_eh_recover+0x3520/0x6cc0 [ 5.168075] ata_do_eh+0x49/0x3c0 Signed-off-by: Zheyu Ma Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/ata/pata_marvell.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c index b066809ba9a1..c56f4043b0cc 100644 --- a/drivers/ata/pata_marvell.c +++ b/drivers/ata/pata_marvell.c @@ -83,6 +83,8 @@ static int marvell_cable_detect(struct ata_port *ap) switch(ap->port_no) { case 0: + if (!ap->ioaddr.bmdma_addr) + return ATA_CBL_PATA_UNK; if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1) return ATA_CBL_PATA40; return ATA_CBL_PATA80; -- Gitee From d5e4b0822bc88bcd32070ab02176509d7a9eb06a Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 26 Jul 2022 17:39:07 +0800 Subject: [PATCH 070/132] dma: at_xdmac: fix a missing check on list iterator stable inclusion from stable-v5.10.113 commit 358a3846f6a950b8db803262e74081ad4ab235d4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=358a3846f6a950b8db803262e74081ad4ab235d4 -------------------------------- commit 206680c4e46b62fd8909385e0874a36952595b85 upstream. The bug is here: __func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue); The list iterator 'desc' will point to a bogus position containing HEAD if the list is empty or no element is found. To avoid dev_dbg() prints a invalid address, use a new variable 'iter' as the list iterator, while use the origin variable 'desc' as a dedicated pointer to point to the found element. Cc: stable@vger.kernel.org Fixes: 82e2424635f4c ("dmaengine: xdmac: fix print warning on dma_addr_t variable") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220327061154.4867-1-xiam0nd.tong@gmail.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/at_xdmac.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 90afba0b36fe..47552db6b8dc 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1390,7 +1390,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, { struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan); struct at_xdmac *atxdmac = to_at_xdmac(atchan->chan.device); - struct at_xdmac_desc *desc, *_desc; + struct at_xdmac_desc *desc, *_desc, *iter; struct list_head *descs_list; enum dma_status ret; int residue, retry; @@ -1505,11 +1505,13 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, * microblock. 
*/ descs_list = &desc->descs_list; - list_for_each_entry_safe(desc, _desc, descs_list, desc_node) { - dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg); - residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth; - if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda) + list_for_each_entry_safe(iter, _desc, descs_list, desc_node) { + dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg); + residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth; + if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) { + desc = iter; break; + } } residue += cur_ubc << dwidth; -- Gitee From 2d19822cbd21ac865ee8a7dc3376e640a457ee1f Mon Sep 17 00:00:00 2001 From: Manuel Ullmann Date: Tue, 26 Jul 2022 17:39:08 +0800 Subject: [PATCH 071/132] net: atlantic: invert deep par in pm functions, preventing null derefs stable inclusion from stable-v5.10.113 commit ba2716da233618c3f361b8ece818a0e91de7a8f0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ba2716da233618c3f361b8ece818a0e91de7a8f0 -------------------------------- commit cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c upstream. This will reset deeply on freeze and thaw instead of suspend and resume and prevent null pointer dereferences of the uninitialized ring 0 buffer while thawing. The impact is an indefinitely hanging kernel. You can't switch consoles after this and the only possible user interaction is SysRq. BUG: kernel NULL pointer dereference RIP: 0010:aq_ring_rx_fill+0xcf/0x210 [atlantic] aq_vec_init+0x85/0xe0 [atlantic] aq_nic_init+0xf7/0x1d0 [atlantic] atl_resume_common+0x4f/0x100 [atlantic] pci_pm_thaw+0x42/0xa0 resolves in aq_ring.o to ``` 0000000000000ae0 : { /* ... */ baf: 48 8b 43 08 mov 0x8(%rbx),%rax buff->flags = 0U; /* buff is NULL */ ``` The bug has been present since the introduction of the new pm code in 8aaa112a57c1 ("net: atlantic: refactoring pm logic") and was hidden until 8ce84271697a ("net: atlantic: changes for multi-TC support"), which refactored the aq_vec_{free,alloc} functions into aq_vec_{,ring}_{free,alloc}, but is technically not wrong. The original functions just always reinitialized the buffers on S3/S4. If the interface is down before freezing, the bug does not occur. It does not matter, whether the initrd contains and loads the module before thawing. So the fix is to invert the boolean parameter deep in all pm function calls, which was clearly intended to be set like that. First report was on Github [1], which you have to guess from the resume logs in the posted dmesg snippet. Recently I posted one on Bugzilla [2], since I did not have an AQC device so far. #regzbot introduced: 8ce84271697a #regzbot from: koo5 #regzbot monitor: https://github.com/Aquantia/AQtion/issues/32 Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic") Link: https://github.com/Aquantia/AQtion/issues/32 [1] Link: https://bugzilla.kernel.org/show_bug.cgi?id=215798 [2] Cc: stable@vger.kernel.org Reported-by: koo5 Signed-off-by: Manuel Ullmann Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 1826253f97dc..bdfd462c74db 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -450,22 +450,22 @@ static int atl_resume_common(struct device *dev, bool deep) static int aq_pm_freeze(struct device *dev) { - return aq_suspend_common(dev, false); + return aq_suspend_common(dev, true); } static int aq_pm_suspend_poweroff(struct device *dev) { - return aq_suspend_common(dev, true); + return aq_suspend_common(dev, false); } static int aq_pm_thaw(struct device *dev) { - return atl_resume_common(dev, false); + return atl_resume_common(dev, true); } static int aq_pm_resume_restore(struct device *dev) { - return atl_resume_common(dev, true); + return atl_resume_common(dev, false); } static const struct dev_pm_ops aq_pm_ops = { -- Gitee From e85467a78402f38cdbc4a15730731610dd8aa195 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 26 Jul 2022 17:39:09 +0800 Subject: [PATCH 072/132] xtensa: patch_text: Fixup last cpu should be master stable inclusion from stable-v5.10.113 commit f399ab11dd6cc0a6927029371fbe2ec027181598 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f399ab11dd6cc0a6927029371fbe2ec027181598 -------------------------------- commit ee69d4be8fd064cd08270b4808d2dfece3614ee0 upstream. These patch_text implementations are using stop_machine_cpuslocked infrastructure with atomic cpu_count. The original idea: When the master CPU patch_text, the others should wait for it. But current implementation is using the first CPU as master, which couldn't guarantee the remaining CPUs are waiting. This patch changes the last CPU as the master to solve the potential risk. 
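[ Illustrative sketch, not part of the applied diff: the handler runs on
  every CPU under stop_machine, and only the last CPU to arrive may
  patch, because at that point every other CPU has already entered the
  handler and is no longer executing the text being modified. Roughly
  (the waiter branch is paraphrased and simplified, it is not shown in
  the hunk below):

	static int patch_text_stop_machine(void *data)
	{
		struct patch *patch = data;

		if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) {
			/* last to arrive: everyone else is parked below */
			local_patch_text(patch->addr, patch->data, patch->sz);
			atomic_inc(&patch->cpu_count);
		} else {
			/* wait until the master signals completion */
			while (atomic_read(&patch->cpu_count) <= num_online_cpus())
				cpu_relax();
		}
		return 0;
	}
]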
Fixes: 64711f9a47d4 ("xtensa: implement jump_label support") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Max Filippov Reviewed-by: Masami Hiramatsu Cc: Message-Id: <20220407073323.743224-4-guoren@kernel.org> Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/xtensa/kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/kernel/jump_label.c b/arch/xtensa/kernel/jump_label.c index 0dde21e0d3de..ad1841cecdfb 100644 --- a/arch/xtensa/kernel/jump_label.c +++ b/arch/xtensa/kernel/jump_label.c @@ -40,7 +40,7 @@ static int patch_text_stop_machine(void *data) { struct patch *patch = data; - if (atomic_inc_return(&patch->cpu_count) == 1) { + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { local_patch_text(patch->addr, patch->data, patch->sz); atomic_inc(&patch->cpu_count); } else { -- Gitee From 90fa436f8a1afa90f01292c96ed7813fe10371d5 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 26 Jul 2022 17:39:10 +0800 Subject: [PATCH 073/132] xtensa: fix a7 clobbering in coprocessor context load/store stable inclusion from stable-v5.10.113 commit 19f6dcb1f0f0f8523976c8aa1800856c9b4f35c3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19f6dcb1f0f0f8523976c8aa1800856c9b4f35c3 -------------------------------- commit 839769c35477d4acc2369e45000ca7b0b6af39a7 upstream. Fast coprocessor exception handler saves a3..a6, but coprocessor context load/store code uses a4..a7 as temporaries, potentially clobbering a7. 'Potentially' because coprocessor state load/store macros may not use all four temporary registers (and neither FPU nor HiFi macros do). Use a3..a6 as intended. Cc: stable@vger.kernel.org Fixes: c658eac628aa ("[XTENSA] Add support for configurable registers and coprocessors") Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/xtensa/kernel/coprocessor.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index 45cc0ae0af6f..c7b9f12896f2 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -29,7 +29,7 @@ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lsave_cp_regs_cp##x: \ - xchal_cp##x##_store a2 a4 a5 a6 a7; \ + xchal_cp##x##_store a2 a3 a4 a5 a6; \ jx a0; \ .endif @@ -46,7 +46,7 @@ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lload_cp_regs_cp##x: \ - xchal_cp##x##_load a2 a4 a5 a6 a7; \ + xchal_cp##x##_load a2 a3 a4 a5 a6; \ jx a0; \ .endif -- Gitee From 490085c27ad981285d7f65b677a78bd578084e98 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Tue, 26 Jul 2022 17:39:11 +0800 Subject: [PATCH 074/132] openvswitch: fix OOB access in reserve_sfa_size() stable inclusion from stable-v5.10.113 commit 0837ff17d052b7d755d5086208c3445867aaff82 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0837ff17d052b7d755d5086208c3445867aaff82 -------------------------------- commit cefa91b2332d7009bc0be5d951d6cbbf349f90f8 upstream. 
Given a sufficiently large number of actions, while copying and reserving memory for a new action of a new flow, if next_offset is greater than MAX_ACTIONS_BUFSIZE, the function reserve_sfa_size() does not return -EMSGSIZE as expected, but it allocates MAX_ACTIONS_BUFSIZE bytes increasing actions_len by req_size. This can then lead to an OOB write access, especially when further actions need to be copied. Fix it by rearranging the flow action size check. KASAN splat below: Acked-by: Xie XiuQi ================================================================== BUG: KASAN: slab-out-of-bounds in reserve_sfa_size+0x1ba/0x380 [openvswitch] Write of size 65360 at addr ffff888147e4001c by task handler15/836 CPU: 1 PID: 836 Comm: handler15 Not tainted 5.18.0-rc1+ #27 ... Call Trace: dump_stack_lvl+0x45/0x5a print_report.cold+0x5e/0x5db ? __lock_text_start+0x8/0x8 ? reserve_sfa_size+0x1ba/0x380 [openvswitch] kasan_report+0xb5/0x130 ? reserve_sfa_size+0x1ba/0x380 [openvswitch] kasan_check_range+0xf5/0x1d0 memcpy+0x39/0x60 reserve_sfa_size+0x1ba/0x380 [openvswitch] __add_action+0x24/0x120 [openvswitch] ovs_nla_add_action+0xe/0x20 [openvswitch] ovs_ct_copy_action+0x29d/0x1130 [openvswitch] ? __kernel_text_address+0xe/0x30 ? unwind_get_return_address+0x56/0xa0 ? create_prof_cpu_mask+0x20/0x20 ? ovs_ct_verify+0xf0/0xf0 [openvswitch] ? prep_compound_page+0x198/0x2a0 ? __kasan_check_byte+0x10/0x40 ? kasan_unpoison+0x40/0x70 ? ksize+0x44/0x60 ? reserve_sfa_size+0x75/0x380 [openvswitch] __ovs_nla_copy_actions+0xc26/0x2070 [openvswitch] ? __zone_watermark_ok+0x420/0x420 ? validate_set.constprop.0+0xc90/0xc90 [openvswitch] ? __alloc_pages+0x1a9/0x3e0 ? __alloc_pages_slowpath.constprop.0+0x1da0/0x1da0 ? unwind_next_frame+0x991/0x1e40 ? __mod_node_page_state+0x99/0x120 ? __mod_lruvec_page_state+0x2e3/0x470 ? __kasan_kmalloc_large+0x90/0xe0 ovs_nla_copy_actions+0x1b4/0x2c0 [openvswitch] ovs_flow_cmd_new+0x3cd/0xb10 [openvswitch] ... Cc: stable@vger.kernel.org Fixes: f28cd2af22a0 ("openvswitch: fix flow actions reallocation") Signed-off-by: Paolo Valerio Acked-by: Eelco Chaudron Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 98a7e6f64ab0..293a798e89f4 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2436,7 +2436,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) { OVS_NLERR(log, "Flow action size exceeds max %u", MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); -- Gitee From 140cdaa1ebf8c1c01a60b1f56522154c58dc71ce Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 26 Jul 2022 17:39:12 +0800 Subject: [PATCH 075/132] gpio: Request interrupts after IRQ is initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 54e6180c8c2d71b6f1fab0d1420f65299b26953e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=54e6180c8c2d71b6f1fab0d1420f65299b26953e -------------------------------- commit 06fb4ecfeac7e00d6704fa5ed19299f2fefb3cc9 upstream. Commit 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") attempted to fix a race condition that lead to a NULL pointer, but in the process caused a regression for _AEI/_EVT declared GPIOs. This manifests in messages showing deferred probing while trying to allocate IRQs like so: amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x0000 to IRQ, err -517 amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x002C to IRQ, err -517 amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x003D to IRQ, err -517 [ .. more of the same .. ] The code for walking _AEI doesn't handle deferred probing and so this leads to non-functional GPIO interrupts. Fix this issue by moving the call to `acpi_gpiochip_request_interrupts` to occur after gc->irc.initialized is set. 
Fixes: 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") Link: https://lore.kernel.org/linux-gpio/BL1PR12MB51577A77F000A008AA694675E2EF9@BL1PR12MB5157.namprd12.prod.outlook.com/ Link: https://bugzilla.suse.com/show_bug.cgi?id=1198697 Link: https://bugzilla.kernel.org/show_bug.cgi?id=215850 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1979 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1976 Reported-by: Mario Limonciello Signed-off-by: Mario Limonciello Reviewed-by: Shreeya Patel Tested-By: Samuel Čavoj Tested-By: lukeluk498@gmail.com Link: Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Reviewed-and-tested-by: Takashi Iwai Cc: Shreeya Patel Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpio/gpiolib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index d18078748200..59d8affad343 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1612,8 +1612,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gpiochip_set_irq_hooks(gc); - acpi_gpiochip_request_interrupts(gc); - /* * Using barrier() here to prevent compiler from reordering * gc->irq.initialized before initialization of above @@ -1623,6 +1621,8 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gc->irq.initialized = true; + acpi_gpiochip_request_interrupts(gc); + return 0; } -- Gitee From 9d7cda5f4e3dc4ab3acf27cd75d78b156a0cc099 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 26 Jul 2022 17:39:13 +0800 Subject: [PATCH 076/132] ASoC: soc-dapm: fix two incorrect uses of list iterator stable inclusion from stable-v5.10.113 commit 43a2a3734aa3d25ef11b90f14f18bf31fb0a7feb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43a2a3734aa3d25ef11b90f14f18bf31fb0a7feb -------------------------------- commit f730a46b931d894816af34a0ff8e4ad51565b39f upstream. These two bug are here: list_for_each_entry_safe_continue(w, n, list, power_list); list_for_each_entry_safe_continue(w, n, list, power_list); After the list_for_each_entry_safe_continue() exits, the list iterator will always be a bogus pointer which point to an invalid struct objdect containing HEAD member. The funciton poniter 'w->event' will be a invalid value which can lead to a control-flow hijack if the 'w' can be controlled. The original intention was to continue the outer list_for_each_entry_safe() loop with the same entry if w->event is NULL, but misunderstanding the meaning of list_for_each_entry_safe_continue(). So just add a 'continue;' to fix the bug. 
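To make the hazard concrete, a generic sketch (not the DAPM code itself; struct widget and run_widgets() are invented for illustration, the list helpers are the standard <linux/list.h> ones):

struct widget {
	struct list_head node;
	int (*event)(struct widget *w);
};

static void run_widgets(struct list_head *head)
{
	struct widget *w, *n;

	list_for_each_entry_safe(w, n, head, node) {
		if (!w->event)
			continue;	/* the fix: skip just this element */
		w->event(w);
	}
	/*
	 * Once any list_for_each_entry*() loop runs to completion, the
	 * cursor is container_of(head, struct widget, node): it points
	 * at the memory around the list head, not at a real element, so
	 * reading w->event here (which is what restarting the iteration
	 * via list_for_each_entry_safe_continue() amounted to) would
	 * dereference a bogus object.
	 */
}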
Cc: stable@vger.kernel.org Fixes: 163cac061c973 ("ASoC: Factor out DAPM sequence execution") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220329012134.9375-1-xiam0nd.tong@gmail.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/soc-dapm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 2924d89bf0da..417732bdf286 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1683,8 +1683,7 @@ static void dapm_seq_run(struct snd_soc_card *card, switch (w->id) { case snd_soc_dapm_pre: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, @@ -1696,8 +1695,7 @@ static void dapm_seq_run(struct snd_soc_card *card, case snd_soc_dapm_post: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, -- Gitee From 2e5f28ee0f8c9876e8af1a9347aee57594f8478d Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:39:14 +0800 Subject: [PATCH 077/132] e1000e: Fix possible overflow in LTR decoding stable inclusion from stable-v5.10.113 commit 7082650eb8265b583d0aea0515bcc6f65b0e8755 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7082650eb8265b583d0aea0515bcc6f65b0e8755 -------------------------------- commit 04ebaa1cfddae5f240cc7404f009133bb0389a47 upstream. When we decode the latency and the max_latency, u16 value may not fit the required size and could lead to the wrong LTR representation. Scaling is represented as: scale 0 - 1 (2^(5*0)) = 2^0 scale 1 - 32 (2^(5 *1))= 2^5 scale 2 - 1024 (2^(5 *2)) =2^10 scale 3 - 32768 (2^(5 *3)) =2^15 scale 4 - 1048576 (2^(5 *4)) = 2^20 scale 5 - 33554432 (2^(5 *4)) = 2^25 scale 4 and scale 5 required 20 and 25 bits respectively. scale 6 reserved. Replace the u16 type with the u32 type and allow corrected LTR representation. 
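As a standalone illustration of the overflow (not the driver code; the 10-bit value / 3-bit scale split follows the generic PCIe LTR encoding, the driver's own mask and shift names differ):

static u32 decode_ltr(u16 enc)
{
	u32 value = enc & 0x3ff;		/* 10-bit latency value */
	u32 scale = (enc >> 10) & 0x7;		/* 3-bit scale selector */

	/*
	 * Decoded latency = value * 2^(5 * scale).  Scale 4 alone gives
	 * 1 * 2^20 = 1048576 and scale 5 gives 2^25, both far above the
	 * 65535 a u16 can hold, so the decoded variables must be u32.
	 */
	return value << (5 * scale);
}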
Cc: stable@vger.kernel.org Fixes: 44a13a5d99c7 ("e1000e: Fix the max snoop/no-snoop latency for 10M") Reported-by: James Hutchinson Link: https://bugzilla.kernel.org/show_bug.cgi?id=215689 Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Tested-by: James Hutchinson Signed-off-by: Tony Nguyen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 15b1503d5b6c..1f51252b465a 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1006,8 +1006,8 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) { u32 reg = link << (E1000_LTRV_REQ_SHIFT + E1000_LTRV_NOSNOOP_SHIFT) | link << E1000_LTRV_REQ_SHIFT | E1000_LTRV_SEND; - u16 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ - u16 lat_enc_d = 0; /* latency decoded */ + u32 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ + u32 lat_enc_d = 0; /* latency decoded */ u16 lat_enc = 0; /* latency encoded */ if (link) { -- Gitee From d51577c959b12c0ff173292fd7ebc9090ed20a29 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Tue, 26 Jul 2022 17:39:15 +0800 Subject: [PATCH 078/132] ARC: entry: fix syscall_trace_exit argument stable inclusion from stable-v5.10.113 commit 5580b974a84b30f6da90a3a562ea0dbfb0038110 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5580b974a84b30f6da90a3a562ea0dbfb0038110 -------------------------------- commit b1c6ecfdd06907554518ec384ce8e99889d15193 upstream. Function syscall_trace_exit expects pointer to pt_regs. However r0 is also used to keep syscall return value. Restore pointer to pt_regs before calling syscall_trace_exit. Cc: Signed-off-by: Sergey Matyukevich Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arc/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index ae656bfc31c3..301ade4d0b94 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -199,6 +199,7 @@ tracesys_exit: st r0, [sp, PT_r0] ; sys call return value in pt_regs ;POST Sys Call Ptrace Hook + mov r0, sp ; pt_regs needed bl @syscall_trace_exit b ret_from_exception ; NOT ret_from_system_call at is saves r0 which ; we'd done before calling post hook above -- Gitee From 1e6c493076318775eed92b3e3fe887ab4dfde197 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 26 Jul 2022 17:39:16 +0800 Subject: [PATCH 079/132] arm_pmu: Validate single/group leader events stable inclusion from stable-v5.10.113 commit c55327bc3712ca96dc05bc4c6aab805b6fa8bb74 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c55327bc3712ca96dc05bc4c6aab805b6fa8bb74 -------------------------------- commit e5c23779f93d45e39a52758ca593bd7e62e9b4be upstream. In the case where there is only a cycle counter available (i.e. PMCR_EL0.N is 0) and an event other than CPU cycles is opened, the open should fail as the event can never possibly be scheduled. However, the event validation when an event is opened is skipped when the group leader is opened. 
Fix this by always validating the group leader events. Reported-by: Al Grant Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Rob Herring Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20220408203330.4014015-1-robh@kernel.org Cc: Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/perf/arm_pmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index fe075d9f95e2..c87faafbdba2 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -398,6 +398,9 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; + if (event == leader) + return 0; + for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; @@ -487,12 +490,7 @@ __hw_perf_event_init(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } - if (event->group_leader != event) { - if (validate_group(event) != 0) - return -EINVAL; - } - - return 0; + return validate_group(event); } static int armpmu_event_init(struct perf_event *event) -- Gitee From 8109e1614b619b8cb7791a76ca932a6e9da496d8 Mon Sep 17 00:00:00 2001 From: kuyo chang Date: Tue, 26 Jul 2022 17:39:17 +0800 Subject: [PATCH 080/132] sched/pelt: Fix attach_entity_load_avg() corner case stable inclusion from stable-v5.10.113 commit 88fcfd6ee6c5a617e712b346e9c15fc3057e532e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=88fcfd6ee6c5a617e712b346e9c15fc3057e532e -------------------------------- [ Upstream commit 40f5aa4c5eaebfeaca4566217cb9c468e28ed682 ] The warning in cfs_rq_is_decayed() triggered: SCHED_WARN_ON(cfs_rq->avg.load_avg || cfs_rq->avg.util_avg || cfs_rq->avg.runnable_avg) There exists a corner case in attach_entity_load_avg() which will cause load_sum to be zero while load_avg will not be. Consider se_weight is 88761 as per the sched_prio_to_weight[] table. Further assume the get_pelt_divider() is 47742, this gives: se->avg.load_avg is 1. However, calculating load_sum: se->avg.load_sum = div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); se->avg.load_sum = 1*47742/88761 = 0. Then enqueue_load_avg() adds this to the cfs_rq totals: cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; Resulting in load_avg being 1 with load_sum is 0, which will trigger the WARN. 
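The corner case reduces to plain integer arithmetic; a standalone rework of the numbers quoted above (illustrative only, not the scheduler code):

static void pelt_attach_example(void)
{
	u64 load_avg = 1, divider = 47742, weight = 88761;

	/* old code: scale the average back into a sum, which rounds to 0 */
	u64 old_sum = div_u64(load_avg * divider, weight);	/* 47742 / 88761 = 0 */

	/* fixed code: derive the sum first, then clamp it to at least 1 */
	u64 new_sum = load_avg * divider;			/* 47742 */
	new_sum = (weight < new_sum) ? div_u64(new_sum, weight) : 1;

	/*
	 * old_sum == 0 while load_avg == 1 is exactly the inconsistency
	 * that trips SCHED_WARN_ON() in cfs_rq_is_decayed(); the fixed
	 * computation leaves new_sum == 1, keeping the pair consistent.
	 */
}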
Fixes: f207934fb79d ("sched/fair: Align PELT windows between cfs_rq and its se") Signed-off-by: kuyo chang [peterz: massage changelog] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20220414090229.342-1-kuyo.chang@mediatek.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50d457979db6..09f002f1fe5d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3771,11 +3771,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; -- Gitee From 47edbfbf9e4c8b189615ea817ffc52e0dc011aed Mon Sep 17 00:00:00 2001 From: Zhipeng Xie Date: Tue, 26 Jul 2022 17:39:18 +0800 Subject: [PATCH 081/132] perf/core: Fix perf_mmap fail when CONFIG_PERF_USE_VMALLOC enabled stable inclusion from stable-v5.10.113 commit 51d9cbbb0f5a175e5b4c4a25d2ec995363304860 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=51d9cbbb0f5a175e5b4c4a25d2ec995363304860 -------------------------------- [ Upstream commit 60490e7966659b26d74bf1fa4aa8693d9a94ca88 ] This problem can be reproduced with CONFIG_PERF_USE_VMALLOC enabled on both x86_64 and aarch64 arch when using sysdig -B(using ebpf)[1]. sysdig -B works fine after rebuilding the kernel with CONFIG_PERF_USE_VMALLOC disabled. I tracked it down to the if condition event->rb->nr_pages != nr_pages in perf_mmap is true when CONFIG_PERF_USE_VMALLOC is enabled where event->rb->nr_pages = 1 and nr_pages = 2048 resulting perf_mmap to return -EINVAL. This is because when CONFIG_PERF_USE_VMALLOC is enabled, rb->nr_pages is always equal to 1. Arch with CONFIG_PERF_USE_VMALLOC enabled by default: arc/arm/csky/mips/sh/sparc/xtensa Arch with CONFIG_PERF_USE_VMALLOC disabled by default: x86_64/aarch64/... 
Fix this problem by using data_page_nr() [1] https://github.com/draios/sysdig Fixes: 906010b2134e ("perf_event: Provide vmalloc() based mmap() backing") Signed-off-by: Zhipeng Xie Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220209145417.6495-1-xiezhipeng1@huawei.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/events/core.c | 2 +- kernel/events/internal.h | 5 +++++ kernel/events/ring_buffer.c | 5 ----- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4bd9dd6c3b72..68dc8a8e7990 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6174,7 +6174,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 228801e20788..aa23ffdaf819 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ef91ae75ca56..4032cd475000 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -856,11 +856,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { -- Gitee From 5f70f8f020405ec27343ea6c8c7f580922443b4a Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Tue, 26 Jul 2022 17:39:19 +0800 Subject: [PATCH 082/132] drm/panel/raspberrypi-touchscreen: Avoid NULL deref if not initialised stable inclusion from stable-v5.10.113 commit 231381f5211620cec836b921f1c7a2cf702b3e8a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=231381f5211620cec836b921f1c7a2cf702b3e8a -------------------------------- [ Upstream commit f92055ae0acb035891e988ce345d6b81a0316423 ] If a call to rpi_touchscreen_i2c_write from rpi_touchscreen_probe fails before mipi_dsi_device_register_full is called, then in trying to log the error message if uses ts->dsi->dev when it is still NULL. Use ts->i2c->dev instead, which is initialised earlier in probe. 
Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.") Signed-off-by: Dave Stevenson Signed-off-by: Stefan Wahren Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-2-stefan.wahren@i2se.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index bbdd086be7f5..90487df62480 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -229,7 +229,7 @@ static void rpi_touchscreen_i2c_write(struct rpi_touchscreen *ts, ret = i2c_smbus_write_byte_data(ts->i2c, reg, val); if (ret) - dev_err(&ts->dsi->dev, "I2C write failed: %d\n", ret); + dev_err(&ts->i2c->dev, "I2C write failed: %d\n", ret); } static int rpi_touchscreen_write(struct rpi_touchscreen *ts, u16 reg, u32 val) -- Gitee From 27f402d6441a30ef1541f3bca32429aceb1ce8de Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Tue, 26 Jul 2022 17:39:20 +0800 Subject: [PATCH 083/132] drm/panel/raspberrypi-touchscreen: Initialise the bridge in prepare stable inclusion from stable-v5.10.113 commit 405d98427416849cf37c84c0c70bd5008b686a1e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=405d98427416849cf37c84c0c70bd5008b686a1e -------------------------------- [ Upstream commit 5f18c0782b99e26121efa93d20b76c19e17aa1dd ] The panel has a prepare call which is before video starts, and an enable call which is after. The Toshiba bridge should be configured before video, so move the relevant power and initialisation calls to prepare. Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.") Signed-off-by: Dave Stevenson Signed-off-by: Stefan Wahren Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-3-stefan.wahren@i2se.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index 90487df62480..4b92c6341490 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -265,7 +265,7 @@ static int rpi_touchscreen_noop(struct drm_panel *panel) return 0; } -static int rpi_touchscreen_enable(struct drm_panel *panel) +static int rpi_touchscreen_prepare(struct drm_panel *panel) { struct rpi_touchscreen *ts = panel_to_ts(panel); int i; @@ -295,6 +295,13 @@ static int rpi_touchscreen_enable(struct drm_panel *panel) rpi_touchscreen_write(ts, DSI_STARTDSI, 0x01); msleep(100); + return 0; +} + +static int rpi_touchscreen_enable(struct drm_panel *panel) +{ + struct rpi_touchscreen *ts = panel_to_ts(panel); + /* Turn on the backlight. 
*/ rpi_touchscreen_i2c_write(ts, REG_PWM, 255); @@ -349,7 +356,7 @@ static int rpi_touchscreen_get_modes(struct drm_panel *panel, static const struct drm_panel_funcs rpi_touchscreen_funcs = { .disable = rpi_touchscreen_disable, .unprepare = rpi_touchscreen_noop, - .prepare = rpi_touchscreen_noop, + .prepare = rpi_touchscreen_prepare, .enable = rpi_touchscreen_enable, .get_modes = rpi_touchscreen_get_modes, }; -- Gitee From d4248c33b8a3403ca22cce4319412763975d9b9f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 26 Jul 2022 17:39:21 +0800 Subject: [PATCH 084/132] KVM: PPC: Fix TCE handling for VFIO stable inclusion from stable-v5.10.113 commit f8f8b3124b899867a18a3f63e538c791e21252ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f8f8b3124b899867a18a3f63e538c791e21252ac -------------------------------- [ Upstream commit 26a62b750a4e6364b0393562f66759b1494c3a01 ] The LoPAPR spec defines a guest visible IOMMU with a variable page size. Currently QEMU advertises 4K, 64K, 2M, 16MB pages, a Linux VM picks the biggest (16MB). In the case of a passed though PCI device, there is a hardware IOMMU which does not support all pages sizes from the above - P8 cannot do 2MB and P9 cannot do 16MB. So for each emulated 16M IOMMU page we may create several smaller mappings ("TCEs") in the hardware IOMMU. The code wrongly uses the emulated TCE index instead of hardware TCE index in error handling. The problem is easier to see on POWER8 with multi-level TCE tables (when only the first level is preallocated) as hash mode uses real mode TCE hypercalls handlers. The kernel starts using indirect tables when VMs get bigger than 128GB (depends on the max page order). The very first real mode hcall is going to fail with H_TOO_HARD as in the real mode we cannot allocate memory for TCEs (we can in the virtual mode) but on the way out the code attempts to clear hardware TCEs using emulated TCE indexes which corrupts random kernel memory because it_offset==1<<59 is subtracted from those indexes and the resulting index is out of the TCE table bounds. This fixes kvmppc_clear_tce() to use the correct TCE indexes. While at it, this fixes TCE cache invalidation which uses emulated TCE indexes instead of the hardware ones. This went unnoticed as 64bit DMA is used these days and VMs map all RAM in one go and only then do DMA and this is when the TCE cache gets populated. Potentially this could slow down mapping, however normally 16MB emulated pages are backed by 64K hardware pages so it is one write to the "TCE Kill" per 256 updates which is not that bad considering the size of the cache (1024 TCEs or so). 
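The index mismatch is easiest to see with concrete numbers; a small illustrative helper (guest_to_hw_tce() is invented for this example, only the shift arithmetic mirrors the code below):

/*
 * A 16MB emulated IOMMU page (shift 24) backed by 64KB hardware pages
 * (shift 16) means each guest-visible TCE covers 1 << (24 - 16) = 256
 * hardware TCEs.
 */
static unsigned long guest_to_hw_tce(unsigned long entry,
				     unsigned int guest_shift,
				     unsigned int hw_shift,
				     unsigned long *subpages)
{
	*subpages = 1UL << (guest_shift - hw_shift);	/* 256             */
	return entry << (guest_shift - hw_shift);	/* entry 5 -> 1280  */
}

/*
 * On failure, hardware TCEs 1280..1535 must be cleared and their cache
 * entries killed; using the emulated index 5 instead (further offset by
 * it_offset, e.g. 1 << 59) lands outside the hardware table, which is
 * the corruption this patch fixes.
 */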
Fixes: ca1fc489cfa0 ("KVM: PPC: Book3S: Allow backing bigger guest IOMMU pages with smaller physical pages") Signed-off-by: Alexey Kardashevskiy Tested-by: David Gibson Reviewed-by: Frederic Barrat Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220420050840.328223-1-aik@ozlabs.ru Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/powerpc/kvm/book3s_64_vio.c | 45 +++++++++++++++-------------- arch/powerpc/kvm/book3s_64_vio_hv.c | 44 ++++++++++++++-------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e..c640053ab03f 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -421,13 +421,19 @@ static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, tbl[idx % TCES_PER_PAGE] = tce; } -static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); + + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, @@ -486,6 +492,8 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -545,6 +553,8 @@ static long kvmppc_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -591,10 +601,9 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill(stit->tbl, entry, 1); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry); goto unlock_exit; } } @@ -670,13 +679,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ if (get_user(tce, tces + i)) { ret = H_TOO_HARD; - goto invalidate_exit; + goto unlock_exit; } tce = be64_to_cpu(tce); if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -685,19 +694,15 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, entry, npages); - unlock_exit: srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -736,20 +741,16 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE(1); - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> 
stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index e5ba96c41f3f..57af53a6a2d8 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -247,13 +247,19 @@ static void iommu_tce_kill_rm(struct iommu_table *tbl, tbl->it_ops->tce_kill(tbl, entry, pages, true); } -static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); + + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -316,6 +322,8 @@ static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -379,6 +387,8 @@ static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -424,10 +434,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill_rm(stit->tbl, entry, 1); - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry); return ret; } } @@ -569,7 +577,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ua = 0; if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -578,19 +586,15 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_rm_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, entry, npages); - unlock_exit: if (!prereg) arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); @@ -632,20 +636,16 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } -- Gitee From c0d651a41c25108ae1ada52d0b1f1a3e3020d53f Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:39:22 +0800 Subject: [PATCH 085/132] drm/vc4: Use pm_runtime_resume_and_get 
to fix pm_runtime_get_sync() usage stable inclusion from stable-v5.10.113 commit 0a2cef65b32919af7df4df979c5eede5f7825f17 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0a2cef65b32919af7df4df979c5eede5f7825f17 -------------------------------- [ Upstream commit 3d0b93d92a2790337aa9d18cb332d02356a24126 ] If the device is already in a runtime PM enabled state pm_runtime_get_sync() will return 1. Also, we need to call pm_runtime_put_noidle() when pm_runtime_get_sync() fails, so use pm_runtime_resume_and_get() instead. this function will handle this. Fixes: 4078f5757144 ("drm/vc4: Add DSI driver") Signed-off-by: Miaoqian Lin Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220420135008.2757-1-linmq006@gmail.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/vc4/vc4_dsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_dsi.c b/drivers/gpu/drm/vc4/vc4_dsi.c index eaf276978ee7..ad84b56f4091 100644 --- a/drivers/gpu/drm/vc4/vc4_dsi.c +++ b/drivers/gpu/drm/vc4/vc4_dsi.c @@ -835,7 +835,7 @@ static void vc4_dsi_encoder_enable(struct drm_encoder *encoder) unsigned long phy_clock; int ret; - ret = pm_runtime_get_sync(dev); + ret = pm_runtime_resume_and_get(dev); if (ret) { DRM_ERROR("Failed to runtime PM enable on DSI%d\n", dsi->port); return; -- Gitee From a0820bde54b9a6f88f03400828dd5f0fdfa2051f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Tue, 26 Jul 2022 17:39:23 +0800 Subject: [PATCH 086/132] powerpc/perf: Fix power9 event alternatives stable inclusion from stable-v5.10.113 commit e012f9d1af54ca3c24ca0e9ec03a1a212972771c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e012f9d1af54ca3c24ca0e9ec03a1a212972771c -------------------------------- [ Upstream commit 0dcad700bb2776e3886fe0a645a4bf13b1e747cd ] When scheduling a group of events, there are constraint checks done to make sure all events can go in a group. Example, one of the criteria is that events in a group cannot use the same PMC. But platform specific PMU supports alternative event for some of the event codes. During perf_event_open(), if any event group doesn't match constraint check criteria, further lookup is done to find alternative event. By current design, the array of alternatives events in PMU code is expected to be sorted by column 0. This is because in find_alternative() the return criteria is based on event code comparison. ie. "event < ev_alt[i][0])". This optimisation is there since find_alternative() can be called multiple times. In power9 PMU code, the alternative event array is not sorted properly and hence there is breakage in finding alternative events. To work with existing logic, fix the alternative event array to be sorted by column 0 for power9-pmu.c Results: With alternative events, multiplexing can be avoided. That is, for example, in power9 PM_LD_MISS_L1 (0x3e054) has alternative event, PM_LD_MISS_L1_ALT (0x400f0). This is an identical event which can be programmed in a different PMC. Before: # perf stat -e r3e054,r300fc Performance counter stats for 'system wide': 1057860 r3e054 (50.21%) 379 r300fc (49.79%) 0.944329741 seconds time elapsed Since both the events are using PMC3 in this case, they are multiplexed here. 
After: # perf stat -e r3e054,r300fc Performance counter stats for 'system wide': 1006948 r3e054 182 r300fc Fixes: 91e0bd1e6251 ("powerpc/perf: Add PM_LD_MISS_L1 and PM_BR_2PATH to power9 event list") Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220419114828.89843-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/powerpc/perf/power9-pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 2a57e93a79dc..7245355bee28 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -133,11 +133,11 @@ int p9_dd22_bl_ev[] = { /* Table of alternatives, sorted by column 0 */ static const unsigned int power9_event_alternatives[][MAX_ALT] = { - { PM_INST_DISP, PM_INST_DISP_ALT }, - { PM_RUN_CYC_ALT, PM_RUN_CYC }, - { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, - { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, { PM_BR_2PATH, PM_BR_2PATH_ALT }, + { PM_INST_DISP, PM_INST_DISP_ALT }, + { PM_RUN_CYC_ALT, PM_RUN_CYC }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -- Gitee From ce66add4b99dde3edf99491d41ea364b6005683c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 26 Jul 2022 17:39:24 +0800 Subject: [PATCH 087/132] perf report: Set PERF_SAMPLE_DATA_SRC bit for Arm SPE event stable inclusion from stable-v5.10.113 commit 19590bbc691d81f03d2a24a3ec30c399ebe071e0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19590bbc691d81f03d2a24a3ec30c399ebe071e0 -------------------------------- [ Upstream commit ccb17caecfbd542f49a2a79ae088136ba8bfb794 ] Since commit bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") "perf mem report" and "perf report --mem-mode" don't report result if the PERF_SAMPLE_DATA_SRC bit is missed in sample type. The commit ffab487052054162 ("perf: arm-spe: Fix perf report --mem-mode") partially fixes the issue. It adds PERF_SAMPLE_DATA_SRC bit for Arm SPE event, this allows the perf data file generated by kernel v5.18-rc1 or later version can be reported properly. On the other hand, perf tool still fails to be backward compatibility for a data file recorded by an older version's perf which contains Arm SPE trace data. This patch is a workaround in reporting phase, when detects ARM SPE PMU event and without PERF_SAMPLE_DATA_SRC bit, it will force to set the bit in the sample type and give a warning info. 
Fixes: bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") Reviewed-by: James Clark Signed-off-by: Leo Yan Tested-by: German Gomez Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20220414123201.842754-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- tools/perf/builtin-report.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 91cab5cdfbc1..b55ee073c2f7 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -340,6 +340,7 @@ static int report__setup_sample_type(struct report *rep) struct perf_session *session = rep->session; u64 sample_type = evlist__combined_sample_type(session->evlist); bool is_pipe = perf_data__is_pipe(session->data); + struct evsel *evsel; if (session->itrace_synth_opts->callchain || session->itrace_synth_opts->add_callchain || @@ -394,6 +395,19 @@ static int report__setup_sample_type(struct report *rep) } if (sort__mode == SORT_MODE__MEMORY) { + /* + * FIXUP: prior to kernel 5.18, Arm SPE missed to set + * PERF_SAMPLE_DATA_SRC bit in sample type. For backward + * compatibility, set the bit if it's an old perf data file. + */ + evlist__for_each_entry(session->evlist, evsel) { + if (strstr(evsel->name, "arm_spe") && + !(sample_type & PERF_SAMPLE_DATA_SRC)) { + evsel->core.attr.sample_type |= PERF_SAMPLE_DATA_SRC; + sample_type |= PERF_SAMPLE_DATA_SRC; + } + } + if (!is_pipe && !(sample_type & PERF_SAMPLE_DATA_SRC)) { ui__error("Selected --mem-mode but no mem data. " "Did you call perf record without -d?\n"); -- Gitee From 97cada6b51f5737aba63d7d3f69cf413a3c30a1b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 26 Jul 2022 17:39:25 +0800 Subject: [PATCH 088/132] ext4: fix fallocate to use file_modified to update permissions consistently stable inclusion from stable-v5.10.113 commit f6038d43b25bba1cd50d2a77e207f6550aee9954 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f6038d43b25bba1cd50d2a77e207f6550aee9954 -------------------------------- commit ad5cd4f4ee4d5fcdb1bfb7a0c073072961e70783 upstream. Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero, collapse, insert range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Signed-off-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20220308185043.GA117678@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 32 +++++++++++++++++++++++++------- fs/ext4/inode.c | 7 ++++++- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 277f89d5de03..df43cda8fc2f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2908,7 +2908,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); -extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4323186bae78..9d06695c04ab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4510,9 +4510,9 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, return ret > 0 ? ret2 : ret; } -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) @@ -4583,6 +4583,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4707,17 +4711,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto exit; if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(inode, offset, len); + ret = ext4_punch_hole(file, offset, len); goto exit; } if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(inode, offset, len); + ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(inode, offset, len); + ret = ext4_insert_range(file, offset, len); goto exit; } @@ -4753,6 +4757,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out; + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; @@ -5255,8 +5263,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. 
*/ -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; handle_t *handle; @@ -5307,6 +5316,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. @@ -5401,8 +5414,9 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * by len bytes. * Returns 0 on success, error otherwise. */ -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; handle_t *handle; struct ext4_ext_path *path; @@ -5458,6 +5472,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e85c238edd85..69fcf82f2c81 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3972,8 +3972,9 @@ int ext4_break_layouts(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; @@ -4026,6 +4027,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. -- Gitee From aaf9e2faf5bb4ea97264d9f45f6c6e35f73f17e8 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 26 Jul 2022 17:39:26 +0800 Subject: [PATCH 089/132] ext4: limit length to bitmap_maxbytes - blocksize in punch_hole stable inclusion from stable-v5.10.113 commit 22c450d39f8922ae26de459cf4f83b2b294f207e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=22c450d39f8922ae26de459cf4f83b2b294f207e -------------------------------- commit 2da376228a2427501feb9d15815a45dbdbdd753e upstream. Syzbot found an issue [1] in ext4_fallocate(). The C reproducer [2] calls fallocate(), passing size 0xffeffeff000ul, and offset 0x1000000ul, which, when added together exceed the bitmap_maxbytes for the inode. This triggers a BUG in ext4_ind_remove_space(). According to the comments in this function the 'end' parameter needs to be one block after the last block to be removed. In the case when the BUG is triggered it points to the last block. Modify the ext4_punch_hole() function and add constraint that caps the length to satisfy the one before laster block requirement. 
LINK: [1] https://syzkaller.appspot.com/bug?id=b80bd9cf348aac724a4f4dff251800106d721331 LINK: [2] https://syzkaller.appspot.com/text?tag=ReproC&x=14ba0238700000 Fixes: a4bb6b64e39a ("ext4: enable "punch hole" functionality") Reported-by: syzbot+7a806094edd5d07ba029@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Link: https://lore.kernel.org/r/20220331200515.153214-1-tadeusz.struk@linaro.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 69fcf82f2c81..f4e0c7cc4820 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3978,7 +3978,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset; + loff_t first_block_offset, last_block_offset, max_length; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; @@ -4012,6 +4013,14 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } + /* + * For punch hole the length + offset needs to be within one block + * before last range. Adjust the length if it goes beyond that limit. + */ + max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; + if (offset + length > max_length) + length = max_length - offset; + if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* -- Gitee From cf4de22f4fd1ee1f6367a12848686f71d1293010 Mon Sep 17 00:00:00 2001 From: "wangjianjian (C)" Date: Tue, 26 Jul 2022 17:39:27 +0800 Subject: [PATCH 090/132] ext4, doc: fix incorrect h_reserved size stable inclusion from stable-v5.10.113 commit 0c54b093766becb9c83317232c93290cf612b8ff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0c54b093766becb9c83317232c93290cf612b8ff -------------------------------- commit 7102ffe4c166ca0f5e35137e9f9de83768c2d27d upstream. According to document and code, ext4_xattr_header's size is 32 bytes, so h_reserved size should be 3. Signed-off-by: Wang Jianjian Link: https://lore.kernel.org/r/92fcc3a6-7d77-8c09-4126-377fcb4c46a5@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- Documentation/filesystems/ext4/attributes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/ext4/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 54386a010a8d..871d2da7a0a9 100644 --- a/Documentation/filesystems/ext4/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst @@ -76,7 +76,7 @@ The beginning of an extended attribute block is in - Checksum of the extended attribute block. * - 0x14 - \_\_u32 - - h\_reserved[2] + - h\_reserved[3] - Zero. 
The checksum is calculated against the FS UUID, the 64-bit block number -- Gitee From 27d40cc66fb275b8c38be0766f63cd14118d4423 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 26 Jul 2022 17:39:28 +0800 Subject: [PATCH 091/132] ext4: fix overhead calculation to account for the reserved gdt blocks stable inclusion from stable-v5.10.113 commit 4789149b9ea2a1893c62d816742f1a76514fc901 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4789149b9ea2a1893c62d816742f1a76514fc901 -------------------------------- commit 10b01ee92df52c8d7200afead4d5e5f55a5c58b1 upstream. The kernel calculation was underestimating the overhead by not taking into account the reserved gdt blocks. With this change, the overhead calculated by the kernel matches the overhead calculation in mke2fs. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 26c938360895..34e4e00445da 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3912,9 +3912,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + -- Gitee From fd9c2ff88c62fbb32351bc8165f473a54e91817c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 26 Jul 2022 17:39:29 +0800 Subject: [PATCH 092/132] ext4: force overhead calculation if the s_overhead_cluster makes no sense stable inclusion from stable-v5.10.113 commit e1e96e37272156d691203a3725b876787f38c8f2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e1e96e37272156d691203a3725b876787f38c8f2 -------------------------------- commit 85d825dbf4899a69407338bae462a59aa9a37326 upstream. If the file system does not use bigalloc, calculating the overhead is cheap, so force the recalculation of the overhead so we don't have to trust the precalculated overhead in the superblock. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/super.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 34e4e00445da..706a159817ee 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4977,9 +4977,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Get the # of file system overhead blocks from the * superblock if present. 
*/ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; -- Gitee From f120da4c1f854aadb526a8fd86e3caf0c9fdbe5e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 26 Jul 2022 17:39:30 +0800 Subject: [PATCH 093/132] can: isotp: stop timeout monitoring when no first frame was sent stable inclusion from stable-v5.10.113 commit 50aac44273600cb0ae1efd010bb1de7701444a41 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=50aac44273600cb0ae1efd010bb1de7701444a41 -------------------------------- commit d73497081710c876c3c61444445512989e102152 upstream. The first attempt to fix a the 'impossible' WARN_ON_ONCE(1) in isotp_tx_timer_handler() focussed on the identical CAN IDs created by the syzbot reproducer and lead to upstream fix/commit 3ea566422cbd ("can: isotp: sanitize CAN ID checks in isotp_bind()"). But this did not catch the root cause of the wrong tx.state in the tx_timer handler. In the isotp 'first frame' case a timeout monitoring needs to be started before the 'first frame' is send. But when this sending failed the timeout monitoring for this specific frame has to be disabled too. Otherwise the tx_timer is fired with the 'warn me' tx.state of ISOTP_IDLE. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Link: https://lore.kernel.org/all/20220405175112.2682-1-socketcan@hartkopp.net Reported-by: syzbot+2339c27f5c66c652843e@syzkaller.appspotmail.com Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/can/isotp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index 9a4a9c5a9f24..c515bbd46c67 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -864,6 +864,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 
1 : 0; + s64 hrtimer_sec = 0; int off; int err; @@ -962,7 +963,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) isotp_create_fframe(cf, so, ae); /* start timeout for FC */ - hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + hrtimer_sec = 1; + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), + HRTIMER_MODE_REL_SOFT); } /* send the first or only CAN frame */ @@ -975,6 +978,11 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %d\n", __func__, err); + + /* no transmission -> no timeout monitoring */ + if (hrtimer_sec) + hrtimer_cancel(&so->txtimer); + goto err_out_drop; } -- Gitee From 0c5ab04a2f48e4ea5814c339410bc679062b41ea Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 26 Jul 2022 17:39:31 +0800 Subject: [PATCH 094/132] spi: atmel-quadspi: Fix the buswidth adjustment between spi-mem and controller stable inclusion from stable-v5.10.113 commit dccee748af17fc087ff5017152e532ef8e18c8c0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=dccee748af17fc087ff5017152e532ef8e18c8c0 -------------------------------- commit 8c235cc25087495c4288d94f547e9d3061004991 upstream. Use the spi_mem_default_supports_op() core helper in order to take into account the buswidth specified by the user in device tree. Cc: Fixes: 0e6aae08e9ae ("spi: Add QuadSPI driver for Atmel SAMA5D2") Signed-off-by: Tudor Ambarus Link: https://lore.kernel.org/r/20220406133604.455356-1-tudor.ambarus@microchip.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/spi/atmel-quadspi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 1e63fd4821f9..8aa89d93db11 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -277,6 +277,9 @@ static int atmel_qspi_find_mode(const struct spi_mem_op *op) static bool atmel_qspi_supports_op(struct spi_mem *mem, const struct spi_mem_op *op) { + if (!spi_mem_default_supports_op(mem, op)) + return false; + if (atmel_qspi_find_mode(op) < 0) return false; -- Gitee From dd76b5fe66faa3615f8d284d1fd07f1cccabf125 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 26 Jul 2022 17:39:32 +0800 Subject: [PATCH 095/132] staging: ion: Prevent incorrect reference counting behavour stable inclusion from stable-v5.10.113 commit fea24b07edfc348c67a019b6e17b39c0698e631f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fea24b07edfc348c67a019b6e17b39c0698e631f -------------------------------- Supply additional check in order to prevent unexpected results. 
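Concretely, the added check keeps buffer->kmap_cnt from wrapping past INT_MAX when an existing kernel mapping is reused. Reduced to the bare pattern, and with made-up names rather than the ion driver's own, it looks like this:

	#include <limits.h>
	#include <stddef.h>

	struct demo_buffer {
		int kmap_cnt;	/* users currently holding the kernel mapping */
		void *vaddr;
	};

	/* Hand out the existing mapping, refusing to overflow the counter. */
	static void *demo_kmap_get(struct demo_buffer *buffer)
	{
		if (buffer->kmap_cnt) {
			if (buffer->kmap_cnt == INT_MAX)
				return NULL;	/* the driver returns ERR_PTR(-EOVERFLOW) here */
			buffer->kmap_cnt++;
			return buffer->vaddr;
		}
		/* first user: the real code creates the mapping at this point */
		return NULL;
	}

Saturating at INT_MAX means a runaway caller gets an error instead of silently wrapping the count negative and confusing the later kunmap/put paths.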
Fixes: b892bf75b2034 ("ion: Switch ion to use dma-buf") Suggested-by: Dan Carpenter Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/staging/android/ion/ion.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index e1fe03ceb7f1..e6d4a3ee6cda 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -114,6 +114,9 @@ static void *ion_buffer_kmap_get(struct ion_buffer *buffer) void *vaddr; if (buffer->kmap_cnt) { + if (buffer->kmap_cnt == INT_MAX) + return ERR_PTR(-EOVERFLOW); + buffer->kmap_cnt++; return buffer->vaddr; } -- Gitee From 9a1fc0d8feaf050654872afaa9e66323f12e4a11 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 26 Jul 2022 17:39:33 +0800 Subject: [PATCH 096/132] block/compat_ioctl: fix range check in BLKGETSIZE stable inclusion from stable-v5.10.113 commit 8bedbc8f7f35f533000b347644b6bf1f62524676 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8bedbc8f7f35f533000b347644b6bf1f62524676 -------------------------------- commit ccf16413e520164eb718cf8b22a30438da80ff23 upstream. kernel ulong and compat_ulong_t may not be same width. Use type directly to eliminate mismatches. This would result in truncation rather than EFBIG for 32bit mode for large disks. Reviewed-by: Bart Van Assche Signed-off-by: Khazhismel Kumykov Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220414224056.2875681-1-khazhy@google.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- block/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/ioctl.c b/block/ioctl.c index 8171858dc8a9..7c578f84a4fd 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -683,7 +683,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if ((size >> 9) > ~(compat_ulong_t)0) return -EFBIG; return compat_put_ulong(argp, size >> 9); -- Gitee From 156b297acb77278b3f11077a9e8f92e81c87aebe Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 26 Jul 2022 17:39:34 +0800 Subject: [PATCH 097/132] Revert "net: micrel: fix KS8851_MLL Kconfig" stable inclusion from stable-v5.10.113 commit 7992fdb045fbc7cb0e34eba464b73044585c0638 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7992fdb045fbc7cb0e34eba464b73044585c0638 -------------------------------- This reverts commit 1ff5359afa5ec0dd09fe76183dc4fa24b50e4125 which is commit c3efcedd272aa6dd5929e20cf902a52ddaa1197a upstream. The upstream commit c3efcedd272a ("net: micrel: fix KS8851_MLL Kconfig") depends on e5f31552674e ("ethernet: fix PTP_1588_CLOCK dependencies") which is not part of Linux 5.10.y . Revert the aforementioned commit to prevent breakage in 5.10.y . Signed-off-by: Marek Vasut Cc: David S. 
Miller Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Randy Dunlap Cc: Sasha Levin Cc: # 5.10.x Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/micrel/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig index 9ceb7e1fb169..42bc014136fe 100644 --- a/drivers/net/ethernet/micrel/Kconfig +++ b/drivers/net/ethernet/micrel/Kconfig @@ -37,7 +37,6 @@ config KS8851 config KS8851_MLL tristate "Micrel KS8851 MLL" depends on HAS_IOMEM - depends on PTP_1588_CLOCK_OPTIONAL select MII select CRC32 select EEPROM_93CX6 -- Gitee From 03a99552eb74a789de7de2a13b0ea9b31dcf2dfa Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 28 Jul 2022 18:06:31 +0800 Subject: [PATCH 098/132] bpf: Don't redirect packets with invalid pkt_len mainline inclusion from mainline-v5.19-rc6 commit fd1894224407c484f652ad456e1ce423e89bb3eb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HWKR CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fd1894224407 -------------------------------- Syzbot found an issue [1]: fq_codel_drop() try to drop a flow whitout any skbs, that is, the flow->head is null. The root cause, as the [2] says, is because that bpf_prog_test_run_skb() run a bpf prog which redirects empty skbs. So we should determine whether the length of the packet modified by bpf prog or others like bpf_prog_test is valid before forwarding it directly. LINK: [1] https://syzkaller.appspot.com/bug?id=0b84da80c2917757915afa89f7738a9d16ec96c5 LINK: [2] https://www.spinics.net/lists/netdev/msg777503.html Reported-by: syzbot+7a12909485b94426aceb@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220715115559.139691-1-shaozhengchao@huawei.com Signed-off-by: Alexei Starovoitov Reviewed-by: Wei Yongjun Reviewed-by: Yue Haibing Signed-off-by: Zheng Zengkai --- include/linux/skbuff.h | 8 ++++++++ net/bpf/test_run.c | 3 +++ net/core/dev.c | 1 + 3 files changed, 12 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 68efccc15a87..72cfb047fd2c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2251,6 +2251,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_assert_len(struct sk_buff *skb) +{ +#ifdef CONFIG_DEBUG_NET + if (WARN_ONCE(!skb->len, "%s\n", __func__)) + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); +#endif /* CONFIG_DEBUG_NET */ +} + /* * Add data to an sk_buff */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 99712d35e535..f266a9453c8e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -398,6 +398,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + if (!skb->len) + return -EINVAL; + if (!__skb) return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 12089c484b30..8e4de36eede8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4094,6 +4094,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) bool again = false; skb_reset_mac_header(skb); + skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); -- Gitee From 92561effc05f39fbf33d80fd0f9e5978fb7b3a71 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: 
Thu, 28 Jul 2022 18:06:32 +0800 Subject: [PATCH 099/132] ucounts: add missing data type changes mainline inclusion from mainline-v5.14-rc6 commit f153c2246783ba210493054d99c66353f56423c9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5IDIC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f153c2246783ba210493054d99c66353f56423c9 -------------------------------- commit f9c82a4ea89c3 ("Increase size of ucounts to atomic_long_t") changed the data type of ucounts/ucounts_max to long, but missed to adjust a few other places. This is noticeable on big endian platforms from user space because the /proc/sys/user/max_*_names files all contain 0. v4 - Made the min and max constants long so the sysctl values are actually settable on little endian machines. -- EWB Fixes: f9c82a4ea89c ("Increase size of ucounts to atomic_long_t") Signed-off-by: Sven Schnelle Tested-by: Nathan Chancellor Tested-by: Linux Kernel Functional Testing Acked-by: Alexey Gladkov v1: https://lkml.kernel.org/r/20210721115800.910778-1-svens@linux.ibm.com v2: https://lkml.kernel.org/r/20210721125233.1041429-1-svens@linux.ibm.com v3: https://lkml.kernel.org/r/20210730062854.3601635-1-svens@linux.ibm.com Link: https://lkml.kernel.org/r/8735rijqlv.fsf_-_@disp2133 Signed-off-by: Eric W. Biederman Conflict: fs/notify/fanotify/fanotify_user.c Signed-off-by: Li Nan Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/notify/inotify/inotify_user.c | 17 +++++++++++------ kernel/ucount.c | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5f6c6bf65909..f13b64729d08 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -46,22 +46,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include +static long it_zero = 0; +static long it_int_max = INT_MAX; + struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_queued_events", diff --git a/kernel/ucount.c b/kernel/ucount.c index dff1d9b739d2..1f5825b674d8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -52,14 +52,17 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; -#define UCOUNT_ENTRY(name) \ - { \ - .procname = name, \ - .maxlen = sizeof(int), \ - .mode = 0644, \ - .proc_handler = proc_dointvec_minmax, \ - .extra1 = SYSCTL_ZERO, \ - .extra2 = SYSCTL_INT_MAX, \ +static long ue_zero = 0; +static long ue_int_max = INT_MAX; + +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(long), \ + .mode = 0644, \ + .proc_handler = proc_doulongvec_minmax, \ + .extra1 = &ue_zero, \ + .extra2 = &ue_int_max, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), -- Gitee From a51eb65ba1c5c5fcf7c026732d5ace1f9cbaf8f9 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Thu, 28 Jul 
2022 18:06:33 +0800 Subject: [PATCH 100/132] block: Fix warning in bd_link_disk_holder() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Warning reports as follows: WARNING: CPU: 3 PID: 674 at fs/block_dev.c:1272 bd_link_disk_holder+0xcd/0x270 Modules linked in: null_blk(+) CPU: 3 PID: 674 Comm: dmsetup Not tainted 5.10.0-16691-gf6076432827d-dirty #158 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 RIP: 0010:bd_link_disk_holder+0xcd/0x270 Code: 69 73 ee 00 44 89 e8 5b 48 83 05 c5 bf 6d 0c 01 5d 41 5c 41 5d 41 5e 41 8 RSP: 0018:ffffc9000049bbb8 EFLAGS: 00010202 RAX: ffff888104e39038 RBX: ffff888104185000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffaa085692 RDI: 0000000000000000 RBP: ffff88810cc2ae00 R08: ffffffffa853659b R09: 0000000000000000 R10: ffffc9000049bbb0 R11: 720030626c6c756e R12: ffff88810e800000 R13: ffff88810e800090 R14: ffff888103570c98 R15: ffff888103570c80 FS: 00007fb49dc13dc0(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff994ebde70 CR3: 000000010d54a000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: dm_get_table_device+0x175/0x300 dm_get_device+0x238/0x360 linear_ctr+0xee/0x170 dm_table_add_target+0x199/0x4b0 table_load+0x18c/0x480 ? table_clear+0x190/0x190 ctl_ioctl+0x21d/0x640 ? check_preemption_disabled+0x140/0x150 dm_ctl_ioctl+0x12/0x20 __se_sys_ioctl+0xb1/0x100 __x64_sys_ioctl+0x1e/0x30 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This can reproduce by concurrent operations: 1. modprobe null_blk 2. echo -e "0 10000 linear /dev/nullb0 0" > table dmsetup create xxx table t1: create disk a | t2: dm setup | device_add_disk | dev->devt = devt | | dm_get_table_device | open_table_device | blkdev_get_by_dev -> succeed | bd_link_disk_holder | -> holder_dir is still NULL register_disk -> create holder_dir kobject_create_and_add device_add_disk() will set devt before creating holder_dir, which leaves a window that dm_get_table_device() can find the disk by devt while it's holder_dir is NULL. So move GENHD_FL_UP in blk_register_queue() to avoid this warning and fix a NULL-ptr in __blk_mq_sched_bio_merge(). Signed-off-by: Luo Meng Reviewed-by: Jason Yan Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 6 ++++++ block/genhd.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0a4fcbda8ab4..aff53c3ae836 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -910,6 +910,12 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); + + /* + * Set the flag at last, so that block devcie can't be opened + * before it's registration is done. 
+ */ + disk->flags |= GENHD_FL_UP; ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/genhd.c b/block/genhd.c index 8b37fcfa10d1..9d91f880ea95 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -799,8 +799,6 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, WARN_ON(!disk->minors && !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); - disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); if (retval) { WARN_ON(1); -- Gitee From d7c2ddc8456d91991dbc87da304742d60d577b41 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 28 Jul 2022 18:06:34 +0800 Subject: [PATCH 101/132] block: fix that part scan is disabled in device_add_disk() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Patch ("block: Fix warning in bd_link_disk_holder()") moves the setting of flag 'GENHD_FL_UP' behind blkdev_get, which will disabled part scan: devcie_add_disk register_disk blkdev_get __blkdev_get bdev_get_gendisk get_gendisk -> failed because 'GENHD_FL_UP' is not set And this will cause tests block/017, block/018 and scsi/004 to fail. Fix the problem by moving part scan as well. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 33 +++++++++++++++++++++++++++++++++ block/genhd.c | 27 --------------------------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index aff53c3ae836..f7cd16cec0ed 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -821,6 +821,38 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void disk_init_partition(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_scan_partitions(disk); + + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -916,6 +948,7 @@ int blk_register_queue(struct gendisk *disk) * before it's registration is done. 
*/ disk->flags |= GENHD_FL_UP; + disk_init_partition(disk); ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/genhd.c b/block/genhd.c index 9d91f880ea95..021c9c2d7231 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -687,25 +687,10 @@ static int exact_lock(dev_t devt, void *data) return 0; } -static void disk_scan_partitions(struct gendisk *disk) -{ - struct block_device *bdev; - - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; - - set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); -} - static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct hd_struct *part; int err; ddev->parent = parent; @@ -743,18 +728,6 @@ static void register_disk(struct device *parent, struct gendisk *disk, if (disk->flags & GENHD_FL_HIDDEN) return; - disk_scan_partitions(disk); - - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); - if (disk->queue->backing_dev_info->dev) { err = sysfs_create_link(&ddev->kobj, &disk->queue->backing_dev_info->dev->kobj, -- Gitee From 67b9d2775f9adafd9a4f5d8a7cc05f9cce884f1a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 28 Jul 2022 18:06:35 +0800 Subject: [PATCH 102/132] block: prevent lockdep false positive warning about 'bd_mutex' hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Patch ("block: fix that part scan is disabled in device_add_disk()") confuse lockdep to produce following warning: ===================================================== WARNING: possible circular locking dependency detected 4.18.0+ #2 Tainted: G ---------r- - ------------------------------------------------------ syz-executor.0/4652 is trying to acquire lock: 00000000ad5f5a19 (&mddev->open_mutex){+.+.}, at: md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626 but task is already holding lock: 000000005c3a3fea (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&bdev->bd_mutex){+.+.}: __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 disk_init_partition home/install/linux-rh-3-10/block/blk-sysfs.c:972 [inline] blk_register_queue+0x5ed/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1055 __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729 sd_probe_async+0x447/0x852 home/install/linux-rh-3-10/drivers/scsi/sd.c:3249 async_run_entry_fn+0xe1/0x700 home/install/linux-rh-3-10/kernel/async.c:127 process_one_work+0x9cf/0x1940 home/install/linux-rh-3-10/kernel/workqueue.c:2175 worker_thread+0x91/0xc50 home/install/linux-rh-3-10/kernel/workqueue.c:2321 kthread+0x33a/0x400 home/install/linux-rh-3-10/kernel/kthread.c:257 ret_from_fork+0x3a/0x50 home/install/linux-rh-3-10/arch/x86/entry/entry_64.S:355 -> #1 (&q->sysfs_dir_lock){+.+.}: __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 blk_register_queue+0x143/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1010 __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729 add_disk home/install/linux-rh-3-10/./include/linux/genhd.h:447 [inline] md_alloc+0xb06/0x10d0 home/install/linux-rh-3-10/drivers/md/md.c:5525 md_probe+0x32/0x60 home/install/linux-rh-3-10/drivers/md/md.c:5554 kobj_lookup+0x2d2/0x450 home/install/linux-rh-3-10/drivers/base/map.c:152 get_gendisk+0x3b/0x360 home/install/linux-rh-3-10/block/genhd.c:860 bdev_get_gendisk home/install/linux-rh-3-10/fs/block_dev.c:1181 [inline] __blkdev_get+0x3b6/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1578 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923 do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777 do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline] path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578 do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613 do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf -> #0 (&mddev->open_mutex){+.+.}: lock_acquire+0x10b/0x3a0 home/install/linux-rh-3-10/kernel/locking/lockdep.c:3868 __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626 __blkdev_get+0x2dc/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1599 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923 do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777 do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline] path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578 do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613 do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf other info 
that might help us debug this: Chain exists of: &mddev->open_mutex --> &q->sysfs_dir_lock --> &bdev->bd_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&bdev->bd_mutex); lock(&q->sysfs_dir_lock); lock(&bdev->bd_mutex); lock(&mddev->open_mutex); *** DEADLOCK *** Since 'bd_mutex' and 'sysfs_dir_lock' is different is for each device, deadlock between md_open() and sd_probe_async() is impossible. However, lockdep is treating 'bd_mutex' and 'sysfs_dir_lock' from different devices the same, and patch "block: fix that part scan is disabled in device_add_disk()" is holding 'bd_mutex' inside 'sysfs_dir_lock', which causes the false positive warning. Fix the false positive warning by don't grab 'bd_mutex' inside 'sysfs_dir_lock'. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f7cd16cec0ed..548d758365c6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -948,10 +948,17 @@ int blk_register_queue(struct gendisk *disk) * before it's registration is done. */ disk->flags |= GENHD_FL_UP; - disk_init_partition(disk); ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); + /* + * Init partitions after releasing 'sysfs_dir_lock', otherwise lockdep + * will be confused because it will treat 'bd_mutex' from different + * devices as the same lock. + */ + if (!ret) + disk_init_partition(disk); + return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); -- Gitee From 77eabe0b93e9dd68b400d52c9caa9282ca6094df Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 28 Jul 2022 18:06:36 +0800 Subject: [PATCH 103/132] inotify: show inotify mask flags in proc fdinfo mainline inclusion from mainline-v5.19-rc1 commit a32e697cda27679a0327ae2cafdad8c7170f548f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5IHD1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a32e697cda27679a0327ae2cafdad8c7170f548f -------------------------------- The inotify mask flags IN_ONESHOT and IN_EXCL_UNLINK are not "internal to kernel" and should be exposed in procfs fdinfo so CRIU can restore them. Fixes: 6933599697c9 ("inotify: hide internal kernel bits from fdinfo") Link: https://lore.kernel.org/r/20220422120327.3459282-2-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Signed-off-by: Li Nan Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/notify/fdinfo.c | 11 ++--------- fs/notify/inotify/inotify.h | 12 ++++++++++++ fs/notify/inotify/inotify_user.c | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index f0d6b54be412..765b50aeadd2 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,16 +83,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { - /* - * IN_ALL_EVENTS represents all of the mask bits - * that we expose to userspace. There is at - * least one bit (FS_EVENT_ON_CHILD) which is - * used only internally to the kernel. 
- */ - u32 mask = mark->mask & IN_ALL_EVENTS; - seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", + seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mask, mark->ignored_mask); + inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 2007e3711916..8f00151eb731 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -22,6 +22,18 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct inotify_event_info, fse); } +/* + * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to + * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ +#define INOTIFY_USER_MASK (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK) + +static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark) +{ + return fsn_mark->mask & INOTIFY_USER_MASK; +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f13b64729d08..3986f1877457 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -93,7 +93,7 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) mask |= FS_EVENT_ON_CHILD; /* mask off the flags used to open the fd */ - mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + mask |= (arg & INOTIFY_USER_MASK); return mask; } -- Gitee From c248fb22ed21b7bf4308bfcdf962d11b4f73f62c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:37 +0800 Subject: [PATCH 104/132] fbcon: Disallow setting font bigger than screen size stable inclusion from stable-v5.10.130 commit b727561ddc9360de9631af2d970d8ffed676a750 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b727561ddc9360de9631af2d970d8ffed676a750 -------------------------------- commit 65a01e601dbba8b7a51a2677811f70f783766682 upstream. Prevent that users set a font size which is bigger than the physical screen. It's unlikely this may happen (because screens are usually much larger than the fonts and each font char is limited to 32x32 pixels), but it may happen on smaller screens/LCD displays. Signed-off-by: Helge Deller Reviewed-by: Daniel Vetter Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbcon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index f102519ccefb..8d81e9321cf7 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2510,6 +2510,11 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font, if (charcount != 256 && charcount != 512) return -EINVAL; + /* font bigger than screen resolution ? 
*/ + if (w > FBCON_SWAP(info->var.rotate, info->var.xres, info->var.yres) || + h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres)) + return -EINVAL; + /* Make sure drawing engine can handle the font */ if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || !(info->pixmap.blit_y & (1 << (font->height - 1)))) -- Gitee From e664e9804a489f76e72be7601f53654b7d29014f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:38 +0800 Subject: [PATCH 105/132] fbcon: Prevent that screen size is smaller than font size stable inclusion from stable-v5.10.130 commit cecb806c766c78e1be62b6b7b1483ef59bbaeabe category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cecb806c766c78e1be62b6b7b1483ef59bbaeabe -------------------------------- commit e64242caef18b4a5840b0e7a9bff37abd4f4f933 upstream. We need to prevent that users configure a screen size which is smaller than the currently selected font size. Otherwise rendering chars on the screen will access memory outside the graphics memory region. This patch adds a new function fbcon_modechange_possible() which implements this check and which later may be extended with other checks if necessary. The new function is called from the FBIOPUT_VSCREENINFO ioctl handler in fbmem.c, which will return -EINVAL if userspace asked for a too small screen size. Signed-off-by: Helge Deller Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbcon.c | 28 ++++++++++++++++++++++++++++ drivers/video/fbdev/core/fbmem.c | 4 +++- include/linux/fbcon.h | 4 ++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 8d81e9321cf7..b4260a830e78 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2776,6 +2776,34 @@ void fbcon_update_vcs(struct fb_info *info, bool all) } EXPORT_SYMBOL(fbcon_update_vcs); +/* let fbcon check if it supports a new screen resolution */ +int fbcon_modechange_possible(struct fb_info *info, struct fb_var_screeninfo *var) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + unsigned int i; + + WARN_CONSOLE_UNLOCKED(); + + if (!ops) + return 0; + + /* prevent setting a screen size which is smaller than font size */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (!vc || vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[i]] != info) + continue; + + if (vc->vc_font.width > FBCON_SWAP(var->rotate, var->xres, var->yres) || + vc->vc_font.height > FBCON_SWAP(var->rotate, var->yres, var->xres)) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fbcon_modechange_possible); + int fbcon_mode_deleted(struct fb_info *info, struct fb_videomode *mode) { diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 00939ca2065a..d2cffaf3fda8 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1109,7 +1109,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EFAULT; console_lock(); lock_fb_info(info); - ret = fb_set_var(info, &var); + ret = fbcon_modechange_possible(info, &var); + if (!ret) + ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); 
unlock_fb_info(info); diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h index ff5596dd30f8..2382dec6d6ab 100644 --- a/include/linux/fbcon.h +++ b/include/linux/fbcon.h @@ -15,6 +15,8 @@ void fbcon_new_modelist(struct fb_info *info); void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps); void fbcon_fb_blanked(struct fb_info *info, int blank); +int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var); void fbcon_update_vcs(struct fb_info *info, bool all); void fbcon_remap_all(struct fb_info *info); int fbcon_set_con2fb_map_ioctl(void __user *argp); @@ -33,6 +35,8 @@ static inline void fbcon_new_modelist(struct fb_info *info) {} static inline void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps) {} static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {} +static inline int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var) { return 0; } static inline void fbcon_update_vcs(struct fb_info *info, bool all) {} static inline void fbcon_remap_all(struct fb_info *info) {} static inline int fbcon_set_con2fb_map_ioctl(void __user *argp) { return 0; } -- Gitee From 9ba05bbca9335251bfdaa95a344034c31969c8a0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:39 +0800 Subject: [PATCH 106/132] fbmem: Check virtual screen sizes in fb_set_var() stable inclusion from stable-v5.10.130 commit b81212828ad19ab3eccf00626cd04099215060bf category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b81212828ad19ab3eccf00626cd04099215060bf -------------------------------- commit 6c11df58fd1ac0aefcb3b227f72769272b939e56 upstream. Verify that the fbdev or drm driver correctly adjusted the virtual screen sizes. On failure report the failing driver and reject the screen size change. Signed-off-by: Helge Deller Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbmem.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index d2cffaf3fda8..3b3ccb235522 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1019,6 +1019,16 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) if (ret) return ret; + /* verify that virtual resolution >= physical resolution */ + if (var->xres_virtual < var->xres || + var->yres_virtual < var->yres) { + pr_warn("WARNING: fbcon: Driver '%s' missed to adjust virtual screen size (%ux%u vs. 
%ux%u)\n", + info->fix.id, + var->xres_virtual, var->yres_virtual, + var->xres, var->yres); + return -EINVAL; + } + if ((var->activate & FB_ACTIVATE_MASK) != FB_ACTIVATE_NOW) return 0; -- Gitee From 89e9ad6dfaef94bddf0aa0dcd1e2c0bd69dcbb57 Mon Sep 17 00:00:00 2001 From: Eric Snowberg Date: Thu, 28 Jul 2022 18:06:40 +0800 Subject: [PATCH 107/132] lockdown: Fix kexec lockdown bypass with ima policy mainline inclusion from mainline-v5.19-rc8 commit 543ce63b664e2c2f9533d089a4664b559c3e6b5b category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5I0FP CVE: CVE-2022-21505 Reference: https://seclists.org/oss-sec/2022/q3/57 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=543ce63b664e2c2f9533d089a4664b559c3e6b5b -------------------------------- The lockdown LSM is primarily used in conjunction with UEFI Secure Boot. This LSM may also be used on machines without UEFI. It can also be enabled when UEFI Secure Boot is disabled. One of lockdown's features is to prevent kexec from loading untrusted kernels. Lockdown can be enabled through a bootparam or after the kernel has booted through securityfs. If IMA appraisal is used with the "ima_appraise=log" boot param, lockdown can be defeated with kexec on any machine when Secure Boot is disabled or unavailable. IMA prevents setting "ima_appraise=log" from the boot param when Secure Boot is enabled, but this does not cover cases where lockdown is used without Secure Boot. To defeat lockdown, boot without Secure Boot and add ima_appraise=log to the kernel command line; then: $ echo "integrity" > /sys/kernel/security/lockdown $ echo "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig" > \ /sys/kernel/security/ima/policy $ kexec -ls unsigned-kernel Add a call to verify ima appraisal is set to "enforce" whenever lockdown is enabled. This fixes CVE-2022-21505. Cc: stable@vger.kernel.org Fixes: 29d3c1c8dfe7 ("kexec: Allow kexec_file() with appropriate IMA policy when locked down") Signed-off-by: Eric Snowberg Acked-by: Mimi Zohar Reviewed-by: John Haxby Signed-off-by: Linus Torvalds Signed-off-by: GUO Zihua Reviewed-by: Xiu Jianfeng Reviewed-by: Wang Weiyang Signed-off-by: Zheng Zengkai --- security/integrity/ima/ima_policy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 5a06050729a7..b1ab4b3d99fb 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1900,6 +1900,10 @@ bool ima_appraise_signature(enum kernel_read_file_id id) if (id >= READING_MAX_ID) return false; + if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) + && security_locked_down(LOCKDOWN_KEXEC)) + return false; + func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); -- Gitee From 6658b33b26920a82df3203c6ecefd00b5820e921 Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Thu, 28 Jul 2022 18:06:41 +0800 Subject: [PATCH 108/132] ubifs: Fix the issue that UBIFS be read-only due to truncate in the encrypted directory hulk inclusion category: bugfix bugzilla: 187163, https://gitee.com/openeuler/kernel/issues/I5GBC4 CVE: NA -------------------------------- The ubifs_compress() function does not compress the data When the data length is short than 128 bytes or the compressed data length is not ideal.It cause that the compressed length of the truncated data in the truncate_data_node() function may be greater than the length of the raw data read from the flash. 
The above two lengths are transferred to the ubifs_encrypt() function as parameters. This may lead to assertion fails and then the file system becomes read-only. This patch use the actual length of the data in the memory as the input parameter for assert comparison, which avoids the problem. Signed-off-by: ZhaoLong Wang Reviewed-by: zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/ubifs/crypto.c | 11 +++++++++++ fs/ubifs/journal.c | 29 +++++++++++++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..1dc22cf29c65 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } +/** + * ubifs_encrypt - Encrypt data. + * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of data to be compressed + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypt a possibly-compressed data in the data node. + * The encrypted data length will store in @out_len. + */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 72586512e51f..ee9888087983 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1472,23 +1472,25 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. 
*/ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in } if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out; - out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1549,7 +1551,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, struct ubifs_ino_node *ino; struct ubifs_trun_node *trun; struct ubifs_data_node *dn; - int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int err, dlen, len, lnum, offs, bit, sz, dn_size, sync = IS_SYNC(inode); struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1564,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); - sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; + + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE; - sz += ubifs_auth_node_sz(c); + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c); ino = kmalloc(sz, GFP_NOFS); if (!ino) @@ -1596,15 +1601,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; } if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; } -- Gitee From baf97eb4ed59819bb8e2d59570057de458a351d1 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Thu, 28 Jul 2022 18:06:42 +0800 Subject: [PATCH 109/132] ubifs: Fix AA deadlock when setting xattr for encrypted file hulk inclusion category: bugfix bugzilla: 187250, https://gitee.com/openeuler/kernel/issues/I5HSMS CVE: NA ------------------------------------------------- Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. 
[ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/ubifs/dir.c | 25 ++++++++++++++----------- fs/ubifs/ubifs.h | 2 +- fs/ubifs/xattr.c | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f5777f59a101..acd7e83a35e4 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and * initializes it. Returns new inode in case of success and an error code in * case of failure. 
*/ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; - err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } } switch (mode & S_IFMT) { @@ -308,7 +311,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -369,7 +372,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -461,7 +464,7 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, return err; } - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -1002,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1089,7 +1092,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1171,7 +1174,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 9a4a3191ed07..15cfee7c1125 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1996,7 +1996,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 258b97939ba2..81efe638acd5 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; -- Gitee From 156e60a42ffffe594f7cf38df51ee07b7f0fc99c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:06:43 +0800 Subject: [PATCH 110/132] Revert "mm/dynamic_hugetlb: disable dynamic hugetlb if hugetlb_vmemmap is enabled" hulk 
inclusion category: bugfix bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA -------------------------------- Will disable hugetlb_vmemmap when dynamic hugetlb is enabled in later patch. This reverts commit c7ae7c0dd37aa112f9cb3e23878f4c5fe67f86bc. Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/dynamic_hugetlb.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index eb9b528b73de..f8ebc8ab7d60 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1133,17 +1133,6 @@ void __init dynamic_hugetlb_init(void) if (!enable_dhugetlb) return; - /* - * The dynamic_hugetlb feature need to split and merge pages frequently. - * hugetlb_vmemmap will affects the perforemance of page split and merge. - * If want to use dynamic hugetlb, please disable hugetlb_vmemmap. - */ - if (hugetlb_free_vmemmap_enabled) { - enable_dhugetlb = false; - pr_info("Please set hugetlb_free_vmemmap=off if want to enable dynamic hugetlb\n"); - return; - } - count = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT); size = sizeof(struct dhugetlb_pagelist) + count * sizeof(struct dhugetlb_pool *); dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL); -- Gitee From 2706bcc0550ce47604e18a68f6bd8d0f78f0136d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Jul 2022 18:06:44 +0800 Subject: [PATCH 111/132] jump_label: Provide CONFIG-driven build state defaults mainline inclusion from mainline-v5.13-rc1 commit 0d66ccc1627013c95f1e7ef10b95b8451cd7834e category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0d66ccc1627013c95f1e7ef10b95b8451cd7834e -------------------------------- As shown in the comment in jump_label.h, choosing the initial state of static branches changes the assembly layout. If the condition is expected to be likely it's inline, and if unlikely it is out of line via a jump. A few places in the kernel use (or could be using) a CONFIG to choose the default state, which would give a small performance benefit to their compile-time declared default. Provide the infrastructure to do this. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210401232347.2791257-2-keescook@chromium.org Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/jump_label.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7e1dce5670fc..d01c23025af0 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -388,6 +388,21 @@ struct static_key_false { [0 ... 
(count) - 1] = STATIC_KEY_FALSE_INIT, \ } +#define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) +#define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) +#define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) + +#define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) +#define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) +#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) + +#define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) +#define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) +#define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) + extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ @@ -488,6 +503,10 @@ extern bool ____wrong_branch_error(void); #endif /* CONFIG_JUMP_LABEL */ +#define static_branch_maybe(config, x) \ + (IS_ENABLED(config) ? static_branch_likely(x) \ + : static_branch_unlikely(x)) + /* * Advanced usage; refcount, branch is enabled when: count != 0 */ -- Gitee From 2722aa58c0fe3bd8c73b80786b1120e317c41365 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Jul 2022 18:06:45 +0800 Subject: [PATCH 112/132] mm: make compound_head const-preserving mainline inclusion from mainline-v5.14-rc1 commit 0f2317e34e2c7b97efd4600122115410795ebeea category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0f2317e34e2c7b97efd4600122115410795ebeea -------------------------------- If you pass a const pointer to compound_head(), you get a const pointer back; if you pass a mutable pointer, you get a mutable pointer back. Also remove an unnecessary forward definition of struct page; we're about to dereference page->compound_head, so it must already have been defined. 
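The const-preservation comes from the typeof() cast in the new macro: _compound_head() itself takes a const struct page pointer and returns a plain unsigned long, and compound_head() casts that value back to whatever pointer type the caller handed in. The same trick in a stand-alone, user-space form (demo names only, not the kernel's types or helpers):

	struct page { unsigned long compound_head; };

	/* helper accepts const and returns the address as an integer */
	static inline unsigned long _demo_head(const struct page *page)
	{
		return (unsigned long)page;	/* stand-in for the real head lookup */
	}

	/* typeof(page) re-applies the caller's own pointer type, const included */
	#define demo_head(page) ((typeof(page))_demo_head(page))

	int main(void)
	{
		struct page p = { 0 };
		const struct page *cp = &p;
		struct page *mp = &p;

		const struct page *ch = demo_head(cp);	/* const in  -> const out */
		struct page *mh = demo_head(mp);	/* mutable in -> mutable out */

		return (ch == cp && mh == mp) ? 0 : 1;
	}

Assigning the result of a const call to a non-const pointer now draws the usual "discards const qualifier" warning, which is the behaviour described above.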
Link: https://lkml.kernel.org/r/20210416231531.2521383-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/page-flags.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 65e1cbe1d1ce..b40b4e0ada31 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,17 +194,17 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -struct page; /* forward declaration */ - -static inline struct page *compound_head(struct page *page) +static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) - return (struct page *) (head - 1); - return page; + return head - 1; + return (unsigned long)page; } +#define compound_head(page) ((typeof(page))_compound_head(page)) + static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; -- Gitee From fe0aed0230fbd8246fb028d126548fa69b4f9bf4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:46 +0800 Subject: [PATCH 113/132] mm: hugetlb: free the 2nd vmemmap page associated with each HugeTLB page mainline inclusion from mainline-v5.18-rc1 commit e7d324850bfcb30df563d144c0363cc44595277d category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e7d324850bfcb30df563d144c0363cc44595277d -------------------------------- Patch series "Free the 2nd vmemmap page associated with each HugeTLB page", v7. This series can minimize the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB. It is a nice gain. Comments and reviews are welcome. Thanks. The main implementation and details can refer to the commit log of patch 1. In this series, I have changed the following four helpers, the following table shows the impact of the overhead of those helpers. +------------------+-----------------------+ | APIs | head page | tail page | +------------------+-----------+-----------+ | PageHead() | Y | N | +------------------+-----------+-----------+ | PageTail() | Y | N | +------------------+-----------+-----------+ | PageCompound() | N | N | +------------------+-----------+-----------+ | compound_head() | Y | N | +------------------+-----------+-----------+ Y: Overhead is increased. N: Overhead is _NOT_ increased. It shows that the overhead of those helpers on a tail page don't change between "hugetlb_free_vmemmap=on" and "hugetlb_free_vmemmap=off". But the overhead on a head page will be increased when "hugetlb_free_vmemmap=on" (except PageCompound()). So I believe that Matthew Wilcox's folio series will help with this. The users of PageHead() and PageTail() are much less than compound_head() and most users of PageTail() are VM_BUG_ON(), so I have done some tests about the overhead of compound_head() on head pages. I have tested the overhead of calling compound_head() on a head page, which is 2.11ns (Measure the call time of 10 million times compound_head(), and then average). 
For a head page whose address is not aligned with PAGE_SIZE or a non-compound page, the overhead of compound_head() is 2.54ns which is increased by 20%. For a head page whose address is aligned with PAGE_SIZE, the overhead of compound_head() is 2.97ns which is increased by 40%. Most pages are the former. I do not think the overhead is significant since the overhead of compound_head() itself is low. This patch (of 5): This patch minimizes the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB (2MB type). After the feature of "Free sonme vmemmap pages of HugeTLB page" is enabled, the mapping of the vmemmap addresses associated with a 2MB HugeTLB page becomes the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | -------------> | 1 | | | +-----------+ +-----------+ | | | 2 | ----------------^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | 3 | ------------------+ | | | | | | +-----------+ | | | | | | | 4 | --------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | ----------------------+ | | | | +-----------+ | | | | | 6 | ------------------------+ | | | +-----------+ | | | | 7 | --------------------------+ | | +-----------+ | | | | | | +-----------+ As we can see, the 2nd vmemmap page frame (indexed by 1) is reused and remaped. However, the 2nd vmemmap page frame is also can be freed to the buddy allocator, then we can change the mapping from the figure above to the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | | 2 | -----------------+ | | | | | | | +-----------+ | | | | | | | | 3 | -------------------+ | | | | | | +-----------+ | | | | | | | 4 | ---------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | -----------------------+ | | | | +-----------+ | | | | | 6 | -------------------------+ | | | +-----------+ | | | | 7 | ---------------------------+ | | +-----------+ | | | | | | +-----------+ After we do this, all tail vmemmap pages (1-7) are mapped to the head vmemmap page frame (0). In other words, there are more than one page struct with PG_head associated with each HugeTLB page. We __know__ that there is only one head page struct, the tail page structs with PG_head are fake head page structs. We need an approach to distinguish between those two different types of page structs so that compound_head(), PageHead() and PageTail() can work properly if the parameter is the tail page struct but with PG_head. The following code snippet describes how to distinguish between real and fake head page struct. if (test_bit(PG_head, &page->flags)) { unsigned long head = READ_ONCE(page[1].compound_head); if (head & 1) { if (head == (unsigned long)page + 1) ==> head page struct else ==> tail page struct } else ==> head page struct } We can safely access the field of the @page[1] with PG_head because the @page is a compound page composed with at least two contiguous pages. 
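A purely illustrative walk-through with made-up numbers (assuming the common 64-byte struct page and a vmemmap base address B for this HugeTLB): the struct pages stored in vmemmap page 1 start at virtual address B + 4096. After the remapping above, a read at B + 4096 returns the contents of struct page 0, so PG_head appears set there, i.e. it is a fake head. Its neighbour at B + 4096 + 64 returns the contents of struct page 1, whose compound_head is B | 1. Since B + 1 != (B + 4096) + 1, the snippet above classifies B + 4096 as a tail page struct and compound_head() on it resolves to B. For the real head at B, page[1].compound_head is again B + 1, which equals (unsigned long)page + 1, so it is correctly reported as a head page struct.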
[songmuchun@bytedance.com: restore lost comment changes] Link: https://lkml.kernel.org/r/20211101031651.75851-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20211101031651.75851-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Mike Kravetz Cc: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Chen Huang Cc: Bodeddula Balasubramaniam Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Xiongchun Duan Cc: Fam Zheng Cc: Qi Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: include/linux/page-flags.h Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- .../admin-guide/kernel-parameters.txt | 2 +- include/linux/page-flags.h | 73 ++++++++++++++++++- mm/hugetlb_vmemmap.c | 62 +++++++++------- mm/sparse-vmemmap.c | 21 ++++++ 4 files changed, 125 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b04cf8fbab4..e0957b73f63d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1596,7 +1596,7 @@ [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more - memory (6 * PAGE_SIZE for each 2MB hugetlb page). + memory (7 * PAGE_SIZE for each 2MB hugetlb page). Format: { on | off (default) } on: enable the feature diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b40b4e0ada31..7b31a81be7e1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,25 +194,82 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +extern bool hugetlb_free_vmemmap_enabled; + +/* + * If the feature of freeing some vmemmap pages associated with each HugeTLB + * page is enabled, the head vmemmap page frame is reused and all of the tail + * vmemmap addresses map to the head vmemmap page frame (furture details can + * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other + * words, there are more than one page struct with PG_head associated with each + * HugeTLB page. We __know__ that there is only one head page struct, the tail + * page structs with PG_head are fake head page structs. We need an approach + * to distinguish between those two different types of page structs so that + * compound_head() can return the real head page struct when the parameter is + * the tail page struct but with PG_head. + * + * The page_fixed_fake_head() returns the real head page struct if the @page is + * fake page head, otherwise, returns @page which can either be a true page + * head or tail. + */ +static __always_inline const struct page *page_fixed_fake_head(const struct page *page) +{ + if (!hugetlb_free_vmemmap_enabled) + return page; + + /* + * Only addresses aligned with PAGE_SIZE of struct page may be fake head + * struct page. The alignment check aims to avoid access the fields ( + * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * cold cacheline in some cases. + */ + if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && + test_bit(PG_head, &page->flags)) { + /* + * We can safely access the field of the @page[1] with PG_head + * because the @page is a compound page composed with at least + * two contiguous pages. 
+ */ + unsigned long head = READ_ONCE(page[1].compound_head); + + if (likely(head & 1)) + return (const struct page *)(head - 1); + } + return page; +} +#else +static inline const struct page *page_fixed_fake_head(const struct page *page) +{ + return page; +} +#endif + +static __always_inline int page_is_fake_head(struct page *page) +{ + return page_fixed_fake_head(page) != page; +} + static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; - return (unsigned long)page; + return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) static __always_inline int PageTail(struct page *page) { - return READ_ONCE(page->compound_head) & 1; + return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(struct page *page) { - return test_bit(PG_head, &page->flags) || PageTail(page); + return test_bit(PG_head, &page->flags) || + READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l @@ -600,7 +657,15 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); } -__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +static __always_inline int PageHead(struct page *page) +{ + PF_POISONED_CHECK(page); + return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); +} + +__SETPAGEFLAG(Head, head, PF_ANY) +__CLEARPAGEFLAG(Head, head, PF_ANY) +CLEARPAGEFLAG(Head, head, PF_ANY) static __always_inline void set_compound_head(struct page *page, struct page *head) { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5..4977f5a520c2 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. 
* @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,7 +166,13 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt @@ -175,19 +181,21 @@ /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. 
*/ -#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +bool hugetlb_free_vmemmap_enabled __read_mostly = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); static int __init early_hugetlb_free_vmemmap_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ - if ((!is_power_of_2(sizeof(struct page)))) { + if (!is_power_of_2(sizeof(struct page))) { pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); return 0; } @@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) ClearHPageVmemmapOptimized(head); @@ -282,9 +289,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* - * The head page and the first tail page are not to be freed to buddy - * allocator, the other pages will map to the first tail page, so they - * can be freed. + * The head page is not to be freed to buddy allocator, the other tail + * pages will map to the head page, so they can be freed. * * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true * on some architectures (e.g. aarch64). See Documentation/arm64/ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 396a49462894..39bdbf0b28dd 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -245,6 +245,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, entry); } +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. 
+ */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { @@ -258,6 +278,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } -- Gitee From 9b4aa808afd36ebad9567413eddf5a0db8b8441f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:47 +0800 Subject: [PATCH 114/132] mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key mainline inclusion from mainline-v5.18-rc1 commit a6b40850c442bf996e729e1d441d3dbc37cea171 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a6b40850c442bf996e729e1d441d3dbc37cea171 -------------------------------- The page_fixed_fake_head() is used throughout memory management and the conditional check requires checking a global variable, although the overhead of this check may be small, it increases when the memory cache comes under pressure. Also, the global variable will not be modified after system boot, so it is very appropriate to use static key machanism. Link: https://lkml.kernel.org/r/20211101031651.75851-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/hugetlb.h | 6 ------ include/linux/page-flags.h | 16 ++++++++++++++-- mm/hugetlb_vmemmap.c | 12 ++++++------ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 634630ebc8a7..fd1dc8d29436 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1103,12 +1103,6 @@ static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, } #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; -#else -#define hugetlb_free_vmemmap_enabled false -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7b31a81be7e1..81061122af68 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -195,7 +195,14 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); + +static __always_inline bool hugetlb_free_vmemmap_enabled(void) +{ + return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + &hugetlb_free_vmemmap_enabled_key); +} /* * If the feature of freeing some vmemmap pages associated with each HugeTLB @@ -215,7 +222,7 @@ extern bool hugetlb_free_vmemmap_enabled; */ static __always_inline const 
struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return page; /* @@ -243,6 +250,11 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool hugetlb_free_vmemmap_enabled(void) +{ + return false; +} #endif static __always_inline int page_is_fake_head(struct page *page) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4977f5a520c2..791626983c2e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,9 +188,9 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled __read_mostly = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); static int __init early_hugetlb_free_vmemmap_param(char *buf) { @@ -204,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; + static_branch_enable(&hugetlb_free_vmemmap_enabled_key); else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); else return -EINVAL; @@ -284,7 +284,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; -- Gitee From b31b84bf5c6d282b2e3bdcf8dc6df4f395efcb19 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:48 +0800 Subject: [PATCH 115/132] mm: sparsemem: use page table lock to protect kernel pmd operations mainline inclusion from mainline-v5.18-rc1 commit d8d55f5616cf3b900a23a72dd24e7b07211e7859 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8d55f5616cf3b900a23a72dd24e7b07211e7859 -------------------------------- The init_mm.page_table_lock is used to protect kernel page tables, we can use it to serialize splitting vmemmap PMD mappings instead of mmap write lock, which can increase the concurrency of vmemmap_remap_free(). Actually, It increase the concurrency between allocations of HugeTLB pages. But it is not the only benefit. There are a lot of users of mmap read lock of init_mm. The mmap write lock is holding through vmemmap_remap_free(), removing mmap write lock usage to make it does not affect other users of mmap read lock. It is not making anything worse and always a win to move. Now the kernel page table walker does not hold the page_table_lock when walking pmd entries. There may be consistency issue of a pmd entry, because pmd entry might change from a huge pmd entry to a PTE page table. There is only one user of kernel page table walker, namely ptdump. The ptdump already considers the consistency, which use a local variable to cache the value of pmd entry. But we also need to update ->action to ACTION_CONTINUE to make sure the walker does not walk every pte entry again when concurrent thread has split the huge pmd. 
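The core of the new serialization can be sketched as follows (simplified from the hunk below, for readability only):

	/* Re-check the PMD under init_mm.page_table_lock and back out if
	 * another thread already split it while we were allocating pgtable. */
	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/* Make the new PTEs visible before the PMD. */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		/* Lost the race: the PMD was already split, drop our table. */
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);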
Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/sparse-vmemmap.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/ptdump.c | 16 +++++++++++---- mm/sparse-vmemmap.c | 47 ++++++++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/mm/ptdump.c b/mm/ptdump.c index 93f2f63dc52d..43661863096b 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -39,8 +39,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val)); - if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -59,8 +61,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val)); - if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -79,8 +83,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val)); - if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -98,8 +104,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 39bdbf0b28dd..eef9053f948f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,8 +53,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in __pte_alloc(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in __pte_alloc(). 
*/ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); return 0; } +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* -- Gitee From 7d9a273f5bc8369c07ac2ba082984cb1d1e9b182 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:49 +0800 Subject: [PATCH 116/132] selftests: vm: add a hugetlb test case mainline inclusion from mainline-v5.18-rc1 commit b147c89cd429321a59147368378c8aba17c8480f category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b147c89cd429321a59147368378c8aba17c8480f -------------------------------- Since the head vmemmap page frame associated with each HugeTLB page is reused, we should hide the PG_head flag of tail struct page from the user. Add a tese case to check whether it is work properly. The test steps are as follows. 1) alloc 2MB hugeTLB 2) get each page frame 3) apply those APIs in each page frame 4) Those APIs work completely the same as before. Reading the flags of a page by /proc/kpageflags is done in stable_page_flags(), which has invoked PageHead(), PageTail(), PageCompound() and compound_head(). If those APIs work properly, the head page must have 15 and 17 bits set. And tail pages must have 16 and 17 bits set but 15 bit unset. Those flags are checked in check_page_flags(). 
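In terms of the raw /proc/kpageflags values, the masks the test checks work out to (1UL << 15) | (1UL << 17) = 0x28000 for the head page and (1UL << 16) | (1UL << 17) = 0x30000 for every tail page; the test only requires that these bits are set (and that bit 15 is clear on tails), other flag bits may be set as well.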
Link: https://lkml.kernel.org/r/20211101031651.75851-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: tools/testing/selftests/vm/Makefile tools/testing/selftests/vm/run_vmtests.sh Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/hugepage-vmemmap.c | 144 ++++++++++++++++++ tools/testing/selftests/vm/run_vmtests | 11 ++ 4 files changed, 157 insertions(+) create mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index b3a183c36cb5..905dc4efa879 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap hugepage-shm +hugepage-vmemmap khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index b150cc837177..549761bc7193 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -27,6 +27,7 @@ TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c new file mode 100644 index 000000000000..557bdbd4f87e --- /dev/null +++ b/tools/testing/selftests/vm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include +#include +#include +#include +#include + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
+ */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. */ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index d578ad831813..949c71fe5951 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -108,6 +108,17 @@ else echo "[PASS]" fi +echo "------------------------" +echo "running hugepage-vmemmap" +echo "------------------------" +./hugepage-vmemmap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." 
-- Gitee From 22f969cffb0be543692d6942f41cf3735fb66790 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:50 +0800 Subject: [PATCH 117/132] mm: sparsemem: move vmemmap related to HugeTLB to CONFIG_HUGETLB_PAGE_FREE_VMEMMAP mainline inclusion from mainline-v5.18-rc1 commit e54084173487804f5e2f23facf107fd9336e637e category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e54084173487804f5e2f23facf107fd9336e637e -------------------------------- The vmemmap_remap_free/alloc are relevant to HugeTLB, so move those functiongs to the scope of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP. Link: https://lkml.kernel.org/r/20211101031651.75851-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/mm.h | 2 ++ mm/sparse-vmemmap.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 25891b581bf4..efa3972a23bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,10 +3104,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, gfp_t gfp_mask); +#endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index eef9053f948f..e5ed2680ec57 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include #include +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -419,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map -- Gitee From 0a5d68bb279164b7ec023203dc9f0d6a59fc17f1 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:06:51 +0800 Subject: [PATCH 118/132] Revert "arm64: mm: hugetlb: add support for free vmemmap pages of HugeTLB" hulk inclusion category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO -------------------------------- There is a formal solution to support hugetlb vmemmap feature on arm64. This reverts commit 5838d235a395e221968e1f024bfaf8650278a522. 
Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/Kconfig b/fs/Kconfig index 6e723c90a506..37a8895a3254 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -239,7 +239,7 @@ config HUGETLB_PAGE config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 || ARM64 + depends on X86_64 depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON -- Gitee From e457e43c459e8ecc1679f5f356f11c84ada79bbf Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:52 +0800 Subject: [PATCH 119/132] mm: hugetlb_vmemmap: introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP mainline inclusion from mainline-v5.19-rc1 commit 2e4ec02bbcc05b8905d65c763ebde6bc85508e90 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2e4ec02bbcc05b8905d65c763ebde6bc85508e90 -------------------------------- The feature of minimizing overhead of struct page associated with each HugeTLB page is implemented on x86_64, however, the infrastructure of this feature is already there, we could easily enable it for other architectures. Introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP for other architectures to be easily enabled. Just select this config if they want to enable this feature. Link: https://lkml.kernel.org/r/20220331065640.5777-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Suggested-by: Andrew Morton Reviewed-by: Barry Song Tested-by: Barry Song Reviewed-by: Anshuman Khandual Cc: Bodeddula Balasubramaniam Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Fam Zheng Cc: James Morse Cc: Mark Rutland Cc: Mike Kravetz Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/x86/Kconfig | 1 + fs/Kconfig | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d17396ef4323..661e05e1f762 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,6 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/fs/Kconfig b/fs/Kconfig index 37a8895a3254..b60a7614cb16 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -237,9 +237,17 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. 
+# +config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + bool + config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON -- Gitee From de126e063c1e89c698fe2304d0e590f69157ac66 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:53 +0800 Subject: [PATCH 120/132] arm64: mm: hugetlb: enable HUGETLB_PAGE_FREE_VMEMMAP for arm64 mainline inclusion from mainline-v5.19-rc1 commit 1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 -------------------------------- The feature of minimizing overhead of struct page associated with each HugeTLB page aims to free its vmemmap pages (used as struct page) to save memory, where is ~14GB/16GB per 1TB HugeTLB pages (2MB/1GB type). In short, when a HugeTLB page is allocated or freed, the vmemmap array representing the range associated with the page will need to be remapped. When a page is allocated, vmemmap pages are freed after remapping. When a page is freed, previously discarded vmemmap pages must be allocated before remapping. More implementations and details can be found here [1]. The infrastructure of freeing vmemmap pages associated with each HugeTLB page is already there, we can easily enable HUGETLB_PAGE_FREE_VMEMMAP for arm64, the only thing to be fixed is flush_dcache_page() . flush_dcache_page() need to be adapted to operate on the head page's flags since the tail vmemmap pages are mapped with read-only after the feature is enabled (clear operation is not permitted). There was some discussions about this in the thread [2], but there was no conclusion in the end. And I copied the concern proposed by Anshuman to here and explain why those concern is superfluous. It is safe to enable it for x86_64 as well as arm64. 1st concern: ''' But what happens when a hot remove section's vmemmap area (which is being teared down) is nearby another vmemmap area which is either created or being destroyed for HugeTLB alloc/free purpose. As you mentioned HugeTLB pages inside the hot remove section might be safe. But what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ? Massive HugeTLB alloc /use/free test cycle using memory just adjacent to a memory hotplug area, which is always added and removed periodically, should be able to expose this problem. ''' Answer: At the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is no race between memory hot remove and free_huge_page_vmemmap(). Therefore, HugeTLB pages inside the hot remove section is safe. Let's talk your question "what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ?", the question is not established. The minimal granularity size of hotplug memory 128MB (on arm64, 4k base page), any HugeTLB smaller than 128MB is within a section, then, there is no share PTE page tables between HugeTLB in this section and ones in other sections and a HugeTLB page could not cross two sections. In this case, the section cannot be freed. Any HugeTLB bigger than 128MB (section size) whose vmemmap pages is an integer multiple of 2MB (PMD-mapped). 
As long as: 1) HugeTLBs are naturally aligned, power-of-two sizes 2) The HugeTLB size >= the section size 3) The HugeTLB size >= the vmemmap leaf mapping size Then a HugeTLB will not share any leaf page table entries with *anything else*, but will share intermediate entries. In this case, at the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is also no race between memory hot remove and free_huge_page_vmemmap(). 2nd concern: ''' differently, not sure if ptdump would require any synchronization. Dumping an wrong value is probably okay but crashing because a page table entry is being freed after ptdump acquired the pointer is bad. On arm64, ptdump() is protected against hotremove via [get|put]_online_mems(). ''' Answer: The ptdump should be fine since vmemmap_remap_free() only exchanges PTEs or splits the PMD entry (which means allocating a PTE page table). Both operations do not free any page tables (PTE), so ptdump cannot run into a UAF on any page tables. The worst case is just dumping an wrong value. [1] https://lore.kernel.org/all/20210510030027.56044-1-songmuchun@bytedance.com/ [2] https://lore.kernel.org/all/20210518091826.36937-1-songmuchun@bytedance.com/ [songmuchun@bytedance.com: restructure the code comment inside flush_dcache_page()] Link: https://lkml.kernel.org/r/20220414072646.21910-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220331065640.5777-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Tested-by: Barry Song Cc: Will Deacon Cc: David Hildenbrand Cc: Bodeddula Balasubramaniam Cc: Oscar Salvador Cc: Mike Kravetz Cc: David Rientjes Cc: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Xiongchun Duan Cc: Fam Zheng Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/arm64/Kconfig | 1 + arch/arm64/mm/flush.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e253fdba1249..d9da5c4f91e0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,6 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 145fe60c5e68..2bb6defad92f 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_page(struct page *page) { + /* + * Only the head page's flags of HugeTLB can be cleared since the tail + * vmemmap pages associated with each HugeTLB page are mapped with + * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * details can refer to vmemmap_remap_pte()). Although + * __sync_icache_dcache() only set PG_dcache_clean flag on the head + * page struct, there is more than one page struct with PG_dcache_clean + * associated with the HugeTLB page since the head vmemmap page frame + * is reused (more details can refer to the comments above + * page_fixed_fake_head()). 
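As a concrete illustration of the size reasoning above (made-up but representative numbers, assuming 4K base pages and a 64-byte struct page): a naturally aligned 1GB HugeTLB owns 262144 struct pages, i.e. a 16MB, 16MB-aligned slice of vmemmap, which covers exactly eight 2MB PMD leaf mappings. It therefore shares no leaf page table entries with anything outside the HugeTLB, only intermediate entries, so memory hot remove cannot race with free_huge_page_vmemmap() on shared leaf mappings.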
+ */ + if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + page = compound_head(page); + if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } -- Gitee From b8dfc94bd51821c5467f622ffec51c5bfd37c342 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:54 +0800 Subject: [PATCH 121/132] mm: hugetlb_vmemmap: cleanup hugetlb_vmemmap related functions mainline inclusion from mainline-v5.19-rc1 commit 5981611d0a006472d367d7a8e6ead8afaecf17c7 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5981611d0a006472d367d7a8e6ead8afaecf17c7 -------------------------------- Patch series "cleanup hugetlb_vmemmap". The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize" is more clear. In this series, cheanup related codes to make it more clear and expressive. This is suggested by David. This patch (of 3): The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". And some function names are prefixed with "huge_page" instead of "hugetlb", it is easily to be confused with THP. In this patch, cheanup related functions to make code more clear and expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220404074652.68024-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 8 ++++---- mm/hugetlb_vmemmap.c | 42 ++++++++++++++++++++--------------------- mm/hugetlb_vmemmap.h | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fd1dc8d29436..218fc150d7f8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -593,7 +593,7 @@ struct hstate { unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - unsigned int nr_free_vmemmap_pages; + unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c815386ecc..c5168c7f282a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1448,7 +1448,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (alloc_huge_page_vmemmap(h, page)) { + if (hugetlb_vmemmap_alloc(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1519,7 +1519,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (free_vmemmap_pages_per_hpage(h)) + if (hugetlb_optimize_vmemmap_pages(h)) flush_work(&free_hpage_work); } @@ -1642,7 +1642,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - free_huge_page_vmemmap(h, page); + hugetlb_vmemmap_free(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ 
-2066,7 +2066,7 @@ int dissolve_free_huge_page(struct page *page) * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = alloc_huge_page_vmemmap(h, head); + rc = hugetlb_vmemmap_alloc(h, head); if (!rc) { /* * Move PageHWPoison flag from head page to the raw diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 791626983c2e..91b79b9d9e25 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -192,7 +192,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, hugetlb_free_vmemmap_enabled_key); EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); -static int __init early_hugetlb_free_vmemmap_param(char *buf) +static int __init hugetlb_vmemmap_early_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ if (!is_power_of_2(sizeof(struct page))) { @@ -212,29 +212,26 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return 0; } -early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); - -static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) -{ - return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; -} +early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* * Previously discarded vmemmap pages will be allocated and remapping * after this function returns zero. */ -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { int ret; unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* * The pages which the vmemmap virtual address range [@vmemmap_addr, * @vmemmap_end) are mapped to are freed to the buddy allocator, and @@ -250,17 +247,18 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) return ret; } -void free_huge_page_vmemmap(struct hstate *h, struct page *head) +void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; - if (!free_vmemmap_pages_per_hpage(h)) + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + if (!vmemmap_pages) return; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) @@ -297,8 +295,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) * hugetlbpage.rst for more details. 
*/ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, - h->name); + pr_info("can optimize %d vmemmap pages for %s\n", + h->optimize_vmemmap_pages, h->name); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index cb2bef8f9e73..9760537849b5 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -11,25 +11,25 @@ #include #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); -void free_huge_page_vmemmap(struct hstate *h, struct page *head); +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); +void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); /* - * How many vmemmap pages associated with a HugeTLB page that can be freed - * to the buddy allocator. + * How many vmemmap pages associated with a HugeTLB page that can be + * optimized and freed to the buddy allocator. */ -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { - return h->nr_free_vmemmap_pages; + return h->optimize_vmemmap_pages; } #else -static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { return 0; } -static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { } @@ -37,7 +37,7 @@ static inline void hugetlb_vmemmap_init(struct hstate *h) { } -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -- Gitee From 142ca734a4a1aab74f8c37818b634f996ef49e08 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:55 +0800 Subject: [PATCH 122/132] mm: hugetlb_vmemmap: cleanup hugetlb_free_vmemmap_enabled* mainline inclusion from mainline-v5.19-rc1 commit f10f1442c309ccef7a80ba3dc4abde0978e86fb4 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f10f1442c309ccef7a80ba3dc4abde0978e86fb4 -------------------------------- The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup the static key and hugetlb_free_vmemmap_enabled() to make code more expressive. 
Link: https://lkml.kernel.org/r/20220404074652.68024-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Conflicts: mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/arm64/mm/flush.c | 2 +- include/linux/page-flags.h | 12 ++++++------ mm/hugetlb_vmemmap.c | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 2bb6defad92f..892e53e9c788 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -80,7 +80,7 @@ void flush_dcache_page(struct page *page) * is reused (more details can refer to the comments above * page_fixed_fake_head()). */ - if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) page = compound_head(page); if (test_bit(PG_dcache_clean, &page->flags)) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 81061122af68..d1cf17a71a12 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -196,16 +196,16 @@ enum pageflags { #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); -static __always_inline bool hugetlb_free_vmemmap_enabled(void) +static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - &hugetlb_free_vmemmap_enabled_key); + &hugetlb_optimize_vmemmap_key); } /* - * If the feature of freeing some vmemmap pages associated with each HugeTLB + * If the feature of optimizing vmemmap pages associated with each HugeTLB * page is enabled, the head vmemmap page frame is reused and all of the tail * vmemmap addresses map to the head vmemmap page frame (furture details can * refer to the figure at the head of the mm/hugetlb_vmemmap.c). 
In other @@ -222,7 +222,7 @@ static __always_inline bool hugetlb_free_vmemmap_enabled(void) */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return page; /* @@ -251,7 +251,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) return page; } -static inline bool hugetlb_free_vmemmap_enabled(void) +static inline bool hugetlb_optimize_vmemmap_enabled(void) { return false; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 91b79b9d9e25..f25294973398 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -189,8 +189,8 @@ #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { @@ -204,9 +204,9 @@ static int __init hugetlb_vmemmap_early_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - static_branch_enable(&hugetlb_free_vmemmap_enabled_key); + static_branch_enable(&hugetlb_optimize_vmemmap_key); else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_free_vmemmap_enabled_key); + static_branch_disable(&hugetlb_optimize_vmemmap_key); else return -EINVAL; @@ -282,7 +282,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; -- Gitee From f1f5face0c06f0526c21122eb601f4e5482d612b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:56 +0800 Subject: [PATCH 123/132] mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP* mainline inclusion from mainline-v5.19-rc1 commit 47010c040dec8af6347ec6259104fc13f7e7e30a category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47010c040dec8af6347ec6259104fc13f7e7e30a -------------------------------- The word "free" is not expressive enough to describe the feature of optimizing vmemmap pages associated with each HugeTLB, so rename this keyword to "optimize". In this patch, clean up the configs to make the code more expressive.
Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: David Hildenbrand Signed-off-by: Andrew Morton Conflicts: arch/arm64/configs/openeuler_defconfig arch/x86/configs/openeuler_defconfig Documentation/admin-guide/kernel-parameters.txt include/linux/hugetlb.h mm/Makefile Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/kernel-parameters.txt | 5 ++++- Documentation/admin-guide/mm/hugetlbpage.rst | 2 +- arch/arm64/Kconfig | 2 +- arch/arm64/configs/openeuler_defconfig | 4 ++-- arch/arm64/mm/flush.c | 2 +- arch/x86/Kconfig | 2 +- arch/x86/configs/openeuler_defconfig | 4 ++-- arch/x86/mm/init_64.c | 2 +- fs/Kconfig | 16 ++++++++-------- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 2 +- include/linux/page-flags.h | 6 +++--- mm/Makefile | 2 +- mm/hugetlb_vmemmap.c | 4 ++-- mm/hugetlb_vmemmap.h | 4 ++-- mm/sparse-vmemmap.c | 4 ++-- 16 files changed, 33 insertions(+), 30 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e0957b73f63d..86a7ea4a9964 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1593,7 +1593,7 @@ Format: size[KMG] hugetlb_free_vmemmap= - [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + [KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). @@ -1602,6 +1602,9 @@ on: enable the feature off: disable the feature + Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, + the default is on. + hugevmalloc [KNL,PPC,ARM64,X86] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC Format: { on | off } Default set by CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED. diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index d70828c07658..0f8acc4a6cf0 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -164,7 +164,7 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. hugetlb_free_vmemmap - When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing + When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing unused vmemmap pages associated with each HugeTLB page. 
When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d9da5c4f91e0..c4f6c80ea976 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,7 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d246fd508ef6..593ac67497e3 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6281,8 +6281,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 892e53e9c788..c7678e7df53a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -72,7 +72,7 @@ void flush_dcache_page(struct page *page) /* * Only the head page's flags of HugeTLB can be cleared since the tail * vmemmap pages associated with each HugeTLB page are mapped with - * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more * details can refer to vmemmap_remap_pte()). 
Although * __sync_icache_dcache() only set PG_dcache_clean flag on the head * page struct, there is more than one page struct with PG_dcache_clean diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 661e05e1f762..f39cfb5a6535 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,7 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 3eac70518e6f..f013c7b95881 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7370,8 +7370,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_DYNAMIC_HUGETLB=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1eed5eaee41f..b1d5c05aeca8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1225,7 +1225,7 @@ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { -#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i; for_each_online_node(i) diff --git a/fs/Kconfig b/fs/Kconfig index b60a7614cb16..aa097ca64ef6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -242,22 +242,22 @@ config HUGETLB_PAGE # to enable the feature of minimizing overhead of struct page associated with # each HugeTLB page. # -config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP bool -config HUGETLB_PAGE_FREE_VMEMMAP +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. 
config DYNAMIC_HUGETLB diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 218fc150d7f8..0dfe08439095 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -592,7 +592,7 @@ struct hstate { unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB diff --git a/include/linux/mm.h b/include/linux/mm.h index efa3972a23bf..1ae73cc4b806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,7 +3104,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d1cf17a71a12..26b36cac9307 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,13 +194,13 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { - return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, &hugetlb_optimize_vmemmap_key); } diff --git a/mm/Makefile b/mm/Makefile index d2a6a786f915..e83233177c7a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o -obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_DYNAMIC_HUGETLB) += dynamic_hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f25294973398..2655434a946b 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,7 +188,7 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); @@ -276,7 +276,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) /* * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. 
*/ BUILD_BUG_ON(__NR_USED_SUBPAGE >= diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 9760537849b5..109b0a53b6fe 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,7 +10,7 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); @@ -41,5 +41,5 @@ static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index e5ed2680ec57..5b40a7473dc8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,7 +34,7 @@ #include #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -420,7 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map -- Gitee From 301b00fff3e70b91a9bc47af2cabce76f48cdb75 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Thu, 28 Jul 2022 18:06:57 +0800 Subject: [PATCH 124/132] sysctl: add a new register_sysctl_init() interface mainline inclusion from mainline-v5.17-rc1 commit 3ddd9a808cee7284931312f2f3e854c9617f44b2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ddd9a808cee7284931312f2f3e854c9617f44b2 -------------------------------- Patch series "sysctl: first set of kernel/sysctl cleanups", v2. Finally had time to respin the series of the work we had started last year on cleaning up the kernel/sysct.c kitchen sink. People keeps stuffing their sysctls in that file and this creates a maintenance burden. So this effort is aimed at placing sysctls where they actually belong. I'm going to split patches up into series as there is quite a bit of work. This first set adds register_sysctl_init() for uses of registerting a sysctl on the init path, adds const where missing to a few places, generalizes common values so to be more easy to share, and starts the move of a few kernel/sysctl.c out where they belong. The majority of rework on v2 in this first patch set is 0-day fixes. Eric Biederman's feedback is later addressed in subsequent patch sets. I'll only post the first two patch sets for now. We can address the rest once the first two patch sets get completely reviewed / Acked. This patch (of 9): The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Today though folks heavily rely on tables on kernel/sysctl.c so they can easily just extend this table with their needed sysctls. In order to help users move their sysctls out we need to provide a helper which can be used during code initialization. 
We special-case the initialization use of register_sysctl() since it *is* safe to fail, given all that sysctls do is provide a dynamic interface to query or modify at runtime an existing variable. So the use case of register_sysctl() on init should *not* stop if the sysctls don't end up getting registered. It would be counter productive to stop boot if a simple sysctl registration failed. Provide a helper for init then, and document the recommended init levels to use for callers of this routine. We will later use this in subsequent patches to start slimming down kernel/sysctl.c tables and moving sysctl registration to the code which actually needs these sysctls. [mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Iurii Zaikin Cc: "Eric W. Biederman" Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: Andy Shevchenko Cc: Sebastian Reichel Cc: Tetsuo Handa Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Qing Wang Cc: Benjamin LaHaise Cc: Al Viro Cc: Jan Kara Cc: Amir Goldstein Cc: Stephen Kitt Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- fs/proc/proc_sysctl.c | 33 +++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 3 +++ 2 files changed, 36 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ffed75f833b7..df435cd91a5b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1380,6 +1381,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl); +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init(). 
+ */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 51298a4f4623..161eba9fd912 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -195,6 +195,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) void do_sysctl_args(void); extern int pwrsw_enabled; -- Gitee From 9e38aa2f71a1de04192e4c55f788f9c36f27d0a0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:58 +0800 Subject: [PATCH 125/132] mm: hugetlb_vmemmap: disable hugetlb_optimize_vmemmap when struct page crosses page boundaries mainline inclusion from mainline-v5.19-rc1 commit 0effdf461c5789be02d40c1868c70cc02ea24627 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0effdf461c5789be02d40c1868c70cc02ea24627 -------------------------------- Patch series "add hugetlb_optimize_vmemmap sysctl", v11. This series aims to add hugetlb_optimize_vmemmap sysctl to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. This patch (of 4): If the size of "struct page" is not a power of two but the feature of minimizing overhead of struct page associated with each HugeTLB is enabled, then the vmemmap pages of HugeTLB will be corrupted after remapping (panic is about to happen in theory). But this only exists when !CONFIG_MEMCG && !CONFIG_SLUB on x86_64. However, it is not a conventional configuration nowadays. So it is not a real-world issue, just the result of a code review. But we cannot prevent anyone from configuring that combination. So hugetlb_optimize_vmemmap should be disabled in this case to fix this issue. Link: https://lkml.kernel.org/r/20220512041142.39501-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220512041142.39501-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/hugetlb_vmemmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 2655434a946b..4b8e59eeb971 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,12 +194,6 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - /* We cannot optimize if a "struct page" crosses page boundaries.
*/ - if (!is_power_of_2(sizeof(struct page))) { - pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); - return 0; - } - if (!buf) return -EINVAL; @@ -285,6 +279,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) if (!hugetlb_optimize_vmemmap_enabled()) return; + if (!is_power_of_2(sizeof(struct page))) { + pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* * The head page is not to be freed to buddy allocator, the other tail -- Gitee From 6233471f1e8239cde7f7e49a4d9545b17cab51ac Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:59 +0800 Subject: [PATCH 126/132] mm: hugetlb_vmemmap: use kstrtobool for hugetlb_vmemmap param parsing mainline inclusion from mainline-v5.19-rc1 commit 9c54c522bb76cbef480722bd44059e2ba8304bd2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9c54c522bb76cbef480722bd44059e2ba8304bd2 -------------------------------- Use kstrtobool rather than open coding "on" and "off" parsing in mm/hugetlb_vmemmap.c, which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220512041142.39501-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/kernel-parameters.txt | 6 +++--- mm/hugetlb_vmemmap.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 86a7ea4a9964..247acf7fc837 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1597,10 +1597,10 @@ enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). - Format: { on | off (default) } + Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) } - on: enable the feature - off: disable the feature + [oO][Nn]/Y/y/1: enable the feature + [oO][Ff]/N/n/0: disable the feature Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4b8e59eeb971..112e74504905 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,15 +194,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - if (!buf) + bool enable; + + if (kstrtobool(buf, &enable)) return -EINVAL; - if (!strcmp(buf, "on")) + if (enable) static_branch_enable(&hugetlb_optimize_vmemmap_key); - else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_optimize_vmemmap_key); else - return -EINVAL; + static_branch_disable(&hugetlb_optimize_vmemmap_key); return 0; } -- Gitee From 6b2239c22b56bddcdefd09fd58b6672f2b1f1566 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:07:00 +0800 Subject: [PATCH 127/132] mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap sysctl mainline inclusion from mainline-v5.19-rc1 commit 78f39084b41d287aedb2ea55f2c1895cfa11d61a category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=78f39084b41d287aedb2ea55f2c1895cfa11d61a -------------------------------- We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and reboot the server to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. However, rebooting usually takes a long time. So add a sysctl to enable or disable the feature at runtime without rebooting. Why do we need this? There are 3 use cases. 1) The feature of minimizing overhead of struct page associated with each HugeTLB is disabled by default without passing "hugetlb_free_vmemmap=on" to the boot cmdline. When we (ByteDance) deliver the servers to the users who want to enable this feature, they have to configure the grub (change boot cmdline) and reboot the servers, whereas rebooting usually takes a long time (we have thousands of servers). It's a very bad experience for the users. So we need an approach to enable this feature after rebooting. This is a use case in our practical environment. 2) Some use cases are that HugeTLB pages are allocated 'on the fly' instead of being pulled from the HugeTLB pool, those workloads would be affected with this feature enabled. Those workloads can be identified by the characteristic that they never explicitly allocate huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. We can confirm it is a real use case from the commit 099730d67417. For those workloads, the page fault time could be ~2x slower than before. We suspect those users want to disable this feature if the system has enabled this before and they don't think the memory savings benefit is enough to make up for the performance drop. 3) If the workload which wants vmemmap pages to be optimized and the workload which wants to set 'nr_overcommit_hugepages' and does not want the extra overhead at fault time when the overcommitted pages are allocated from the buddy allocator are deployed on the same server. The user could enable this feature and set 'nr_hugepages' and 'nr_overcommit_hugepages', then disable the feature. In this case, the overcommitted HugeTLB pages will not encounter the extra overhead at fault time.
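As a rough usage illustration (not part of the patch): once the handler added below registers the knob under "vm", the feature can be toggled at runtime from userspace instead of via the boot cmdline. The sketch assumes the file is exposed as /proc/sys/vm/hugetlb_optimize_vmemmap and that the caller has CAP_SYS_ADMIN; error handling is deliberately minimal.

/* Illustrative userspace helper: "demo_toggle on" / "demo_toggle off". */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define KNOB "/proc/sys/vm/hugetlb_optimize_vmemmap"

int main(int argc, char **argv)
{
	const char *val = (argc > 1 && strcmp(argv[1], "off") == 0) ? "0" : "1";
	int fd = open(KNOB, O_WRONLY);

	if (fd < 0) {
		perror(KNOB);	/* absent if the kernel disabled the knob at boot */
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	printf("hugetlb_optimize_vmemmap set to %s\n", val);
	return 0;
}

Note that, per the documentation added later in this patch, writing 0 only affects HugeTLB pages allocated afterwards; pages whose vmemmap was already optimized stay that way until they are returned to the buddy allocator.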
Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Jonathan Corbet Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Oscar Salvador Cc: David Hildenbrand Cc: Masahiro Yamada Cc: Xiongchun Duan Signed-off-by: Andrew Morton Conflicts: include/linux/memory_hotplug.h mm/hugetlb_vmemmap.c mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/sysctl/vm.rst | 39 +++++++++++ mm/hugetlb_vmemmap.c | 92 ++++++++++++++++++++++--- 2 files changed, 122 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index a5fbef4740c2..5de629b932ae 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -560,6 +560,45 @@ Change the minimum size of the hugepage pool. See Documentation/admin-guide/mm/hugetlbpage.rst +hugetlb_optimize_vmemmap +======================== + +This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) +is configured or the size of 'struct page' (a structure defined in +include/linux/mm_types.h) is not power of two (an unusual system config could +result in this). + +Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages +associated with each HugeTLB page. + +Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages +per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be +optimized. When those optimized HugeTLB pages are freed from the HugeTLB pool +to the buddy allocator, the vmemmap pages representing that range needs to be +remapped again and the vmemmap pages discarded earlier need to be rellocated +again. If your use case is that HugeTLB pages are allocated 'on the fly' (e.g. +never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set +'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on +the fly') instead of being pulled from the HugeTLB pool, you should weigh the +benefits of memory savings against the more overhead (~2x slower than before) +of allocation or freeing HugeTLB pages between the HugeTLB pool and the buddy +allocator. Another behavior to note is that if the system is under heavy memory +pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB +pool to the buddy allocator since the allocation of vmemmap pages could be +failed, you have to retry later if your system encounter this situation. + +Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will not be optimized meaning the extra overhead at allocation +time from buddy allocator disappears, whereas already optimized HugeTLB pages +will not be affected. If you want to make sure there are no optimized HugeTLB +pages, you can set "nr_hugepages" to 0 first and then disable this. Note that +writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus +pages. So, those surplus pages are still optimized until they are no longer +in use. You would need to wait for those surplus pages to be released before +there are no optimized pages in the system. 
+ + nr_hugepages_mempolicy ====================== diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 112e74504905..4340bf6c5551 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,21 +188,40 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) +enum vmemmap_optimize_mode { + VMEMMAP_OPTIMIZE_OFF, + VMEMMAP_OPTIMIZE_ON, +}; + DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +static enum vmemmap_optimize_mode vmemmap_optimize_mode = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + +static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +{ + if (vmemmap_optimize_mode == to) + return; + + if (to == VMEMMAP_OPTIMIZE_OFF) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else + static_branch_inc(&hugetlb_optimize_vmemmap_key); + WRITE_ONCE(vmemmap_optimize_mode, to); +} + static int __init hugetlb_vmemmap_early_param(char *buf) { bool enable; + enum vmemmap_optimize_mode mode; if (kstrtobool(buf, &enable)) return -EINVAL; - if (enable) - static_branch_enable(&hugetlb_optimize_vmemmap_key); - else - static_branch_disable(&hugetlb_optimize_vmemmap_key); + mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; + vmemmap_optimize_mode_switch(mode); return 0; } @@ -235,8 +254,10 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) + if (!ret) { ClearHPageVmemmapOptimized(head); + static_branch_dec(&hugetlb_optimize_vmemmap_key); + } return ret; } @@ -250,6 +271,11 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) if (!vmemmap_pages) return; + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return; + + static_branch_inc(&hugetlb_optimize_vmemmap_key); + vmemmap_addr += RESERVE_VMEMMAP_SIZE; vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); vmemmap_reuse = vmemmap_addr - PAGE_SIZE; @@ -259,7 +285,9 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) * to the page which @vmemmap_reuse is mapped to, then free the pages * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. 
*/ - if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else SetHPageVmemmapOptimized(head); } @@ -276,9 +304,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_optimize_vmemmap_enabled()) - return; - if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -300,3 +325,52 @@ void __init hugetlb_vmemmap_init(struct hstate *h) pr_info("can optimize %d vmemmap pages for %s\n", h->optimize_vmemmap_pages, h->name); } + +#ifdef CONFIG_PROC_SYSCTL +static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + enum vmemmap_optimize_mode mode; + static DEFINE_MUTEX(sysctl_mutex); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sysctl_mutex); + mode = vmemmap_optimize_mode; + table->data = &mode; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (write && !ret) + vmemmap_optimize_mode_switch(mode); + mutex_unlock(&sysctl_mutex); + + return ret; +} + +static struct ctl_table hugetlb_vmemmap_sysctls[] = { + { + .procname = "hugetlb_optimize_vmemmap", + .maxlen = sizeof(enum vmemmap_optimize_mode), + .mode = 0644, + .proc_handler = hugetlb_optimize_vmemmap_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int hugetlb_vmemmap_sysctls_init(void) +{ + /* + * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" + * crosses page boundaries, the vmemmap pages cannot be optimized. + */ + if (is_power_of_2(sizeof(struct page))) + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + + return 0; +} +late_initcall(hugetlb_vmemmap_sysctls_init); +#endif /* CONFIG_PROC_SYSCTL */ -- Gitee From 1d2bbc77596574b078cd2b8dec8725d8c38617ea Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:07:01 +0800 Subject: [PATCH 128/132] mm: hugetlb_vmemmap: fix CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON mainline inclusion from mainline-v5.19-rc1 commit 0111def915b280c64c05f73f01b59ca404255aa3 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0111def915b280c64c05f73f01b59ca404255aa3 -------------------------------- The following: commit 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") forgot to update CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON used in vmemmap_optimize_mode to CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. The result is we cannot enable hugetlb_optimize_vmemmap at boot time when we configure CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. Fix it. 
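The reason the stale name compiled cleanly is that IS_ENABLED() of an undefined config symbol silently evaluates to 0 rather than causing a build error, so vmemmap_optimize_mode quietly defaulted to "off". A one-line illustration, using a deliberately nonexistent, hypothetical symbol:

#include <linux/kconfig.h>

/* CONFIG_NO_SUCH_OPTION is intentionally undefined: IS_ENABLED() yields 0, not an error. */
static const int demo_default_on = IS_ENABLED(CONFIG_NO_SUCH_OPTION);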
Link: https://lkml.kernel.org/r/20220527081948.68832-1-songmuchun@bytedance.com Fixes: 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") Signed-off-by: Muchun Song Reported-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4340bf6c5551..e9f63cb9e3d4 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -198,7 +198,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static enum vmemmap_optimize_mode vmemmap_optimize_mode = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) { -- Gitee From f1cfb6175ca81adf5ad5448ea4ad8cd205acf96b Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:07:02 +0800 Subject: [PATCH 129/132] mm: hugetlb_vmemmap: disable hugetlb_vmemmap when dynamic hugetlb is enabled hulk inclusion category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA -------------------------------- Disable hugetlb_vmemmap when dynamic hugetlb is enabled. By the way, fix a similar spelling error. Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/dynamic_hugetlb.c | 4 ++-- mm/huge_memory.c | 2 +- mm/hugetlb_vmemmap.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index f8ebc8ab7d60..c1b968a7e668 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1150,6 +1150,6 @@ static int __init dynamic_hugetlb_setup(char *s) { if (!strcmp(s, "on")) enable_dhugetlb = true; - return 1; + return 0; } -__setup("dynamic_hugetlb=", dynamic_hugetlb_setup); +early_param("dynamic_hugetlb", dynamic_hugetlb_setup); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bfe079e294cb..79c855b5adad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -401,7 +401,7 @@ static int __init hugepage_init(void) */ if (enable_dhugetlb) { transparent_hugepage_flags = 0; - pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n"); + pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n"); return -EINVAL; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index e9f63cb9e3d4..7ec8560d267d 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -176,6 +176,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt +#include #include "hugetlb_vmemmap.h" /* @@ -304,6 +305,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); + if (enable_dhugetlb) { + pr_warn_once("cannot optimize vmemmap pages due to conflict with dynamic hugetlb\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -366,8 +373,10 @@ static __init int hugetlb_vmemmap_sysctls_init(void) /* * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" * crosses page boundaries, the vmemmap pages cannot be optimized. 
+ * If "dynamic hugetlb" is enabled, the vmemmap pages cannot be + * optimized. */ - if (is_power_of_2(sizeof(struct page))) + if (is_power_of_2(sizeof(struct page)) && !enable_dhugetlb) register_sysctl_init("vm", hugetlb_vmemmap_sysctls); return 0; -- Gitee From 3513c1a7443124efaa0bdf2464369288ea139efd Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:28 +0800 Subject: [PATCH 130/132] watchdog: wdat_wdt: Using the existing function to check parameter timeout mainline inclusion from mainline-v5.19-rc1 commit 6d72c7ac9fbe26a77800676507da980436b40b2f category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596BF Intel-SIG: commit 6d72c7ac9fbe watchdog: wdat_wdt: Using the existing function to check parameter timeout ------------------------------------- If max_hw_heartbeat_ms is provided, the configured maximum timeout is not limited by it. The limit check in this driver therefore doesn't make much sense. Similar, the watchdog core ensures that minimum timeout limits are met if min_hw_heartbeat_ms is set. Using watchdog_timeout_invalid() makes more sense because it takes this into account. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-2-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Yunying Sun --- drivers/watchdog/wdat_wdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 3065dd670a18..d1f4dc191fbb 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -344,6 +344,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) wdat->period = tbl->timer_period; wdat->wdd.min_hw_heartbeat_ms = wdat->period * tbl->min_count; wdat->wdd.max_hw_heartbeat_ms = wdat->period * tbl->max_count; + wdat->wdd.min_timeout = 1; wdat->stopped_in_sleep = tbl->flags & ACPI_WDAT_STOPPED; wdat->wdd.info = &wdat_wdt_info; wdat->wdd.ops = &wdat_wdt_ops; @@ -450,8 +451,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) * watchdog properly after it has opened the device. In some cases * the BIOS default is too short and causes immediate reboot. */ - if (timeout * 1000 < wdat->wdd.min_hw_heartbeat_ms || - timeout * 1000 > wdat->wdd.max_hw_heartbeat_ms) { + if (watchdog_timeout_invalid(&wdat->wdd, timeout)) { dev_warn(dev, "Invalid timeout %d given, using %d\n", timeout, WDAT_DEFAULT_TIMEOUT); timeout = WDAT_DEFAULT_TIMEOUT; -- Gitee From 3c71105b0bc6268f9cc72187dc04ac7519d2d2e1 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:29 +0800 Subject: [PATCH 131/132] watchdog: wdat_wdt: Stop watchdog when rebooting the system mainline inclusion from mainline-v5.19-rc1 commit 27fdf84510a1374748904db43f6755f912736d92 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596BF Intel-SIG: commit 27fdf84510a1 watchdog: wdat_wdt: Stop watchdog when rebooting the system ------------------------------------- Executing reboot command several times on the machine "Dell PowerEdge R740", UEFI security detection stopped machine with the following prompt: UEFI0082: The system was reset due to a timeout from the watchdog timer. Check the System Event Log (SEL) or crash dumps from Operating Sysstem to identify the source that triggered the watchdog timer reset. Update the firmware or driver for the identified device. iDRAC has warning event: "The watchdog timer reset the system". 
This patch fixes this issue by adding the reboot notifier. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-3-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Yunying Sun --- drivers/watchdog/wdat_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index d1f4dc191fbb..1f0231699c17 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -462,6 +462,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) return ret; watchdog_set_nowayout(&wdat->wdd, nowayout); + watchdog_stop_on_reboot(&wdat->wdd); return devm_watchdog_register_device(dev, &wdat->wdd); } -- Gitee From 190fb72a92b47a89bb2066a414a0cecfab9d5348 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:30 +0800 Subject: [PATCH 132/132] watchdog: wdat_wdt: Stop watchdog when uninstalling module mainline inclusion from mainline-v5.19-rc1 commit 330415ebea81b65842e4cc6d2fd985c1b369e650 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596BF Intel-SIG: commit 330415ebea81 watchdog: wdat_wdt: Stop watchdog when uninstalling module ------------------------------------- Tests show that the watchdog still reboots the machine after the module is removed. Use watchdog_stop_on_unregister to stop the watchdog on removal. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-4-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Yunying Sun --- drivers/watchdog/wdat_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 1f0231699c17..1bd4c8d89ed4 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -463,6 +463,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) watchdog_set_nowayout(&wdat->wdd, nowayout); watchdog_stop_on_reboot(&wdat->wdd); + watchdog_stop_on_unregister(&wdat->wdd); return devm_watchdog_register_device(dev, &wdat->wdd); } -- Gitee
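Taken together, the three wdat_wdt fixes above lean on watchdog-core helpers instead of driver-local logic: watchdog_timeout_invalid() validates the requested timeout against the limits the core already knows about, while watchdog_stop_on_reboot() and watchdog_stop_on_unregister() make the core quiesce the hardware on reboot and on module removal. The sketch below is a hedged, generic illustration of where those helpers sit in a driver probe path; the "demo_wdt" names, ops and timeout values are invented for the example and are not taken from wdat_wdt.

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/watchdog.h>

#define DEMO_DEFAULT_TIMEOUT	30	/* seconds, arbitrary for the example */

static int demo_wdt_start(struct watchdog_device *wdd)
{
	return 0;	/* hardware poke elided */
}

static int demo_wdt_stop(struct watchdog_device *wdd)
{
	return 0;	/* hardware poke elided */
}

static const struct watchdog_info demo_wdt_info = {
	.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE,
	.identity = "demo watchdog",
};

static const struct watchdog_ops demo_wdt_ops = {
	.owner = THIS_MODULE,
	.start = demo_wdt_start,
	.stop  = demo_wdt_stop,
};

static int demo_wdt_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	struct watchdog_device *wdd;
	unsigned int timeout = DEMO_DEFAULT_TIMEOUT;

	wdd = devm_kzalloc(dev, sizeof(*wdd), GFP_KERNEL);
	if (!wdd)
		return -ENOMEM;

	wdd->info = &demo_wdt_info;
	wdd->ops = &demo_wdt_ops;
	wdd->min_timeout = 1;
	wdd->max_hw_heartbeat_ms = 60 * 1000;

	/* As in patch 130: let the watchdog core judge whether the timeout is valid. */
	if (watchdog_timeout_invalid(wdd, timeout))
		timeout = DEMO_DEFAULT_TIMEOUT;
	wdd->timeout = timeout;

	/* As in patches 131 and 132: stop the hardware on reboot and on unregister. */
	watchdog_stop_on_reboot(wdd);
	watchdog_stop_on_unregister(wdd);

	return devm_watchdog_register_device(dev, wdd);
}

static struct platform_driver demo_wdt_driver = {
	.probe = demo_wdt_probe,
	.driver = {
		.name = "demo_wdt",
	},
};
module_platform_driver(demo_wdt_driver);

MODULE_LICENSE("GPL");

The design point of the series is that limits such as max_hw_heartbeat_ms are owned by the watchdog core, so the driver no longer duplicates (or contradicts) the core's timeout and shutdown policy.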