From 8b038d52a6cb7e735aaa5f100cecb70cba382981 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 13 Jul 2022 12:19:58 +0800 Subject: [PATCH 001/132] mm/hwpoison: do not lock page again when me_huge_page() successfully recovers mainline inclusion from mainline-v5.13 commit ea6d0630100b285f059d0a8d8e86f38a46407536 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit ea6d0630100b mm/hwpoison: do not lock page again when me_huge_page() successfully recovers. Backport for MCA recovery enhancing & bug fix. -------------------------------- Currently me_huge_page() temporary unlocks page to perform some actions then locks it again later. My testcase (which calls hard-offline on some tail page in a hugetlb, then accesses the address of the hugetlb range) showed that page allocation code detects this page lock on buddy page and printed out "BUG: Bad page state" message. check_new_page_bad() does not consider a page with __PG_HWPOISON as bad page, so this flag works as kind of filter, but this filtering doesn't work in this case because the "bad page" is not the actual hwpoisoned page. So stop locking page again. Actions to be taken depend on the page type of the error, so page unlocking should be done in ->action() callbacks. So let's make it assumed and change all existing callbacks that way. Link: https://lkml.kernel.org/r/20210609072029.74645-1-nao.horiguchi@gmail.com Fixes: commit 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error") Signed-off-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Michal Hocko Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- mm/memory-failure.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0519f20d2b57..74a21042845a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, */ static int me_kernel(struct page *p, unsigned long pfn) { + unlock_page(p); return MF_IGNORED; } @@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + unlock_page(p); return MF_FAILED; } @@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { + int ret; struct address_space *mapping; delete_from_lru_cache(p); @@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * For anonymous pages we're done the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) - return MF_RECOVERED; + if (PageAnon(p)) { + ret = MF_RECOVERED; + goto out; + } /* * Now truncate the page in the page cache. This is really @@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return MF_FAILED; + ret = MF_FAILED; + goto out; } /* @@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. 
*/ - return truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, pfn, mapping); +out: + unlock_page(p); + return ret; } /* @@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { + int ret; + ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!delete_from_lru_cache(p)) - return MF_DELAYED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + unlock_page(p); + return ret; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { + int ret; + delete_from_swap_cache(p); - if (!delete_from_lru_cache(p)) - return MF_RECOVERED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + unlock_page(p); + return ret; } /* @@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, pfn, mapping); + unlock_page(hpage); } else { res = MF_FAILED; unlock_page(hpage); @@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) page_ref_inc(p); res = MF_RECOVERED; } - lock_page(hpage); } return res; @@ -866,6 +877,8 @@ static struct page_state { unsigned long mask; unsigned long res; enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, @@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p, int result; int count; + /* page p should be unlocked after returning from ps->action(). */ result = ps->action(p, pfn); count = page_count(p) - 1; @@ -1246,7 +1260,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - res = identify_page_state(pfn, p, page_flags); + return identify_page_state(pfn, p, page_flags); out: unlock_page(head); return res; @@ -1529,6 +1543,8 @@ int memory_failure(unsigned long pfn, int flags) identify_page_state: res = identify_page_state(pfn, p, page_flags); + mutex_unlock(&mf_mutex); + return res; unlock_page: unlock_page(p); unlock_mutex: -- Gitee From 75a005c5418abcb879bdf3053d0853956cec7e73 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 28 Jun 2021 19:43:14 -0700 Subject: [PATCH 002/132] mm,hwpoison: send SIGBUS with error virutal address mainline inclusion from mainline-v5.14-rc1 commit a3f5d80ea401ac857f2910e28b15f35b2cf902f4 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit a3f5d80ea401 mm,hwpoison: send SIGBUS with error virutal address. Backport for MCA recovery enhancing & bug fix. -------------------------------- Now an action required MCE in already hwpoisoned address surely sends a SIGBUS to current process, but the SIGBUS doesn't convey error virtual address. That's not optimal for hwpoison-aware applications. To fix the issue, make memory_failure() call kill_accessing_process(), that does pagetable walk to find the error virtual address. It could find multiple virtual addresses for the same error page, and it seems hard to tell which virtual address is correct one. But that's rare and sending incorrect virtual address could be better than no address. So let's report the first found virtual address for now. 
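For context, a minimal userspace sketch (not part of this patch) showing how a
hwpoison-aware application can consume the virtual address that is now delivered
in the SIGBUS siginfo; the handler name and the recovery policy are illustrative
assumptions, only the siginfo fields (si_code == BUS_MCEERR_AR, si_addr,
si_addr_lsb) are established Linux ABI:

	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static void sigbus_handler(int sig, siginfo_t *info, void *ucontext)
	{
		if (info->si_code == BUS_MCEERR_AR) {
			/* Error virtual address reported by the kernel */
			fprintf(stderr, "hwpoison at %p, lsb %d\n",
				info->si_addr, info->si_addr_lsb);
			/* Application-specific recovery would go here */
		}
		_exit(1);
	}

	static void install_sigbus_handler(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = sigbus_handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGBUS, &sa, NULL);
	}

Without the error virtual address in siginfo, such a handler cannot tell which
mapping was poisoned; that is the gap this patch closes.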
[naoya.horiguchi@nec.com: fix walk_page_range() return] Link: https://lkml.kernel.org/r/20210603051055.GA244241@hori.linux.bs1.fc.nec.co.jp Link: https://lkml.kernel.org/r/20210521030156.2612074-4-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Cc: Tony Luck Cc: Aili Yao Cc: Oscar Salvador Cc: David Hildenbrand Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Jue Wang Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 13 ++- include/linux/swapops.h | 5 ++ mm/memory-failure.c | 150 ++++++++++++++++++++++++++++++++- 3 files changed, 165 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 0ced8b250090..cdda2198deae 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1271,6 +1271,7 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; p->mce_count = 0; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); @@ -1278,13 +1279,21 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. + */ + if (ret == -EHWPOISON) + return; + if (p->mce_vaddr != (void __user *)-1l) { force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); } else { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 0d429a102d41..708fbeb21dd3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -332,6 +332,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 74a21042845a..0035e8ca4e49 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -554,6 +555,148 @@ void collect_procs(struct page *page, struct list_head *tokill, } EXPORT_SYMBOL_GPL(collect_procs); +struct hwp_walk { + struct to_kill tk; + unsigned long pfn; + int flags; +}; + +static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) +{ + tk->addr = addr; + tk->size_shift = shift; +} + +static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, + unsigned long poisoned_pfn, struct to_kill *tk) +{ + unsigned long pfn = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + } else { + swp_entry_t swp = pte_to_swp_entry(pte); + + if (is_hwpoison_entry(swp)) + pfn = hwpoison_entry_to_pfn(swp); + } + + if (!pfn || pfn != poisoned_pfn) + return 0; + + set_to_kill(tk, addr, shift); + return 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + pmd_t pmd = *pmdp; + unsigned 
long pfn; + unsigned long hwpoison_vaddr; + + if (!pmd_present(pmd)) + return 0; + pfn = pmd_pfn(pmd); + if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { + hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); + set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); + return 1; + } + return 0; +} +#else +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + return 0; +} +#endif + +static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + int ret = 0; + pte_t *ptep; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmdp, walk->vma); + if (ptl) { + ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmdp)) + goto out; + + ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + hwp->pfn, &hwp->tk); + if (ret == 1) + break; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(walk->vma); + + return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); +} +#else +#define hwpoison_hugetlb_range NULL +#endif + +static struct mm_walk_ops hwp_walk_ops = { + .pmd_entry = hwpoison_pte_range, + .hugetlb_entry = hwpoison_hugetlb_range, +}; + +/* + * Sends SIGBUS to the current process with error info. + * + * This function is intended to handle "Action Required" MCEs on already + * hardware poisoned pages. They could happen, for example, when + * memory_failure() failed to unmap the error page at the first call, or + * when multiple local machine checks happened on different CPUs. + * + * MCE handler currently has no easy access to the error virtual address, + * so this function walks page table to find it. The returned virtual address + * is proper in most cases, but it could be wrong when the application + * process has multiple entries mapping the error page. + */ +static int kill_accessing_process(struct task_struct *p, unsigned long pfn, + int flags) +{ + int ret; + struct hwp_walk priv = { + .pfn = pfn, + }; + priv.tk.tsk = p; + + mmap_read_lock(p->mm); + ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, + (void *)&priv); + if (ret == 1 && priv.tk.addr) + kill_proc(&priv.tk, pfn, flags); + mmap_read_unlock(p->mm); + return ret ? 
-EFAULT : -EHWPOISON; +} + static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -1204,7 +1347,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return -EHWPOISON; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, page_to_pfn(head), flags); + return res; } num_poisoned_pages_inc(); @@ -1409,6 +1555,8 @@ int memory_failure(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, pfn, flags); goto unlock_mutex; } -- Gitee From d1b9413b3213dc72557fc4cdb01ee3ab18d716f4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 17 Aug 2021 17:29:41 -0700 Subject: [PATCH 003/132] x86/mce: Change to not send SIGBUS error during copy from user mainline inclusion from mainline-v5.16-rc1 commit a6e3cf70b772541c2388abdb86e5a562cfe18e63 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit a6e3cf70b772 x86/mce: Change to not send SIGBUS error during copy from user. Backport for MCA recovery enhance and bug fix. -------------------------------- Sending a SIGBUS for a copy from user is not the correct semantic. System calls should return -EFAULT (or a short count for write(2)). Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210818002942.1607544-3-tony.luck@intel.com Signed-off-by: Youquan Song --- arch/x86/kernel/cpu/mce/core.c | 36 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index cdda2198deae..8aeb56f5e577 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1280,7 +1280,7 @@ static void kill_me_maybe(struct callback_head *cb) flags |= MF_MUST_KILL; ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); - if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; @@ -1294,15 +1294,21 @@ static void kill_me_maybe(struct callback_head *cb) if (ret == -EHWPOISON) return; - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + pr_err("Memory error not recovered"); + kill_me_now(cb); } -static void queue_task_work(struct mce *m, char *msg, int kill_current_task) +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; @@ -1312,11 +1318,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task) current->mce_kflags = m->kflags; current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); current->mce_whole_page = whole_page(m); - - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + current->mce_kill_me.func = func; } /* Ten is likely 
overkill. Don't expect more than two faults before task_work() */ @@ -1486,8 +1488,10 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, msg, kill_it); - + if (kill_it) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* * Handle an MCE which has happened in kernel space but from @@ -1504,7 +1508,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_it); + queue_task_work(&m, msg, kill_me_never); } instrumentation_end(); -- Gitee From 4bc1efa1413b9bdfaa10be0f049f19e2a2266253 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 May 2021 00:32:44 -0400 Subject: [PATCH 004/132] generic_perform_write()/iomap_write_actor(): saner logics for short copy mainline inclusion from mainline-v5.13 commit bc1bb416bbb9203e250f5c49aaf1d11b5d9c8adb category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit bc1bb416bbb9 generic_perform_write()/iomap_write_actor(): saner logics for short copy. Backport for MCA recovery enhance and bug fix. -------------------------------- if we run into a short copy and ->write_end() refuses to advance at all, use the amount we'd managed to copy for the next iteration to handle. Signed-off-by: Al Viro Signed-off-by: Youquan Song --- fs/iomap/buffered-io.c | 25 ++++++++++--------------- mm/filemap.c | 24 +++++++++--------------- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3ec494f5d7ee..d5246e277f17 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -771,10 +771,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -791,25 +787,24 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, + status = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made iomap_write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. 
*/ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; length -= copied; diff --git a/mm/filemap.c b/mm/filemap.c index 3958fc3280d8..80c5c418eba8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3510,10 +3510,6 @@ ssize_t generic_perform_write(struct file *file, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -3540,24 +3536,22 @@ ssize_t generic_perform_write(struct file *file, page, fsdata); if (unlikely(status < 0)) break; - copied = status; cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; -- Gitee From a1728f8de83520f93310fc7409c502358ca46fdd Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 17 Aug 2021 17:29:42 -0700 Subject: [PATCH 005/132] x86/mce: Drop copyin special case for #MC mainline inclusion from mainline-v5.16-rc1 commit 690658471b5f28d306e6492c4585d748cb5304e8 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 690658471b5f x86/mce: Drop copyin special case for #MC. Backport for MCA recovery enhancing & bug fix. -------------------------------- Fixes to the iterator code to handle faults that are not on page boundaries mean that the special case for machine check during copy from user is no longer needed. For a full list of those fixes, see the output of: git log --oneline v5.14 ^v5.13 -- lib/iov_iter.c Intel-SIG: commit 690658471b5f x86/mce: Drop copyin special case for #MC. backport for MCA recovery enhance and bug fix. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210818002942.1607544-4-tony.luck@intel.com Signed-off-by: Youquan Song --- arch/x86/lib/copy_user_64.S | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 77b9b2a3b5c8..e0e71ca023ce 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -234,24 +234,11 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) movl %edx,%ecx - cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ - je 3f 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret - /* - * Return zero to pretend that this copy succeeded. This - * is counter-intuitive, but needed to prevent the code - * in lib/iov_iter.c from retrying and running back into - * the poison cache line again. The machine check handler - * will ensure that a SIGBUS is sent to the task. 
- */ -3: xorl %eax,%eax - ASM_CLAC - ret - _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) -- Gitee From 3ea63cb74c8cf3446e0caf8c32682ae92d31a58f Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 23 Dec 2021 12:07:01 -0800 Subject: [PATCH 006/132] x86/mce: Reduce number of machine checks taken during recovery mainline inclusion from mainline-v5.17-rc1 commit 3376136300a00df9a864b88fa969177d6c3be8e5 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 3376136300a0 x86/mce: Reduce number of machine checks taken during recovery. Backport for MCA recovery enhancing & bug fix. -------------------------------- When any of the copy functions in arch/x86/lib/copy_user_64.S take a fault, the fixup code copies the remaining byte count from %ecx to %edx and unconditionally jumps to .Lcopy_user_handle_tail to continue the copy in case any more bytes can be copied. If the fault was #PF this may copy more bytes (because the page fault handler might have fixed the fault). But when the fault is a machine check the original copy code will have copied all the way to the poisoned cache line. So .Lcopy_user_handle_tail will just take another machine check for no good reason. Every code path to .Lcopy_user_handle_tail comes from an exception fixup path, so add a check there to check the trap type (in %eax) and simply return the count of remaining bytes if the trap was a machine check. Doing this reduces the number of machine checks taken during synthetic tests from four to three. As well as reducing the number of machine checks, this also allows Skylake generation Xeons to recover some cases that currently fail. The is because REP; MOVSB is only recoverable when source and destination are well aligned and the byte count is large. That useless call to .Lcopy_user_handle_tail may violate one or more of these conditions and generate a fatal machine check. [ Tony: Add more details to commit message. ] [ bp: Fixup comment. Also, another tip patchset which is adding straight-line speculation mitigation changes the "ret" instruction to an all-caps macro "RET". But, since gas is case-insensitive, use "RET" in the newly added asm block already in order to simplify tip branch merging on its way upstream. ] Signed-off-by: Youquan Song Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/YcTW5dh8yTGucDd+@agluck-desk2.amr.corp.intel.com Signed-off-by: Youquan Song --- arch/x86/lib/copy_user_64.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index e0e71ca023ce..403ba3eb4b84 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -225,6 +225,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Don't try to copy the tail if machine check happened * * Input: + * eax trap number written by ex_handler_copy() * rdi destination * rsi source * rdx count @@ -233,12 +234,20 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * eax uncopied bytes or 0 if successful. 
*/ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) + cmp $X86_TRAP_MC,%eax + je 3f + movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret +3: + movl %edx,%eax + ASM_CLAC + RET + _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) -- Gitee From 297f05904ec5352fa767b706f70b7bf9d88ab986 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sun, 6 Feb 2022 15:10:14 +0800 Subject: [PATCH 007/132] mm/hwpoison: fix error page recovered but reported "not recovered" mainline inclusion from mainline-v5.18-rc1 commit 046545a661af2beec21de7b90ca0e35f05088a81 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 046545a661af mm/hwpoison: fix error page recovered but reported "not recovered". Backport for MCA recovery enhancing & bug fix. -------------------------------- When an uncorrected memory error is consumed there is a race between the CMCI from the memory controller reporting an uncorrected error with a UCNA signature, and the core reporting and SRAR signature machine check when the data is about to be consumed. If the CMCI wins that race, the page is marked poisoned when uc_decode_notifier() calls memory_failure() and the machine check processing code finds the page already poisoned. It calls kill_accessing_process() to make sure a SIGBUS is sent. But returns the wrong error code. Console log looks like this: mce: Uncorrected hardware memory error in user-access at 3710b3400 Memory failure: 0x3710b3: recovery action for dirty LRU page: Recovered Memory failure: 0x3710b3: already hardware poisoned Memory failure: 0x3710b3: Sending SIGBUS to einj_mem_uc:361438 due to hardware memory corruption mce: Memory error not recovered kill_accessing_process() is supposed to return -EHWPOISON to notify that SIGBUS is already set to the process and kill_me_maybe() doesn't have to send it again. But current code simply fails to do this, so fix it to make sure to work as intended. This change avoids the noise message "Memory error not recovered" and skips duplicate SIGBUSs. [tony.luck@intel.com: reword some parts of commit message] Link: https://lkml.kernel.org/r/20220113231117.1021405-1-naoya.horiguchi@linux.dev Fixes: a3f5d80ea401 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Naoya Horiguchi Reported-by: Youquan Song Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Youquan Song --- mm/memory-failure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0035e8ca4e49..7c7769ff0f1d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -693,8 +693,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { -- Gitee From 928722d827c692d23c11f40980c7edc536a58cd9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:52 +0800 Subject: [PATCH 008/132] EDAC: Add DDR5 new memory type mainline inclusion from mainline-v5.13 commit bc1c99a5971aa7571e8b9731c28fa32abe12cab8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit bc1c99a5971a EDAC: Add DDR5 new memory type. Backport for EDAC enhancing & bug fix. 
-------------------------------- Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for DDR5 new memory type. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Signed-off-by: Youquan Song --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index f4eb071327be..34514c638f19 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -161,6 +161,7 @@ const char * const edac_mem_types[] = { [MEM_DDR4] = "Unbuffered-DDR4", [MEM_RDDR4] = "Registered-DDR4", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", + [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index 15e8f3d8a895..6c4565cc6273 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -179,6 +179,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_RDDR4: Registered DDR4 RAM * This is a variant of the DDR4 memories. * @MEM_LRDDR4: Load-Reduced DDR4 memory. + * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM */ enum mem_type { @@ -203,6 +204,7 @@ enum mem_type { MEM_DDR4, MEM_RDDR4, MEM_LRDDR4, + MEM_DDR5, MEM_NVDIMM, }; @@ -226,6 +228,7 @@ enum mem_type { #define MEM_FLAG_DDR4 BIT(MEM_DDR4) #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) /** -- Gitee From 432fad27ed99567be07b4ebfbb7aa50b8856c83e Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 17 Nov 2020 20:49:53 +0800 Subject: [PATCH 009/132] EDAC/i10nm: Add Intel Sapphire Rapids server support mainline inclusion from mainline-v5.11-rc1 commit 479f58dda25bb46daeb937f124718e8b4aea6781 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel_SIG: commit 479f58dda25b EDAC/i10nm: Add Intel Sapphire Rapids server support. Backport for add EDAC SPR suppporting. -------------------------------- The Sapphire Rapids CPU model shares the same memory controller architecture with Ice Lake server. There are some configurations different from Ice Lake server as below: - The device ID for configuration agent. - The size for per channel memory-mapped I/O. - The DDR5 memory support. So add the above configurations and the Sapphire Rapids CPU model ID for EDAC support. 
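As an illustration of how the per-channel MMIO size difference is absorbed (a
simplified sketch of the address arithmetic behind the reworked I10NM_GET_*
macros in the diff below; the helper name is hypothetical):

	/*
	 * Byte offset of the DIMMMTR register for channel @chan, DIMM @dimm,
	 * relative to the memory controller MMIO base. 0x2080c is the DIMMMTR
	 * base inside a channel window.
	 */
	static unsigned int dimmmtr_offset(unsigned int chan_mmio_sz,
					   int chan, int dimm)
	{
		return 0x2080c + chan * chan_mmio_sz + dimm * 4;
	}

With chan_mmio_sz set to 0x4000 for Ice Lake server and 0x8000 for Sapphire
Rapids via the per-model res_config, the same driver code addresses both
register layouts.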
Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 34 +++++++++++++++++++++++++--------- drivers/edac/skx_base.c | 6 +++--- drivers/edac/skx_common.c | 23 ++++++++++++++++++----- drivers/edac/skx_common.h | 16 ++++++++++++---- 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 3a7362f968c9..4f7f9970a901 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.3" +#define I10NM_REVISION "v0.0.4" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -25,11 +25,13 @@ #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * 0x4000 + (j) * 4) + readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * 0x4000) + readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * 0x4000) + readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_AMAP(m, i) \ + readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) @@ -129,12 +131,22 @@ static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xcc, + .ddr_chan_mmio_sz = 0x4000, }; static struct res_config i10nm_cfg1 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x4000, +}; + +static struct res_config spr_cfg = { + .type = SPR, + .decs_did = 0x3252, + .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x8000, + .support_ddr5 = true, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -143,6 +155,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); @@ -157,12 +170,13 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int i10nm_get_dimm_config(struct mem_ctl_info *mci) +static int i10nm_get_dimm_config(struct mem_ctl_info *mci, + struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; struct skx_imc *imc = pvt->imc; + u32 mtr, amap, mcddrtcfg; struct dimm_info *dimm; - u32 mtr, mcddrtcfg; int i, j, ndimms; for (i = 0; i < I10NM_NUM_CHANNELS; i++) { @@ -171,6 +185,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) ndimms = 0; mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); + amap = I10NM_GET_AMAP(imc, i); for (j = 0; j < I10NM_NUM_DIMMS; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); @@ -178,8 +193,8 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci) mtr, mcddrtcfg, imc->mc, i, j); if (IS_DIMM_PRESENT(mtr)) - ndimms += skx_get_dimm_info(mtr, 0, 0, dimm, - imc, i, j); + ndimms += skx_get_dimm_info(mtr, 0, amap, dimm, + imc, i, j, cfg); else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) ndimms += skx_get_nvdimm_info(dimm, imc, i, j, 
EDAC_MOD_STR); @@ -306,10 +321,11 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, - i10nm_get_dimm_config); + i10nm_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index f887e3166651..4dbd46575bfb 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -174,7 +174,7 @@ static bool skx_check_ecc(u32 mcmtr) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int skx_get_dimm_config(struct mem_ctl_info *mci) +static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; u32 mtr, mcmtr, amap, mcddrtcfg; @@ -195,7 +195,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); if (IS_DIMM_PRESENT(mtr)) { - ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j); + ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j, cfg); } else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) { ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -705,7 +705,7 @@ static int __init skx_init(void) d->imc[i].node_id = node_id; rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, - skx_get_dimm_config); + skx_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 2b4ce8e5ac2f..81c3e2ec6f56 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -304,15 +304,25 @@ static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno) + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg) { - int banks = 16, ranks, rows, cols, npages; + int banks, ranks, rows, cols, npages; + enum mem_type mtype; u64 size; ranks = numrank(mtr); rows = numrow(mtr); cols = numcol(mtr); + if (cfg->support_ddr5 && (amap & 0x8)) { + banks = 32; + mtype = MEM_DDR5; + } else { + banks = 16; + mtype = MEM_DDR4; + } + /* * Compute size in 8-byte (2^3) words, then shift to MiB (2^20) */ @@ -332,7 +342,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->nr_pages = npages; dimm->grain = 32; dimm->dtype = get_width(mtr); - dimm->mtype = MEM_DDR4; + dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", imc->src_id, imc->lmc, chan, dimmno); @@ -390,7 +400,8 @@ int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config) + get_dimm_config_f get_dimm_config, + struct res_config *cfg) { struct mem_ctl_info *mci; struct edac_mc_layer layers[2]; @@ -425,13 +436,15 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, } mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM; + if (cfg->support_ddr5) + mci->mtype_cap |= MEM_FLAG_DDR5; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = mod_str; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; - rc = get_dimm_config(mci); + rc = 
get_dimm_config(mci, cfg); if (rc < 0) goto fail; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 78f8c1de0b71..bf56bebff138 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -59,6 +59,7 @@ struct skx_dev { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ + int chan_mmio_sz; /* for i10nm CPU */ u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; @@ -82,7 +83,8 @@ struct skx_pvt { enum type { SKX, - I10NM + I10NM, + SPR }; enum { @@ -118,9 +120,13 @@ struct res_config { unsigned int decs_did; /* Default bus number configuration register offset */ int busno_cfg_offset; + /* Per DDR channel memory-mapped I/O size */ + int ddr_chan_mmio_sz; + bool support_ddr5; }; -typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci); +typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, + struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); @@ -136,14 +142,16 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list); int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm); int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno); + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg); int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, const char *mod_str); int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config); + get_dimm_config_f get_dimm_config, + struct res_config *cfg); int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data); -- Gitee From 3dd707aa7c8e5d5cda72390ee0d22508944bdb77 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Wed, 18 Aug 2021 10:57:01 -0700 Subject: [PATCH 010/132] EDAC/i10nm: Retrieve and print retry_rd_err_log registers mainline inclusion from mainline-v5.15-rc1 commit cf4e6d52f58399c777276172ec250502e19d5e63 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit cf4e6d52f583 EDAC/i10nm: Retrieve and print retry_rd_err_log registers. Backport for EDAC retry_rd_err_log for ICX/SPR support. -------------------------------- Retrieve and print retry_rd_err_log registers like the earlier change: commit e80634a75aba ("EDAC, skx: Retrieve and print retry_rd_err_log registers") This is a little trickier than on Skylake because of potential interference with BIOS use of the same registers. The default behavior is to ignore these registers. A module parameter retry_rd_err_log(default=0) controls the mode of operation: - 0=off : Default. - 1=bios : Linux doesn't reset any control bits, but just reports values. This is "no harm" mode, but it may miss reporting some data. - 2=linux: Linux tries to take control and resets mode bits, clears valid/UC bits after reading. This should be more reliable (especially if BIOS interference is reduced by disabling eMCA reporting mode in BIOS setup). 
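Usage note (not part of the patch): because the parameter is registered with
module_param(retry_rd_err_log, int, 0444) it is read-only at runtime, so the
mode has to be chosen at load time, e.g. "modprobe i10nm_edac retry_rd_err_log=2",
or on the kernel command line as "i10nm_edac.retry_rd_err_log=2" when the
driver is built in.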
Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Youquan Song Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210818175701.1611513-3-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 146 ++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_base.c | 3 +- drivers/edac/skx_common.c | 4 +- drivers/edac/skx_common.h | 7 +- 4 files changed, 157 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 4f7f9970a901..7798de1e2aed 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -32,14 +32,137 @@ readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_REG32(m, i, offset) \ + readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_GET_REG64(m, i, offset) \ + readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_SET_REG32(m, i, offset, v) \ + writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define RETRY_RD_ERR_LOG_UC BIT(1) +#define RETRY_RD_ERR_LOG_NOOVER BIT(14) +#define RETRY_RD_ERR_LOG_EN BIT(15) +#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) +#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) + static struct list_head *i10nm_edac_list; +static struct res_config *res_cfg; +static int retry_rd_err_log; + +static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; + +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable) +{ + u32 s, d; + + if (!imc->mbase) + return; + + s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]); + + if (enable) { + /* Save default configurations */ + imc->chan[chan].retry_rd_err_log_s = s; + imc->chan[chan].retry_rd_err_log_d = d; + + s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + s |= RETRY_RD_ERR_LOG_EN; + d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + d |= RETRY_RD_ERR_LOG_EN; + } else { + /* Restore default configurations */ + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) + s |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER) + s |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN)) + s &= ~RETRY_RD_ERR_LOG_EN; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC) + d |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER) + d |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) + d &= ~RETRY_RD_ERR_LOG_EN; + } + + I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d); +} + +static void enable_retry_rd_err_log(bool enable) +{ + struct skx_dev *d; + int i, j; + + edac_dbg(2, "\n"); + + list_for_each_entry(d, i10nm_edac_list, list) + for (i = 0; i < I10NM_NUM_IMC; i++) + for (j = 0; j < I10NM_NUM_CHANNELS; j++) + 
__enable_retry_rd_err_log(&d->imc[i], j, enable); +} + +static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, + int len, bool scrub_err) +{ + struct skx_imc *imc = &res->dev->imc[res->imc]; + u32 log0, log1, log2, log3, log4; + u32 corr0, corr1, corr2, corr3; + u64 log2a, log5; + u32 *offsets; + int n; + + if (!imc->mbase) + return; + + offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand; + + log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); + log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); + log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]); + log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); + log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + + if (res_cfg->type == SPR) { + log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]", + log0, log1, log2a, log3, log4, log5); + } else { + log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", + log0, log1, log2, log3, log4, log5); + } + + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + + if (len - n > 0) + snprintf(msg + n, len - n, + " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", + corr0 & 0xffff, corr0 >> 16, + corr1 & 0xffff, corr1 >> 16, + corr2 & 0xffff, corr2 >> 16, + corr3 & 0xffff, corr3 >> 16); + + /* Clear status bits */ + if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } +} + static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, unsigned int dev, unsigned int fun) { @@ -132,6 +255,8 @@ static struct res_config i10nm_cfg0 = { .decs_did = 0x3452, .busno_cfg_offset = 0xcc, .ddr_chan_mmio_sz = 0x4000, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config i10nm_cfg1 = { @@ -139,6 +264,8 @@ static struct res_config i10nm_cfg1 = { .decs_did = 0x3452, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x4000, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config spr_cfg = { @@ -147,6 +274,8 @@ static struct res_config spr_cfg = { .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, .support_ddr5 = true, + .offsets_scrub = offsets_scrub_spr, + .offsets_demand = offsets_demand_spr, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -286,6 +415,7 @@ static int __init i10nm_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + res_cfg = cfg; rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm); if (rc) @@ -339,6 +469,12 @@ static int __init i10nm_init(void) mce_register_decode_chain(&i10nm_mce_dec); setup_i10nm_debug(); + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, show_retry_rd_err_log); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(true); + } + i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); return 0; @@ -350,6 +486,13 @@ static int __init i10nm_init(void) static void __exit i10nm_exit(void) { edac_dbg(2, "\n"); + + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, NULL); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(false); + } + teardown_i10nm_debug(); 
mce_unregister_decode_chain(&i10nm_mce_dec); skx_adxl_put(); @@ -359,5 +502,8 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +module_param(retry_rd_err_log, int, 0444); +MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)"); + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors"); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 4dbd46575bfb..1abc020d49ab 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -230,7 +230,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) #define SKX_ILV_TARGET(tgt) ((tgt) & 7) static void skx_show_retry_rd_err_log(struct decoded_addr *res, - char *msg, int len) + char *msg, int len, + bool scrub_err) { u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 81c3e2ec6f56..05ffd39d6de9 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -494,6 +494,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); + bool scrub_err = false; bool recoverable; int len; u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); @@ -545,6 +546,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; case 4: optype = "memory scrubbing error"; + scrub_err = true; break; default: optype = "reserved"; @@ -567,7 +569,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } if (skx_show_retry_rd_err_log) - skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); + skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err); edac_dbg(0, "%s\n", skx_msg); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index bf56bebff138..22e174c740be 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -66,6 +66,8 @@ struct skx_dev { struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; + u32 retry_rd_err_log_s; + u32 retry_rd_err_log_d; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -123,12 +125,15 @@ struct res_config { /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; bool support_ddr5; + /* Offsets of retry_rd_err_log registers */ + u32 *offsets_scrub; + u32 *offsets_demand; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); -typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); +typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err); int __init skx_adxl_get(void); void __exit skx_adxl_put(void); -- Gitee From f82dbd0e34a668a0fc38aa46107fb0f2866eb1d9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 11 Jun 2021 10:01:18 -0700 Subject: [PATCH 011/132] EDAC/skx_common: Add new ADXL components for 2-level memory mainline inclusion from mainline-v5.14-rc1 commit 2f4348e5a86198704368a699a7c4cdeb21d569f5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 2f4348e5a861 EDAC/skx_common: Add new ADXL components for 2-level memory. Backport to add EDAC 2LM support. 
-------------------------------- Some Intel servers may configure memory in 2 levels, using fast "near" memory (e.g. DDR) as a cache for larger, slower, "far" memory (e.g. 3D X-point). In these configurations the BIOS ADXL address translation for an address in a 2-level memory range will provide details of both the "near" and far components. Current exported ADXL components are only for 1-level memory system or for 2nd level memory of 2-level memory system. So add new ADXL components for 1st level memory of 2-level memory system to fully support 2-level memory system and the detection of memory error source(1st level memory or 2nd level memory). Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-2-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 67 ++++++++++++++++++++++++++++++++------- drivers/edac/skx_common.h | 11 +++++++ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 05ffd39d6de9..3d0b391a400d 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -23,10 +23,13 @@ #include "skx_common.h" static const char * const component_names[] = { - [INDEX_SOCKET] = "ProcessorSocketId", - [INDEX_MEMCTRL] = "MemoryControllerId", - [INDEX_CHANNEL] = "ChannelId", - [INDEX_DIMM] = "DimmSlotId", + [INDEX_SOCKET] = "ProcessorSocketId", + [INDEX_MEMCTRL] = "MemoryControllerId", + [INDEX_CHANNEL] = "ChannelId", + [INDEX_DIMM] = "DimmSlotId", + [INDEX_NM_MEMCTRL] = "NmMemoryControllerId", + [INDEX_NM_CHANNEL] = "NmChannelId", + [INDEX_NM_DIMM] = "NmDimmSlotId", }; static int component_indices[ARRAY_SIZE(component_names)]; @@ -34,12 +37,14 @@ static int adxl_component_count; static const char * const *adxl_component_names; static u64 *adxl_values; static char *adxl_msg; +static unsigned long adxl_nm_bitmap; static char skx_msg[MSG_SIZE]; static skx_decode_f skx_decode; static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); +static bool skx_mem_cfg_2lm; int __init skx_adxl_get(void) { @@ -56,14 +61,25 @@ int __init skx_adxl_get(void) for (j = 0; names[j]; j++) { if (!strcmp(component_names[i], names[j])) { component_indices[i] = j; + + if (i >= INDEX_NM_FIRST) + adxl_nm_bitmap |= 1 << i; + break; } } - if (!names[j]) + if (!names[j] && i < INDEX_NM_FIRST) goto err; } + if (skx_mem_cfg_2lm) { + if (!adxl_nm_bitmap) + skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n"); + else + edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap); + } + adxl_component_names = names; while (*names++) adxl_component_count++; @@ -99,7 +115,7 @@ void __exit skx_adxl_put(void) kfree(adxl_msg); } -static bool skx_adxl_decode(struct decoded_addr *res) +static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem) { struct skx_dev *d; int i, len = 0; @@ -116,11 +132,20 @@ static bool skx_adxl_decode(struct decoded_addr *res) } res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; - res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; - res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; - res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + if (error_in_1st_level_mem) { + res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? + (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1; + res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ? 
+ (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1; + res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ? + (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1; + } else { + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; + res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + } - if (res->imc > NUM_IMC - 1) { + if (res->imc > NUM_IMC - 1 || res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -151,6 +176,11 @@ static bool skx_adxl_decode(struct decoded_addr *res) return true; } +void skx_set_mem_cfg(bool mem_cfg_2lm) +{ + skx_mem_cfg_2lm = mem_cfg_2lm; +} + void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { skx_decode = decode; @@ -580,6 +610,21 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, skx_msg); } +static bool skx_error_in_1st_level_mem(const struct mce *m) +{ + u32 errcode; + + if (!skx_mem_cfg_2lm) + return false; + + errcode = GET_BITFIELD(m->status, 0, 15); + + if ((errcode & 0xef80) != 0x280) + return false; + + return true; +} + int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { @@ -599,7 +644,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, res.addr = mce->addr; if (adxl_component_count) { - if (!skx_adxl_decode(&res)) + if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))) return NOTIFY_DONE; } else if (!skx_decode || !skx_decode(&res)) { return NOTIFY_DONE; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 22e174c740be..de9f82977f0e 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -9,6 +9,8 @@ #ifndef _SKX_COMM_EDAC_H #define _SKX_COMM_EDAC_H +#include + #define MSG_SIZE 1024 /* @@ -94,9 +96,17 @@ enum { INDEX_MEMCTRL, INDEX_CHANNEL, INDEX_DIMM, + INDEX_NM_FIRST, + INDEX_NM_MEMCTRL = INDEX_NM_FIRST, + INDEX_NM_CHANNEL, + INDEX_NM_DIMM, INDEX_MAX }; +#define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) +#define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) +#define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) + struct decoded_addr { struct skx_dev *dev; u64 addr; @@ -138,6 +148,7 @@ typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int le int __init skx_adxl_get(void); void __exit skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); +void skx_set_mem_cfg(bool mem_cfg_2lm); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); -- Gitee From 6b81ee370e7b72e93ff4d3bef67c02ed4a35632b Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 11 Jun 2021 10:01:19 -0700 Subject: [PATCH 012/132] EDAC/i10nm: Add detection of memory levels for ICX/SPR servers mainline inclusion from mainline-v5.14-rc1 commit 4bd4d32e9a38d7ffb091b4109ab63c8f601e5678 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit 4bd4d32e9a38 EDAC/i10nm: Add detection of memory levels for ICX/SPR servers. Backport to add EDAC 2LM support. -------------------------------- Current i10nm_edac driver is only for system configured in 1-level memory. If the system is configured in 2-level memory, the driver doesn't report the 1st level memory DIMM for the error address, even if the error occurs in the 1st level memory. Both Ice Lake servers and Sapphire Rapids servers can be configured in 2-level memory. 
Add detection of memory levels to i10nm_edac for the two kinds of servers so that the driver can report the 2nd level memory DIMM or the 1st level memory DIMM according to error source. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-3-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/edac/skx_common.h | 3 +++ 2 files changed, 42 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7798de1e2aed..4e2a4d8396da 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -24,6 +24,8 @@ pci_read_config_dword((d)->uracu, 0xd0, &(reg)) #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) +#define I10NM_GET_SAD(d, offset, i, reg)\ + pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ @@ -50,6 +52,10 @@ #define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) #define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) +#define I10NM_MAX_SAD 16 +#define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) +#define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) + static struct list_head *i10nm_edac_list; static struct res_config *res_cfg; @@ -186,6 +192,31 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, return pdev; } +static bool i10nm_check_2lm(struct res_config *cfg) +{ + struct skx_dev *d; + u32 reg; + int i; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1], + PCI_SLOT(cfg->sad_all_devfn), + PCI_FUNC(cfg->sad_all_devfn)); + if (!d->sad_all) + continue; + + for (i = 0; i < I10NM_MAX_SAD; i++) { + I10NM_GET_SAD(d, cfg->sad_all_offset, i, reg); + if (I10NM_SAD_ENABLE(reg) && I10NM_SAD_NM_CACHEABLE(reg)) { + edac_dbg(2, "2-level memory configuration.\n"); + return true; + } + } + } + + return false; +} + static int i10nm_get_all_munits(void) { struct pci_dev *mdev; @@ -255,6 +286,8 @@ static struct res_config i10nm_cfg0 = { .decs_did = 0x3452, .busno_cfg_offset = 0xcc, .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, }; @@ -264,6 +297,8 @@ static struct res_config i10nm_cfg1 = { .decs_did = 0x3452, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, }; @@ -274,6 +309,8 @@ static struct res_config spr_cfg = { .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, .support_ddr5 = true, + .sad_all_devfn = PCI_DEVFN(10, 0), + .sad_all_offset = 0x300, .offsets_scrub = offsets_scrub_spr, .offsets_demand = offsets_demand_spr, }; @@ -429,6 +466,8 @@ static int __init i10nm_init(void) return -ENODEV; } + skx_set_mem_cfg(i10nm_check_2lm(cfg)); + rc = i10nm_get_all_munits(); if (rc < 0) goto fail; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index de9f82977f0e..1fb7540a7092 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -135,6 +135,9 @@ struct res_config { /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; bool support_ddr5; + /* SAD device number and function number */ + unsigned int sad_all_devfn; + int sad_all_offset; /* Offsets of 
retry_rd_err_log registers */ u32 *offsets_scrub; u32 *offsets_demand; -- Gitee From 45875988308087e7fae3b15f217ae376765b126b Mon Sep 17 00:00:00 2001 From: root Date: Wed, 13 Jul 2022 15:28:12 +0800 Subject: [PATCH 013/132] EDAC/i10nm: Add support for high bandwidth memory mainline inclusion from mainline-v5.14-rc1 commit c945088384d00e6eb61535cc4ba25bc062090909 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit c945088384d0 EDAC/i10nm: Add support for high bandwidth memory. Backport to add EDAC HBM support. -------------------------------- A future Xeon processor will include in-package HBM (high bandwidth memory). The in-package HBM memory controller shares the same architecture with the regular DDR memory controller. Add the HBM memory controller devices for EDAC support. Tested-by: Hongyu Ning Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210611170123.1057025-4-tony.luck@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 132 ++++++++++++++++++++++++++++++++++---- drivers/edac/skx_common.c | 15 +++-- drivers/edac/skx_common.h | 20 +++++- 3 files changed, 148 insertions(+), 19 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 4e2a4d8396da..47d6d7282c58 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.4" +#define I10NM_REVISION "v0.0.5" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -26,14 +26,22 @@ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) #define I10NM_GET_SAD(d, offset, i, reg)\ pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) +#define I10NM_GET_HBM_IMC_BAR(d, reg) \ + pci_read_config_dword((d)->uracu, 0xd4, &(reg)) +#define I10NM_GET_CAPID3_CFG(d, reg) \ + pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4) + readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ - readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz) + readl((m)->mbase + ((m)->hbm_mc ? 
0x814 : 0x20814) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_REG32(m, i, offset) \ readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_REG64(m, i, offset) \ @@ -45,6 +53,12 @@ #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg) \ + ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) + +#define I10NM_HBM_IMC_MMIO_SIZE 0x9000 +#define I10NM_IS_HBM_PRESENT(reg) GET_BITFIELD(reg, 27, 30) +#define I10NM_IS_HBM_IMC(reg) GET_BITFIELD(reg, 29, 29) #define RETRY_RD_ERR_LOG_UC BIT(1) #define RETRY_RD_ERR_LOG_NOOVER BIT(14) @@ -217,7 +231,7 @@ static bool i10nm_check_2lm(struct res_config *cfg) return false; } -static int i10nm_get_all_munits(void) +static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; void __iomem *mbase; @@ -245,7 +259,7 @@ static int i10nm_get_all_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < I10NM_NUM_IMC; i++) { + for (i = 0; i < I10NM_NUM_DDR_IMC; i++) { mdev = pci_get_dev_wrapper(d->seg, d->bus[0], 12 + i, 0); if (i == 0 && !mdev) { @@ -281,6 +295,90 @@ static int i10nm_get_all_munits(void) return 0; } +static bool i10nm_check_hbm_imc(struct skx_dev *d) +{ + u32 reg; + + if (I10NM_GET_CAPID3_CFG(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get capid3_cfg\n"); + return false; + } + + return I10NM_IS_HBM_PRESENT(reg) != 0; +} + +static int i10nm_get_hbm_munits(void) +{ + struct pci_dev *mdev; + void __iomem *mbase; + u32 reg, off, mcmtr; + struct skx_dev *d; + int i, lmc; + u64 base; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3); + if (!d->pcu_cr3) + return -ENODEV; + + if (!i10nm_check_hbm_imc(d)) { + i10nm_printk(KERN_DEBUG, "No hbm memory\n"); + return -ENODEV; + } + + if (I10NM_GET_SCK_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get socket bar\n"); + return -ENODEV; + } + base = I10NM_GET_SCK_MMIO_BASE(reg); + + if (I10NM_GET_HBM_IMC_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get hbm mc bar\n"); + return -ENODEV; + } + base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg); + + lmc = I10NM_NUM_DDR_IMC; + + for (i = 0; i < I10NM_NUM_HBM_IMC; i++) { + mdev = pci_get_dev_wrapper(d->seg, d->bus[0], + 12 + i / 4, 1 + i % 4); + if (i == 0 && !mdev) { + i10nm_printk(KERN_ERR, "No hbm mc found\n"); + return -ENODEV; + } + if (!mdev) + continue; + + d->imc[lmc].mdev = mdev; + off = i * I10NM_HBM_IMC_MMIO_SIZE; + + edac_dbg(2, "hbm mc%d mmio base 0x%llx size 0x%x\n", + lmc, base + off, I10NM_HBM_IMC_MMIO_SIZE); + + mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); + if (!mbase) { + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", + base + off); + return -ENOMEM; + } + + d->imc[lmc].mbase = mbase; + d->imc[lmc].hbm_mc = true; + + mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); + if (!I10NM_IS_HBM_IMC(mcmtr)) { + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); + return -ENODEV; + } + + lmc++; + } + } + + return 0; +} + static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, @@ -308,6 +406,7 @@ static struct res_config spr_cfg = { .decs_did = 0x3252, .busno_cfg_offset = 0xd0, .ddr_chan_mmio_sz = 0x8000, + .hbm_chan_mmio_sz = 0x4000, .support_ddr5 = true, .sad_all_devfn = PCI_DEVFN(10, 0), .sad_all_offset = 0x300, @@ -345,14 +444,14 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, struct dimm_info *dimm; int i, j, ndimms; 
- for (i = 0; i < I10NM_NUM_CHANNELS; i++) { + for (i = 0; i < imc->num_channels; i++) { if (!imc->mbase) continue; ndimms = 0; mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); amap = I10NM_GET_AMAP(imc, i); - for (j = 0; j < I10NM_NUM_DIMMS; j++) { + for (j = 0; j < imc->num_dimms; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n", @@ -468,8 +567,9 @@ static int __init i10nm_init(void) skx_set_mem_cfg(i10nm_check_2lm(cfg)); - rc = i10nm_get_all_munits(); - if (rc < 0) + rc = i10nm_get_ddr_munits(); + + if (i10nm_get_hbm_munits() && rc) goto fail; list_for_each_entry(d, i10nm_edac_list, list) { @@ -490,7 +590,15 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; - d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + if (d->imc[i].hbm_mc) { + d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_HBM_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_HBM_DIMMS; + } else { + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_DDR_DIMMS; + } rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 3d0b391a400d..0c133d32b777 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -343,9 +343,9 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, ranks = numrank(mtr); rows = numrow(mtr); - cols = numcol(mtr); + cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && (amap & 0x8)) { + if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { banks = 32; mtype = MEM_DDR5; } else { @@ -374,8 +374,13 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->dtype = get_width(mtr); dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ - snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", - imc->src_id, imc->lmc, chan, dimmno); + + if (imc->hbm_mc) + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u", + imc->src_id, imc->lmc, chan); + else + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", + imc->src_id, imc->lmc, chan, dimmno); return 1; } @@ -705,6 +710,8 @@ void skx_remove(void) } if (d->util_all) pci_dev_put(d->util_all); + if (d->pcu_cr3) + pci_dev_put(d->pcu_cr3); if (d->sad_all) pci_dev_put(d->sad_all); if (d->uracu) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 1fb7540a7092..03ac067a80b9 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -32,9 +32,17 @@ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_IMC 4 -#define I10NM_NUM_CHANNELS 2 -#define I10NM_NUM_DIMMS 2 +#define I10NM_NUM_DDR_IMC 4 +#define I10NM_NUM_DDR_CHANNELS 2 +#define I10NM_NUM_DDR_DIMMS 2 + +#define I10NM_NUM_HBM_IMC 16 +#define I10NM_NUM_HBM_CHANNELS 2 +#define I10NM_NUM_HBM_DIMMS 1 + +#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) +#define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) +#define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) #define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) @@ -56,12 +64,16 @@ struct skx_dev { struct pci_dev *sad_all; struct pci_dev *util_all; struct pci_dev *uracu; /* for i10nm CPU */ + struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ int chan_mmio_sz; /* for i10nm CPU */ + int num_channels; /* channels per memory controller */ + int num_dimms; /* dimms per channel */ + bool hbm_mc; u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; @@ -134,6 +146,8 @@ struct res_config { int busno_cfg_offset; /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; + /* Per HBM channel memory-mapped I/O size */ + int hbm_chan_mmio_sz; bool support_ddr5; /* SAD device number and function number */ unsigned int sad_all_devfn; -- Gitee From b0adb31550986f669e45c7f42f13876451a91d0e Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Wed, 30 Jun 2021 20:58:24 +0530 Subject: [PATCH 014/132] EDAC/mc: Add new HBM2 memory type mainline inclusion from mainline-v5.13 commit e1ca90b7cc5cb5d3a38321cbb65ad36a59fcb574 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit e1ca90b7cc5c EDAC/mc: Add new HBM2 memory type. Backport to add EDAC HBM support. -------------------------------- Add a new entry to 'enum mem_type' and a new string to 'edac_mem_types[]' for HBM2 (High Bandwidth Memory Gen 2) new memory type. Reviewed-by: Yazen Ghannam Signed-off-by: Muralidhara M K Signed-off-by: Naveen Krishna Chatradhi Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210630152828.162659-4-nchatrad@amd.com Signed-off-by: Youquan Song --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 34514c638f19..bf4297075c22 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -163,6 +163,7 @@ const char * const edac_mem_types[] = { [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", + [MEM_HBM2] = "High-bandwidth-memory-Gen2", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index 6c4565cc6273..490134495008 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -181,6 +181,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_LRDDR4: Load-Reduced DDR4 memory. * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM + * @MEM_HBM2: High bandwidth Memory Gen 2. 
*/ enum mem_type { MEM_EMPTY = 0, @@ -206,6 +207,7 @@ enum mem_type { MEM_LRDDR4, MEM_DDR5, MEM_NVDIMM, + MEM_HBM2, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -230,6 +232,7 @@ enum mem_type { #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) #define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_HBM2 BIT(MEM_HBM2) /** * enum edac-type - Error Detection and Correction capabilities and mode -- Gitee From 4a111addf4c90fc79db41b634b5b5b8b788d0f09 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 20 Jul 2021 09:30:09 -0700 Subject: [PATCH 015/132] EDAC/skx_common: Set the memory type correctly for HBM memory mainline inclusion from mainline-v5.15-rc1 commit fd07a4a0d30b5468a1f4a0739e34f5f014df7d44 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit fd07a4a0d30b EDAC/skx_common: Set the memory type correctly for HBM memory. Backport to add EDAC HBM support. -------------------------------- Set the memory type to MEM_HBM2 if it's managed by the HBM2 memory controller. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20210720163009.GA1417532@agluck-desk2.amr.corp.intel.com Signed-off-by: Youquan Song --- drivers/edac/skx_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 0c133d32b777..19c17c5198c5 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -345,7 +345,10 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, rows = numrow(mtr); cols = imc->hbm_mc ? 6 : numcol(mtr); - if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) { + if (imc->hbm_mc) { + banks = 32; + mtype = MEM_HBM2; + } else if (cfg->support_ddr5 && (amap & 0x8)) { banks = 32; mtype = MEM_DDR5; } else { -- Gitee From 028e1264e604bafb90f67c5533b2ad86b0cc8ea1 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 24 Dec 2021 04:11:26 -0500 Subject: [PATCH 016/132] EDAC/i10nm: Release mdev/mbase when failing to detect HBM mainline inclusion from mainline-v5.16 commit c370baa328022cbd46c59c821d1b467a97f047be category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I5HAC1 CVE: NA Intel-SIG: commit c370baa32802 EDAC/i10nm: Release mdev/mbase when failing to detect HBM. Backport to add EDAC HBM support. -------------------------------- On systems without HBM (High Bandwidth Memory) mdev/mbase are not released/unmapped. Add the code to release mdev/mbase when failing to detect HBM. 
[Tony: re-word commit message] Cc: Fixes: c945088384d0 ("EDAC/i10nm: Add support for high bandwidth memory") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20211224091126.1246-1-qiuxu.zhuo@intel.com Signed-off-by: Youquan Song --- drivers/edac/i10nm_base.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 47d6d7282c58..d63ddc9c994d 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -358,6 +358,9 @@ static int i10nm_get_hbm_munits(void) mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); if (!mbase) { + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", base + off); return -ENOMEM; @@ -368,6 +371,12 @@ static int i10nm_get_hbm_munits(void) mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); if (!I10NM_IS_HBM_IMC(mcmtr)) { + iounmap(d->imc[lmc].mbase); + d->imc[lmc].mbase = NULL; + d->imc[lmc].hbm_mc = false; + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); return -ENODEV; } -- Gitee From 910f3dd810c9bab8332d53f2c32c55f5fe65a64c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 26 Jul 2022 17:38:14 +0800 Subject: [PATCH 017/132] etherdevice: Adjust ether_addr* prototypes to silence -Wstringop-overead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 08ad7a770efacfecf903143f8de88d1e351a1f2d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=08ad7a770efacfecf903143f8de88d1e351a1f2d -------------------------------- commit 2618a0dae09ef37728dab89ff60418cbe25ae6bd upstream. With GCC 12, -Wstringop-overread was warning about an implicit cast from char[6] to char[8]. However, the extra 2 bytes are always thrown away, alignment doesn't matter, and the risk of hitting the edge of unallocated memory has been accepted, so this prototype can just be converted to a regular char *. Silences: net/core/dev.c: In function ‘bpf_prog_run_generic_xdp’: net/core/dev.c:4618:21: warning: ‘ether_addr_equal_64bits’ reading 8 bytes from a region of size 6 [-Wstringop-overread] 4618 | orig_host = ether_addr_equal_64bits(eth->h_dest, > skb->dev->dev_addr); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/dev.c:4618:21: note: referencing argument 1 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} net/core/dev.c:4618:21: note: referencing argument 2 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} In file included from net/core/dev.c:91: include/linux/etherdevice.h:375:20: note: in a call to function ‘ether_addr_equal_64bits’ 375 | static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], | ^~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Marc Kleine-Budde Tested-by: Marc Kleine-Budde Link: https://lore.kernel.org/netdev/20220212090811.uuzk6d76agw2vv73@pengutronix.de Cc: Jakub Kicinski Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. 
Miller Cc: Khem Raj Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- include/linux/etherdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2e5debc0373c..99209f50915f 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -127,7 +127,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr) #endif } -static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN @@ -352,8 +352,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ -static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], - const u8 addr2[6+2]) +static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); -- Gitee From 0ccd441a7641338e1220f8a244da9159b653ff5b Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Tue, 26 Jul 2022 17:38:15 +0800 Subject: [PATCH 018/132] mm: page_alloc: fix building error on -Werror=array-compare stable inclusion from stable-v5.10.113 commit 69848f9488bc1088bcd9a73987dff8d2cb47a060 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=69848f9488bc1088bcd9a73987dff8d2cb47a060 -------------------------------- commit ca831f29f8f25c97182e726429b38c0802200c8f upstream. Arthur Marsh reported we would hit the error below when building kernel with gcc-12: CC mm/page_alloc.o mm/page_alloc.c: In function `mem_init_print_info': mm/page_alloc.c:8173:27: error: comparison between two arrays [-Werror=array-compare] 8173 | if (start <= pos && pos < end && size > adj) \ | In C++20, the comparision between arrays should be warned. 
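As a standalone illustration of the warning (user-space code with invented symbol names, not the adj_init_size() macro itself), GCC 12 objects when both operands of a relational comparison are arrays, while comparing the addresses of their first elements decays to ordinary pointers and compiles cleanly:

#include <stdio.h>

/* Stand-ins for the section-boundary symbols fed to adj_init_size(). */
char start_sym[16], pos_sym[16], end_sym[16];

static int in_range_old(void)
{
	/* GCC 12: "comparison between two arrays [-Warray-compare]" */
	return start_sym <= pos_sym && pos_sym < end_sym;
}

static int in_range_new(void)
{
	/* Element addresses are plain pointers, so no warning is emitted. */
	return &start_sym[0] <= &pos_sym[0] && &pos_sym[0] < &end_sym[0];
}

int main(void)
{
	printf("%d %d\n", in_range_old(), in_range_new());
	return 0;
}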
Link: https://lkml.kernel.org/r/20211125130928.32465-1-sxwjean@me.com Signed-off-by: Xiongwei Song Reported-by: Arthur Marsh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Khem Raj Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec73cca1726c..cf9c69d631f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7974,7 +7974,7 @@ void __init mem_init_print_info(const char *str) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) -- Gitee From d0005c0a490428fb4f12a085d77c1bb34bf1dbf5 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 26 Jul 2022 17:38:16 +0800 Subject: [PATCH 019/132] tracing: Dump stacktrace trigger to the corresponding instance mainline inclusion from mainline-v5.17-rc6 commit ce33c845b030c9cf768370c951bc699470b09fa7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ce33c845b030c9cf768370c951bc699470b09fa7 -------------------------------- The stacktrace event trigger is not dumping the stacktrace to the instance where it was enabled, but to the global "instance." Use the private_data, pointing to the trigger file, to figure out the corresponding trace instance, and use it in the trigger action, like snapshot_trigger does. Link: https://lkml.kernel.org/r/afbb0b4f18ba92c276865bc97204d438473f4ebc.1645396236.git.bristot@kernel.org Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) [zzk: __trace_stack() was changed due to the merge of "36590c50b2d0 tracing: Merge irqflags + preempt counter", cherry-pick from mainline ce33c845b030c9 instead of stable 5.10.y] Signed-off-by: Zheng Zengkai Reviewed-by: Yang Jihong Acked-by: Xie XiuQi --- kernel/trace/trace_events_trigger.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d0309de2f84f..3c6229f16e81 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1219,7 +1219,12 @@ static void stacktrace_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *event) { - trace_dump_stack(STACK_SKIP); + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + trace_dump_stack(STACK_SKIP); } static void -- Gitee From 32af01826a81a3f51b3e13d211351569f762f6c2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 26 Jul 2022 17:38:17 +0800 Subject: [PATCH 020/132] perf tools: Fix segfault accessing sample_id xyarray stable inclusion from stable-v5.10.113 commit 378061c9b886994fa045186390d61a5e7c696ae3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=378061c9b886994fa045186390d61a5e7c696ae3 -------------------------------- commit a668cc07f990d2ed19424d5c1a529521a9d1cee1 upstream. 
perf_evsel::sample_id is an xyarray which can cause a segfault when accessed beyond its size. e.g. # perf record -e intel_pt// -C 1 sleep 1 Segmentation fault (core dumped) # That is happening because a dummy event is opened to capture text poke events accross all CPUs, however the mmap logic is allocating according to the number of user_requested_cpus. In general, perf sometimes uses the evsel cpus to open events, and sometimes the evlist user_requested_cpus. However, it is not necessary to determine which case is which because the opened event file descriptors are also in an xyarray, the size of whch can be used to correctly allocate the size of the sample_id xyarray, because there is one ID per file descriptor. Note, in the affected code path, perf_evsel fd array is subsequently used to get the file descriptor for the mmap, so it makes sense for the xyarrays to be the same size there. Fixes: d1a177595b3a824c ("libperf: Adopt perf_evlist__mmap()/munmap() from tools/perf") Fixes: 246eba8e9041c477 ("perf tools: Add support for PERF_RECORD_TEXT_POKE") Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org # 5.5+ Link: https://lore.kernel.org/r/20220413114232.26914-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- tools/lib/perf/evlist.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 17465d454a0e..f76b1a9d5a6e 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -571,7 +571,6 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, { struct perf_evsel *evsel; const struct perf_cpu_map *cpus = evlist->cpus; - const struct perf_thread_map *threads = evlist->threads; if (!ops || !ops->get || !ops->mmap) return -EINVAL; @@ -583,7 +582,7 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, perf_evlist__for_each_entry(evlist, evsel) { if ((evsel->attr.read_format & PERF_FORMAT_ID) && evsel->sample_id == NULL && - perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0) + perf_evsel__alloc_id(evsel, evsel->fd->max_x, evsel->fd->max_y) < 0) return -ENOMEM; } -- Gitee From 9f91d21694fddc986c0beb30b2680d69457e66ec Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 26 Jul 2022 17:38:18 +0800 Subject: [PATCH 021/132] gfs2: assign rgrp glock before compute_bitstructs stable inclusion from stable-v5.10.113 commit 04dd45d9776eb4802df8b0a0a6fcdf27842fbdd1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=04dd45d9776eb4802df8b0a0a6fcdf27842fbdd1 -------------------------------- commit 428f651cb80b227af47fc302e4931791f2fb4741 upstream. Before this patch, function read_rindex_entry called compute_bitstructs before it allocated a glock for the rgrp. But if compute_bitstructs found a problem with the rgrp, it called gfs2_consist_rgrpd, and that called gfs2_dump_glock for rgd->rd_gl which had not yet been assigned. read_rindex_entry compute_bitstructs gfs2_consist_rgrpd gfs2_dump_glock <---------rgd->rd_gl was not set. This patch changes read_rindex_entry so it assigns an rgrp glock before calling compute_bitstructs so gfs2_dump_glock does not reference an unassigned pointer. If an error is discovered, the glock must also be put, so a new goto and label were added. 
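In outline, the corrected ordering looks like the hedged sketch below (a simplified shape with an illustrative signature, not the real read_rindex_entry() prototype): the glock that the error-reporting path dumps must exist before any check that can call gfs2_consist_rgrpd(), and the new fail_glock label drops it again on failure.

/* Simplified shape only; the real code is in fs/gfs2/rgrp.c. */
static int read_rindex_entry_shape(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
{
	int error;

	error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops,
			       CREATE, &rgd->rd_gl);
	if (error)
		goto fail;		/* no glock yet, nothing to undo */

	error = compute_bitstructs(rgd);	/* may call gfs2_consist_rgrpd() */
	if (error)
		goto fail_glock;	/* rd_gl now exists and must be put */

	return 0;

fail_glock:
	gfs2_glock_put(rgd->rd_gl);
fail:
	return error;
}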
Reported-by: syzbot+c6fd14145e2f62ca0784@syzkaller.appspotmail.com Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/gfs2/rgrp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index dc55b029afaa..c5bde789a16d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -906,15 +906,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -928,6 +928,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: -- Gitee From 302f6418e23bbcecc28eafb2b9b50b3fd5de7806 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 26 Jul 2022 17:38:19 +0800 Subject: [PATCH 022/132] ALSA: usb-audio: Clear MIDI port active flag after draining stable inclusion from stable-v5.10.113 commit 8ce3820fc9d496100fb39008a952eebfb695b060 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8ce3820fc9d496100fb39008a952eebfb695b060 -------------------------------- commit 0665886ad1392e6b5bae85d7a6ccbed48dca1522 upstream. When a rawmidi output stream is closed, it calls the drain at first, then does trigger-off only when the drain returns -ERESTARTSYS as a fallback. It implies that each driver should turn off the stream properly after the drain. Meanwhile, USB-audio MIDI interface didn't change the port->active flag after the drain. This may leave the output work picking up the port that is closed right now, which eventually leads to a use-after-free for the already released rawmidi object. This patch fixes the bug by properly clearing the port->active flag after the output drain. 
Reported-by: syzbot+70e777a39907d6d5fd0a@syzkaller.appspotmail.com Cc: Link: https://lore.kernel.org/r/00000000000011555605dceaff03@google.com Link: https://lore.kernel.org/r/20220420130247.22062-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/usb/midi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index fa91290ad89d..84676a8fb60d 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1210,6 +1210,7 @@ static void snd_usbmidi_output_drain(struct snd_rawmidi_substream *substream) } while (drain_urbs && timeout); finish_wait(&ep->drain_wait, &wait); } + port->active = 0; spin_unlock_irq(&ep->buffer_lock); } -- Gitee From 6942c4a9bb82dd73e524481428c69ffad9c9a445 Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Tue, 26 Jul 2022 17:38:20 +0800 Subject: [PATCH 023/132] ALSA: hda/realtek: Add quirk for Clevo NP70PNP stable inclusion from stable-v5.10.113 commit cf9b19546494b2e5df87486752946ae71c6775db category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf9b19546494b2e5df87486752946ae71c6775db -------------------------------- commit 86222af07abf1f5f07a5873cc399c29ab8a9b8b8 upstream. Fixes headset detection on Clevo NP70PNP. Signed-off-by: Tim Crawford Cc: Link: https://lore.kernel.org/r/20220421170412.3697-1-tcrawford@system76.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 11d653190e6e..b5168959fcf6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8897,6 +8897,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x8562, "Clevo NH[5|7][0-9]RZ[Q]", ALC269_FIXUP_DMIC), SND_PCI_QUIRK(0x1558, 0x8668, "Clevo NP50B[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x866d, "Clevo NP5[05]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x867c, "Clevo NP7[01]PNP", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x867d, "Clevo NP7[01]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8680, "Clevo NJ50LU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8686, "Clevo NH50[CZ]U", ALC256_FIXUP_MIC_NO_PRESENCE_AND_RESUME), -- Gitee From 83ab10a7a9ff529b76e8517e74380568f7b81797 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 26 Jul 2022 17:38:21 +0800 Subject: [PATCH 024/132] ASoC: atmel: Remove system clock tree configuration for at91sam9g20ek stable inclusion from stable-v5.10.113 commit 608fc58858bfa7552a9824c2f0e4a3ab8dd4efaa category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=608fc58858bfa7552a9824c2f0e4a3ab8dd4efaa -------------------------------- [ Upstream commit c775cbf62ed4911e4f0f23880f01815753123690 ] The MCLK of the WM8731 on the AT91SAM9G20-EK board is connected to the PCK0 output of the SoC, intended in the reference software to be supplied using PLLB and programmed to 12MHz. 
As originally written for use with a board file the audio driver was responsible for configuring the entire tree but in the conversion to the common clock framework the registration of the named pck0 and pllb clocks was removed so the driver has failed to instantiate ever since. Since the WM8731 driver has had support for managing a MCLK provided via the common clock framework for some time we can simply drop all the clock management code from the machine driver other than configuration of the sysclk rate, the CODEC driver still respects that configuration from the machine driver. Fixes: ff78a189b0ae55f ("ARM: at91: remove old at91-specific clock driver") Signed-off-by: Mark Brown Reviewed-by: Codrin Ciubotariu Link: https://lore.kernel.org/r/20220325154241.1600757-2-broonie@kernel.org Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/atmel/sam9g20_wm8731.c | 61 -------------------------------- 1 file changed, 61 deletions(-) diff --git a/sound/soc/atmel/sam9g20_wm8731.c b/sound/soc/atmel/sam9g20_wm8731.c index 8a55d59a6c2a..d243de5f23dc 100644 --- a/sound/soc/atmel/sam9g20_wm8731.c +++ b/sound/soc/atmel/sam9g20_wm8731.c @@ -46,35 +46,6 @@ */ #undef ENABLE_MIC_INPUT -static struct clk *mclk; - -static int at91sam9g20ek_set_bias_level(struct snd_soc_card *card, - struct snd_soc_dapm_context *dapm, - enum snd_soc_bias_level level) -{ - static int mclk_on; - int ret = 0; - - switch (level) { - case SND_SOC_BIAS_ON: - case SND_SOC_BIAS_PREPARE: - if (!mclk_on) - ret = clk_enable(mclk); - if (ret == 0) - mclk_on = 1; - break; - - case SND_SOC_BIAS_OFF: - case SND_SOC_BIAS_STANDBY: - if (mclk_on) - clk_disable(mclk); - mclk_on = 0; - break; - } - - return ret; -} - static const struct snd_soc_dapm_widget at91sam9g20ek_dapm_widgets[] = { SND_SOC_DAPM_MIC("Int Mic", NULL), SND_SOC_DAPM_SPK("Ext Spk", NULL), @@ -135,7 +106,6 @@ static struct snd_soc_card snd_soc_at91sam9g20ek = { .owner = THIS_MODULE, .dai_link = &at91sam9g20ek_dai, .num_links = 1, - .set_bias_level = at91sam9g20ek_set_bias_level, .dapm_widgets = at91sam9g20ek_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(at91sam9g20ek_dapm_widgets), @@ -148,7 +118,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct device_node *codec_np, *cpu_np; - struct clk *pllb; struct snd_soc_card *card = &snd_soc_at91sam9g20ek; int ret; @@ -162,31 +131,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return -EINVAL; } - /* - * Codec MCLK is supplied by PCK0 - set it up. 
- */ - mclk = clk_get(NULL, "pck0"); - if (IS_ERR(mclk)) { - dev_err(&pdev->dev, "Failed to get MCLK\n"); - ret = PTR_ERR(mclk); - goto err; - } - - pllb = clk_get(NULL, "pllb"); - if (IS_ERR(pllb)) { - dev_err(&pdev->dev, "Failed to get PLLB\n"); - ret = PTR_ERR(pllb); - goto err_mclk; - } - ret = clk_set_parent(mclk, pllb); - clk_put(pllb); - if (ret != 0) { - dev_err(&pdev->dev, "Failed to set MCLK parent\n"); - goto err_mclk; - } - - clk_set_rate(mclk, MCLK_RATE); - card->dev = &pdev->dev; /* Parse device node info */ @@ -230,9 +174,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return ret; -err_mclk: - clk_put(mclk); - mclk = NULL; err: atmel_ssc_put_audio(0); return ret; @@ -242,8 +183,6 @@ static int at91sam9g20ek_audio_remove(struct platform_device *pdev) { struct snd_soc_card *card = platform_get_drvdata(pdev); - clk_disable(mclk); - mclk = NULL; snd_soc_unregister_card(card); atmel_ssc_put_audio(0); -- Gitee From f7ec810a47d7f46e7b1739befe01d4850fd7ca31 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:38:22 +0800 Subject: [PATCH 025/132] ASoC: msm8916-wcd-digital: Check failure for devm_snd_soc_register_component stable inclusion from stable-v5.10.113 commit b6f474cd3097b9570d5c2c30075ca3506fbbcf33 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b6f474cd3097b9570d5c2c30075ca3506fbbcf33 -------------------------------- [ Upstream commit e927b05f3cc20de87f6b7d912a5bbe556931caca ] devm_snd_soc_register_component() may fails, we should check the error and do the corresponding error handling. Fixes: 150db8c5afa1 ("ASoC: codecs: Add msm8916-wcd digital codec") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220403115239.30140-1-linmq006@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/codecs/msm8916-wcd-digital.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c index 9ad7fc0baf07..20a07c92b2fc 100644 --- a/sound/soc/codecs/msm8916-wcd-digital.c +++ b/sound/soc/codecs/msm8916-wcd-digital.c @@ -1206,9 +1206,16 @@ static int msm8916_wcd_digital_probe(struct platform_device *pdev) dev_set_drvdata(dev, priv); - return devm_snd_soc_register_component(dev, &msm8916_wcd_digital, + ret = devm_snd_soc_register_component(dev, &msm8916_wcd_digital, msm8916_wcd_digital_dai, ARRAY_SIZE(msm8916_wcd_digital_dai)); + if (ret) + goto err_mclk; + + return 0; + +err_mclk: + clk_disable_unprepare(priv->mclk); err_clk: clk_disable_unprepare(priv->ahbclk); return ret; -- Gitee From ecf9eaf316dc293ccd09a729f924840b09fe60db Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 26 Jul 2022 17:38:23 +0800 Subject: [PATCH 026/132] ASoC: codecs: wcd934x: do not switch off SIDO Buck when codec is in use stable inclusion from stable-v5.10.113 commit 12aa8021c7a72811cd8096b1e456cd5d265896d9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=12aa8021c7a72811cd8096b1e456cd5d265896d9 -------------------------------- [ Upstream commit db6dd1bee63d1d88fbddfe07af800af5948ac28e ] SIDO(Single-Inductor Dual-Ouput) Buck powers up both analog and digital circuits along with internal memory, powering off this is the last thing that codec should do when going to 
very low power. Current code was powering off this Buck if there are no users of sysclk, which is not correct. Powering off this buck will result in no register access. This code path was never tested until recently after adding pm support in SoundWire controller. Fix this by removing the buck poweroff when the codec is active and also the code that is not used. Without this patch all the read/write transactions will never complete and results in SLIMBus Errors like: qcom,slim-ngd qcom,slim-ngd.1: Tx:MT:0x0, MC:0x60, LA:0xcf failed:-110 wcd934x-codec wcd934x-codec.1.auto: ASoC: error at soc_component_read_no_lock on wcd934x-codec.1.auto for register: [0x00000d05] -110 qcom,slim-ngd-ctrl 171c0000.slim: Error Interrupt received 0x82000000 Reported-by: Amit Pundir Fixes: a61f3b4f476e ("ASoC: wcd934x: add support to wcd9340/wcd9341 codec") Signed-off-by: Srinivas Kandagatla Tested-by: Amit Pundir Link: https://lore.kernel.org/r/20220407094313.2880-1-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/codecs/wcd934x.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 8540ac230d0e..fd704df9b175 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -1188,29 +1188,7 @@ static int wcd934x_set_sido_input_src(struct wcd934x_codec *wcd, int sido_src) if (sido_src == wcd->sido_input_src) return 0; - if (sido_src == SIDO_SOURCE_INTERNAL) { - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_PRE_ENX_MASK, 0x0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, - WCD934X_ANA_RCO_BG_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN1_MASK, - WCD934X_ANA_BUCK_PRE_EN1_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN2_MASK, - WCD934X_ANA_BUCK_PRE_EN2_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, - WCD934X_ANA_BUCK_HI_ACCU_ENABLE); - usleep_range(100, 110); - } else if (sido_src == SIDO_SOURCE_RCO_BG) { + if (sido_src == SIDO_SOURCE_RCO_BG) { regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, WCD934X_ANA_RCO_BG_EN_MASK, WCD934X_ANA_RCO_BG_ENABLE); @@ -1296,8 +1274,6 @@ static int wcd934x_disable_ana_bias_and_syclk(struct wcd934x_codec *wcd) regmap_update_bits(wcd->regmap, WCD934X_CLK_SYS_MCLK_PRG, WCD934X_EXT_CLK_BUF_EN_MASK | WCD934X_MCLK_EN_MASK, 0x0); - wcd934x_set_sido_input_src(wcd, SIDO_SOURCE_INTERNAL); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, WCD934X_ANA_BIAS_EN_MASK, 0); regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, -- Gitee From 4be6d449af6b349bdeeb388d9a40f814d8120112 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:38:24 +0800 Subject: [PATCH 027/132] dmaengine: imx-sdma: Fix error checking in sdma_event_remap stable inclusion from stable-v5.10.113 commit 9bc949a181ba805b76b81d1ae1519af093328b81 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9bc949a181ba805b76b81d1ae1519af093328b81 -------------------------------- [ Upstream commit 
7104b9cb35a33ad803a1adbbfa50569b008faf15 ] of_parse_phandle() returns NULL on errors, rather than error pointers. Using NULL check on grp_np to fix this. Fixes: d078cd1b4185 ("dmaengine: imx-sdma: Add imx6sx platform support") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220308064952.15743-1-linmq006@gmail.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/imx-sdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 306f93e4b26a..792c91cd1608 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1789,7 +1789,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) u32 reg, val, shift, num_map, i; int ret = 0; - if (IS_ERR(np) || IS_ERR(gpr_np)) + if (IS_ERR(np) || !gpr_np) goto out; event_remap = of_find_property(np, propname, NULL); @@ -1837,7 +1837,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) } out: - if (!IS_ERR(gpr_np)) + if (gpr_np) of_node_put(gpr_np); return ret; -- Gitee From dfc6a02d0e0f49da209a4e3fce9e1e52775ae7a8 Mon Sep 17 00:00:00 2001 From: zhangqilong Date: Tue, 26 Jul 2022 17:38:25 +0800 Subject: [PATCH 028/132] dmaengine: mediatek:Fix PM usage reference leak of mtk_uart_apdma_alloc_chan_resources stable inclusion from stable-v5.10.113 commit f714abf28f819849d2fd93a4a8db15cff3f8798e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f714abf28f819849d2fd93a4a8db15cff3f8798e -------------------------------- [ Upstream commit 545b2baac89b859180e51215468c05d85ea8465a ] pm_runtime_get_sync will increment pm usage counter even it failed. Forgetting to putting operation will result in reference leak here. We fix it: 1) Replacing it with pm_runtime_resume_and_get to keep usage counter balanced. 2) Add putting operation before returning error. 
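The counter rule being applied, shown as a hedged standalone sketch rather than the driver's own code (example_hw_init() is an invented placeholder): pm_runtime_get_sync() raises the usage counter even when the resume fails, so every error return after it needs a matching put, whereas pm_runtime_resume_and_get() leaves the counter raised only on success.

#include <linux/device.h>
#include <linux/pm_runtime.h>

static int example_hw_init(struct device *dev)
{
	return 0;	/* hypothetical hardware setup step */
}

static int example_alloc_resources(struct device *dev)
{
	int ret;

	ret = pm_runtime_resume_and_get(dev);
	if (ret < 0)
		return ret;	/* counter already dropped by the helper */

	ret = example_hw_init(dev);
	if (ret)
		goto err_pm;

	return 0;

err_pm:
	pm_runtime_put_noidle(dev);	/* balance the successful get above */
	return ret;
}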
Fixes:9135408c3ace4 ("dmaengine: mediatek: Add MediaTek UART APDMA support") Signed-off-by: Zhang Qilong Link: https://lore.kernel.org/r/20220319022142.142709-1-zhangqilong3@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/mediatek/mtk-uart-apdma.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/dma/mediatek/mtk-uart-apdma.c b/drivers/dma/mediatek/mtk-uart-apdma.c index 375e7e647df6..a1517ef1f4a0 100644 --- a/drivers/dma/mediatek/mtk-uart-apdma.c +++ b/drivers/dma/mediatek/mtk-uart-apdma.c @@ -274,7 +274,7 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) unsigned int status; int ret; - ret = pm_runtime_get_sync(mtkd->ddev.dev); + ret = pm_runtime_resume_and_get(mtkd->ddev.dev); if (ret < 0) { pm_runtime_put_noidle(chan->device->dev); return ret; @@ -288,18 +288,21 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) ret = readx_poll_timeout(readl, c->base + VFF_EN, status, !status, 10, 100); if (ret) - return ret; + goto err_pm; ret = request_irq(c->irq, mtk_uart_apdma_irq_handler, IRQF_TRIGGER_NONE, KBUILD_MODNAME, chan); if (ret < 0) { dev_err(chan->device->dev, "Can't request dma IRQ\n"); - return -EINVAL; + ret = -EINVAL; + goto err_pm; } if (mtkd->support_33bits) mtk_uart_apdma_write(c, VFF_4G_SUPPORT, VFF_4G_SUPPORT_CLR_B); +err_pm: + pm_runtime_put_noidle(mtkd->ddev.dev); return ret; } -- Gitee From 4ab150b4794d5ccbae344f03323fce6d73d66fcd Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Tue, 26 Jul 2022 17:38:26 +0800 Subject: [PATCH 029/132] spi: spi-mtk-nor: initialize spi controller after resume stable inclusion from stable-v5.10.113 commit 3f7914dbeacdffe4fa198dee7afefa74998b5d8d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3f7914dbeacdffe4fa198dee7afefa74998b5d8d -------------------------------- [ Upstream commit 317c2045618cc1f8d38beb8c93a7bdb6ad8638c6 ] After system resumes, the registers of nor controller are initialized with default values. The nor controller will not function properly. To handle both issues above, we add mtk_nor_init() in mtk_nor_resume after pm_runtime_force_resume(). 
Fixes: 3bfd9103c7af ("spi: spi-mtk-nor: Add power management support") Signed-off-by: Allen-KH Cheng Reviewed-by: Rex-BC Chen Link: https://lore.kernel.org/r/20220412115743.22641-1-allen-kh.cheng@mediatek.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/spi/spi-mtk-nor.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-mtk-nor.c b/drivers/spi/spi-mtk-nor.c index 288f6c2bbd57..106e3cacba4c 100644 --- a/drivers/spi/spi-mtk-nor.c +++ b/drivers/spi/spi-mtk-nor.c @@ -895,7 +895,17 @@ static int __maybe_unused mtk_nor_suspend(struct device *dev) static int __maybe_unused mtk_nor_resume(struct device *dev) { - return pm_runtime_force_resume(dev); + struct spi_controller *ctlr = dev_get_drvdata(dev); + struct mtk_nor *sp = spi_controller_get_devdata(ctlr); + int ret; + + ret = pm_runtime_force_resume(dev); + if (ret) + return ret; + + mtk_nor_init(sp); + + return 0; } static const struct dev_pm_ops mtk_nor_pm_ops = { -- Gitee From 79b7e6ed7a6170a8f469c8035db58f163600a410 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Jul 2022 17:38:27 +0800 Subject: [PATCH 030/132] esp: limit skb_page_frag_refill use to a single page stable inclusion from stable-v5.10.113 commit c075c3ea031757f8ea2d34567565b61a868c08d5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c075c3ea031757f8ea2d34567565b61a868c08d5 -------------------------------- [ Upstream commit 5bd8baab087dff657e05387aee802e70304cc813 ] Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP transformation") tried to fix skb_page_frag_refill usage in ESP by capping allocsize to 32k, but that doesn't completely solve the issue, as skb_page_frag_refill may return a single page. If that happens, we will write out of bounds, despite the check introduced in the previous patch. This patch forces COW in cases where we would end up calling skb_page_frag_refill with a size larger than a page (first in esp_output_head with tailen, then in esp_output_tail with skb->data_len). 
Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- include/net/esp.h | 2 -- net/ipv4/esp4.c | 5 ++--- net/ipv6/esp6.c | 5 ++--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/net/esp.h b/include/net/esp.h index 90cd02ff77ef..9c5637d41d95 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,8 +4,6 @@ #include -#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9aae82145bc1..20d738137841 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -448,7 +448,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -458,8 +457,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 20c7bef6829e..cb28f8928f9e 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,7 +483,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -492,8 +491,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { -- Gitee From f648582b76cf544acd8efbeed6f5fa0020acf160 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:38:28 +0800 Subject: [PATCH 031/132] igc: Fix infinite loop in release_swfw_sync stable inclusion from stable-v5.10.113 commit 46b0e4f998ceead45f273be4d282b7fbb70f6125 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=46b0e4f998ceead45f273be4d282b7fbb70f6125 -------------------------------- [ Upstream commit 907862e9aef75bf89e2b265efcc58870be06081e ] An infinite loop may occur if we fail to acquire the HW semaphore, which is needed for resource release. This will typically happen if the hardware is surprise-removed. At this stage there is nothing to do, except log an error and quit. 
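The same idea as a minimal before/after sketch (an illustrative wrapper, not the driver function itself; the real change is in igc_release_swfw_sync_i225() in the diff below): an unconditional retry loop on the semaphore can never terminate once the adapter is gone, so the fix makes one attempt, logs the failure, and returns.

/* Illustrative wrapper only. */
static void example_release_resource(struct igc_hw *hw)
{
	/*
	 * Unsafe pattern being removed:
	 *	while (igc_get_hw_semaphore_i225(hw))
	 *		;	(spins forever on a surprise-removed adapter)
	 */
	if (igc_get_hw_semaphore_i225(hw)) {
		hw_dbg("Failed to release SW_FW_SYNC.\n");
		return;
	}

	/* Clear the mask bits and drop the semaphore as shown in the diff. */
}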
Fixes: c0071c7aa5fe ("igc: Add HW initialization code") Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/igc/igc_i225.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_i225.c b/drivers/net/ethernet/intel/igc/igc_i225.c index 553d6bc78e6b..624236a4202e 100644 --- a/drivers/net/ethernet/intel/igc/igc_i225.c +++ b/drivers/net/ethernet/intel/igc/igc_i225.c @@ -156,8 +156,15 @@ void igc_release_swfw_sync_i225(struct igc_hw *hw, u16 mask) { u32 swfw_sync; - while (igc_get_hw_semaphore_i225(hw)) - ; /* Empty */ + /* Releasing the resource requires first getting the HW semaphore. + * If we fail to get the semaphore, there is nothing we can do, + * except log an error and quit. We are not allowed to hang here + * indefinitely, as it may cause denial of service or system crash. + */ + if (igc_get_hw_semaphore_i225(hw)) { + hw_dbg("Failed to release SW_FW_SYNC.\n"); + return; + } swfw_sync = rd32(IGC_SW_FW_SYNC); swfw_sync &= ~mask; -- Gitee From 39d9023b5d9c8d03091fb5c7c0781836854451bb Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:38:29 +0800 Subject: [PATCH 032/132] igc: Fix BUG: scheduling while atomic stable inclusion from stable-v5.10.113 commit fc7116a79a86500e15520658f279b6536ff9f700 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fc7116a79a86500e15520658f279b6536ff9f700 -------------------------------- [ Upstream commit c80a29f0fe9b6f5457e0788e27d1110577eba99b ] Replace usleep_range() method with udelay() method to allow atomic contexts in low-level MDIO access functions. The following issue can be seen by doing the following: $ modprobe -r bonding $ modprobe -v bonding max_bonds=1 mode=1 miimon=100 use_carrier=0 $ ip link set bond0 up $ ifenslave bond0 eth0 eth1 [ 982.357308] BUG: scheduling while atomic: kworker/u64:0/9/0x00000002 [ 982.364431] INFO: lockdep is turned off. [ 982.368824] Modules linked in: bonding sctp ip6_udp_tunnel udp_tunnel mlx4_ib ib_uverbs ib_core mlx4_en mlx4_core nfp tls sunrpc intel_rapl_msr iTCO_wdt iTCO_vendor_support mxm_wmi dcdbas intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel rapl intel_cstate intel_uncore pcspkr lpc_ich mei_me ipmi_ssif mei ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter xfs libcrc32c sr_mod cdrom sd_mod t10_pi sg mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ahci libahci crc32c_intel libata i2c_algo_bit tg3 megaraid_sas igc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: bonding] [ 982.437941] CPU: 25 PID: 9 Comm: kworker/u64:0 Kdump: loaded Tainted: G W --------- - - 4.18.0-348.el8.x86_64+debug #1 [ 982.451333] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 2.7.0 12/005/2017 [ 982.459791] Workqueue: bond0 bond_mii_monitor [bonding] [ 982.465622] Call Trace: [ 982.468355] dump_stack+0x8e/0xd0 [ 982.472056] __schedule_bug.cold.60+0x3a/0x60 [ 982.476919] __schedule+0x147b/0x1bc0 [ 982.481007] ? firmware_map_remove+0x16b/0x16b [ 982.485967] ? hrtimer_fixup_init+0x40/0x40 [ 982.490625] schedule+0xd9/0x250 [ 982.494227] schedule_hrtimeout_range_clock+0x10d/0x2c0 [ 982.500058] ? hrtimer_nanosleep_restart+0x130/0x130 [ 982.505598] ? 
hrtimer_init_sleeper_on_stack+0x90/0x90 [ 982.511332] ? usleep_range+0x88/0x130 [ 982.515514] ? recalibrate_cpu_khz+0x10/0x10 [ 982.520279] ? ktime_get+0xab/0x1c0 [ 982.524175] ? usleep_range+0x88/0x130 [ 982.528355] usleep_range+0xdd/0x130 [ 982.532344] ? console_conditional_schedule+0x30/0x30 [ 982.537987] ? igc_put_hw_semaphore+0x17/0x60 [igc] [ 982.543432] igc_read_phy_reg_gpy+0x111/0x2b0 [igc] [ 982.548887] igc_phy_has_link+0xfa/0x260 [igc] [ 982.553847] ? igc_get_phy_id+0x210/0x210 [igc] [ 982.558894] ? lock_acquire+0x34d/0x890 [ 982.563187] ? lock_downgrade+0x710/0x710 [ 982.567659] ? rcu_read_unlock+0x50/0x50 [ 982.572039] igc_check_for_copper_link+0x106/0x210 [igc] [ 982.577970] ? igc_config_fc_after_link_up+0x840/0x840 [igc] [ 982.584286] ? rcu_read_unlock+0x50/0x50 [ 982.588661] ? lock_release+0x591/0xb80 [ 982.592939] ? lock_release+0x591/0xb80 [ 982.597220] igc_has_link+0x113/0x330 [igc] [ 982.601887] ? lock_downgrade+0x710/0x710 [ 982.606362] igc_ethtool_get_link+0x6d/0x90 [igc] [ 982.611614] bond_check_dev_link+0x131/0x2c0 [bonding] [ 982.617350] ? bond_time_in_interval+0xd0/0xd0 [bonding] [ 982.623277] ? rcu_read_lock_held+0x62/0xc0 [ 982.627944] ? rcu_read_lock_sched_held+0xe0/0xe0 [ 982.633198] bond_mii_monitor+0x314/0x2500 [bonding] [ 982.638738] ? lock_contended+0x880/0x880 [ 982.643214] ? bond_miimon_link_change+0xa0/0xa0 [bonding] [ 982.649336] ? lock_acquire+0x34d/0x890 [ 982.653615] ? lock_downgrade+0x710/0x710 [ 982.658089] ? debug_object_deactivate+0x221/0x340 [ 982.663436] ? rcu_read_unlock+0x50/0x50 [ 982.667811] ? debug_print_object+0x2b0/0x2b0 [ 982.672672] ? __switch_to_asm+0x41/0x70 [ 982.677049] ? __switch_to_asm+0x35/0x70 [ 982.681426] ? _raw_spin_unlock_irq+0x24/0x40 [ 982.686288] ? trace_hardirqs_on+0x20/0x195 [ 982.690956] ? _raw_spin_unlock_irq+0x24/0x40 [ 982.695818] process_one_work+0x8f0/0x1770 [ 982.700390] ? pwq_dec_nr_in_flight+0x320/0x320 [ 982.705443] ? debug_show_held_locks+0x50/0x50 [ 982.710403] worker_thread+0x87/0xb40 [ 982.714489] ? process_one_work+0x1770/0x1770 [ 982.719349] kthread+0x344/0x410 [ 982.722950] ? 
kthread_insert_work_sanity_check+0xd0/0xd0 [ 982.728975] ret_from_fork+0x3a/0x50 Fixes: 5586838fe9ce ("igc: Add code for PHY support") Reported-by: Corinna Vinschen Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Corinna Vinschen Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/igc/igc_phy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index e380b7a3ea63..8de4de2e5636 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -583,7 +583,7 @@ static s32 igc_read_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 *data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; @@ -640,7 +640,7 @@ static s32 igc_write_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; -- Gitee From ab18d18b75f5f270c8f87b9743007d904f37b66e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Jul 2022 17:38:30 +0800 Subject: [PATCH 033/132] rxrpc: Restore removed timer deletion stable inclusion from stable-v5.10.113 commit 60592f16a456d239df9060968dd9cb561d0a6fb0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=60592f16a456d239df9060968dd9cb561d0a6fb0 -------------------------------- [ Upstream commit ee3b0826b4764f6c13ad6db67495c5a1c38e9025 ] A recent patch[1] from Eric Dumazet flipped the order in which the keepalive timer and the keepalive worker were cancelled in order to fix a syzbot reported issue[2]. Unfortunately, this enables the mirror image bug whereby the timer races with rxrpc_exit_net(), restarting the worker after it has been cancelled: CPU 1 CPU 2 =============== ===================== if (rxnet->live) rxnet->live = false; cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_queue_work(&rxnet->peer_keepalive_work); del_timer_sync(&rxnet->peer_keepalive_timer); Fix this by restoring the removed del_timer_sync() so that we try to remove the timer twice. If the timer runs again, it should see ->live == false and not restart the worker. Fixes: 1946014ca3b1 ("rxrpc: fix a race in rxrpc_exit_net()") Signed-off-by: David Howells cc: Eric Dumazet cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/20220404183439.3537837-1-eric.dumazet@gmail.com/ [1] Link: https://syzkaller.appspot.com/bug?extid=724378c4bb58f703b09a [2] Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/rxrpc/net_ns.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f15d6942da45..cc7e30733feb 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -113,7 +113,9 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; + del_timer_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); + /* Remove the timer again as the worker may have restarted it. 
*/ del_timer_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); -- Gitee From 27b6e01f60e73dc8944d2d18ed50a60a32e68f08 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 26 Jul 2022 17:38:31 +0800 Subject: [PATCH 034/132] net/smc: Fix sock leak when release after smc_shutdown() stable inclusion from stable-v5.10.113 commit a499cb5f3ef9f976eac96c02adbcc505764e2b91 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a499cb5f3ef9f976eac96c02adbcc505764e2b91 -------------------------------- [ Upstream commit 1a74e99323746353bba11562a2f2d0aa8102f402 ] Since commit e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback"), for a fallback connection, __smc_release() does not call sock_put() if its state is already SMC_CLOSED. When calling smc_shutdown() after falling back, its state is set to SMC_CLOSED but does not call sock_put(), so this patch calls it. Reported-and-tested-by: syzbot+6e29a053eb165bd50de5@syzkaller.appspotmail.com Fixes: e5d5aadcf3cd ("net/smc: fix sk_refcnt underflow on linkdown and fallback") Signed-off-by: Tony Lu Acked-by: Karsten Graul Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4f16d406ad8e..1b98f3241150 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2144,8 +2144,10 @@ static int smc_shutdown(struct socket *sock, int how) if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sock_put(sk); + } goto out; } switch (how) { -- Gitee From 3059be689186bbdbc081b5cdc9f8b08e78487795 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 26 Jul 2022 17:38:32 +0800 Subject: [PATCH 035/132] net/packet: fix packet_sock xmit return value checking stable inclusion from stable-v5.10.113 commit 8fb76adb89f0d576c107e05fd922bf3d04c470d9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8fb76adb89f0d576c107e05fd922bf3d04c470d9 -------------------------------- [ Upstream commit 29e8e659f984be00d75ec5fef4e37c88def72712 ] packet_sock xmit could be dev_queue_xmit, which also returns negative errors. So only checking positive errors is not enough, or userspace sendmsg may return success while packet is not send out. Move the net_xmit_errno() assignment in the braces as checkpatch.pl said do not use assignment in if condition. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Flavio Leitner Signed-off-by: Hangbin Liu Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/packet/af_packet.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d0c95d7dd292..5ee600d108a0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2817,8 +2817,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) status = TP_STATUS_SEND_REQUEST; err = po->xmit(skb); - if (unlikely(err > 0)) { - err = net_xmit_errno(err); + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ @@ -3019,8 +3020,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->no_fcs = 1; err = po->xmit(skb); - if (err > 0 && (err = net_xmit_errno(err)) != 0) - goto out_unlock; + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto out_unlock; + } dev_put(dev); -- Gitee From beeab1369997e4479ef51ac7c3a97460393a01e0 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 26 Jul 2022 17:38:33 +0800 Subject: [PATCH 036/132] ip6_gre: Avoid updating tunnel->tun_hlen in __gre6_xmit() stable inclusion from stable-v5.10.113 commit 200f96ebb389788dea563e8a0b1105863a836039 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=200f96ebb389788dea563e8a0b1105863a836039 -------------------------------- [ Upstream commit f40c064e933d7787ca7411b699504d7a2664c1f5 ] Do not update tunnel->tun_hlen in data plane code. Use a local variable instead, just like "tunnel_hlen" in net/ipv4/ip_gre.c:gre_fb_xmit(). Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/ipv6/ip6_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 9a0263f25232..949d6fbc1ca0 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -743,6 +743,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; __be16 flags; + int tun_hlen; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || @@ -760,9 +761,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, dsfield = key->tos; flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); - tunnel->tun_hlen = gre_calc_hlen(flags); + tun_hlen = gre_calc_hlen(flags); - gre_build_header(skb, tunnel->tun_hlen, + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? 
htonl(tunnel->o_seqno++) -- Gitee From be914f9d3ae55f43b0733a64e2b769bcd02d7d0c Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 26 Jul 2022 17:38:34 +0800 Subject: [PATCH 037/132] ip6_gre: Fix skb_under_panic in __gre6_xmit() stable inclusion from stable-v5.10.113 commit 93366275be72bc55cada31a2a1273147553acab8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=93366275be72bc55cada31a2a1273147553acab8 -------------------------------- [ Upstream commit ab198e1d0dd8dc4bc7575fb50758e2cbd51e14e1 ] Feng reported an skb_under_panic BUG triggered by running test_ip6gretap() in tools/testing/selftests/bpf/test_tunnel.sh: [ 82.492551] skbuff: skb_under_panic: text:ffffffffb268bb8e len:403 put:12 head:ffff9997c5480000 data:ffff9997c547fff8 tail:0x18b end:0x2c0 dev:ip6gretap11 <...> [ 82.607380] Call Trace: [ 82.609389] [ 82.611136] skb_push.cold.109+0x10/0x10 [ 82.614289] __gre6_xmit+0x41e/0x590 [ 82.617169] ip6gre_tunnel_xmit+0x344/0x3f0 [ 82.620526] dev_hard_start_xmit+0xf1/0x330 [ 82.623882] sch_direct_xmit+0xe4/0x250 [ 82.626961] __dev_queue_xmit+0x720/0xfe0 <...> [ 82.633431] packet_sendmsg+0x96a/0x1cb0 [ 82.636568] sock_sendmsg+0x30/0x40 <...> The following sequence of events caused the BUG: 1. During ip6gretap device initialization, tunnel->tun_hlen (e.g. 4) is calculated based on old flags (see ip6gre_calc_hlen()); 2. packet_snd() reserves header room for skb A, assuming tunnel->tun_hlen is 4; 3. Later (in clsact Qdisc), the eBPF program sets a new tunnel key for skb A using bpf_skb_set_tunnel_key() (see _ip6gretap_set_tunnel()); 4. __gre6_xmit() detects the new tunnel key, and recalculates "tun_hlen" (e.g. 12) based on new flags (e.g. TUNNEL_KEY and TUNNEL_SEQ); 5. gre_build_header() calls skb_push() with insufficient reserved header room, triggering the BUG. As sugguested by Cong, fix it by moving the call to skb_cow_head() after the recalculation of tun_hlen. Reproducer: OBJ=$LINUX/tools/testing/selftests/bpf/test_tunnel_kern.o ip netns add at_ns0 ip link add veth0 type veth peer name veth1 ip link set veth0 netns at_ns0 ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip link set dev veth1 up mtu 1500 ip addr add dev veth1 172.16.1.200/24 ip netns exec at_ns0 ip addr add ::11/96 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up ip addr add dev veth1 ::22/96 ip link set dev veth1 up ip netns exec at_ns0 \ ip link add dev ip6gretap00 type ip6gretap seq flowlabel 0xbcdef key 2 \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev ip6gretap00 10.1.1.100/24 ip netns exec at_ns0 ip addr add dev ip6gretap00 fc80::100/96 ip netns exec at_ns0 ip link set dev ip6gretap00 up ip link add dev ip6gretap11 type ip6gretap external ip addr add dev ip6gretap11 10.1.1.200/24 ip addr add dev ip6gretap11 fc80::200/24 ip link set dev ip6gretap11 up tc qdisc add dev ip6gretap11 clsact tc filter add dev ip6gretap11 egress bpf da obj $OBJ sec ip6gretap_set_tunnel tc filter add dev ip6gretap11 ingress bpf da obj $OBJ sec ip6gretap_get_tunnel ping6 -c 3 -w 10 -q ::11 Fixes: 6712abc168eb ("ip6_gre: add ip6 gre and gretap collect_md mode") Reported-by: Feng Zhou Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/ipv6/ip6_gre.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 949d6fbc1ca0..1f6c752f13b4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -733,9 +733,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) - return -ENOMEM; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; @@ -763,6 +760,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tun_hlen = gre_calc_hlen(flags); + if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) + return -ENOMEM; + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), @@ -773,6 +773,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; + if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) + return -ENOMEM; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); -- Gitee From cf223164578f4184522fc77982b2d795305ba27e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:35 +0800 Subject: [PATCH 038/132] net/sched: cls_u32: fix possible leak in u32_init_knode() stable inclusion from stable-v5.10.113 commit 0ac8f83d8f6471e39b3cfe8ee48560df3bf9c451 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0ac8f83d8f6471e39b3cfe8ee48560df3bf9c451 -------------------------------- [ Upstream commit ec5b0f605b105457f257f2870acad4a5d463984b ] While investigating a related syzbot report, I found that whenever call to tcf_exts_init() from u32_init_knode() is failing, we end up with an elevated refcount on ht->refcnt To avoid that, only increase the refcount after all possible errors have been evaluated. Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/sched/cls_u32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b61db335c49d..da042bc8b239 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -814,10 +814,6 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); - /* bump reference count as long as we hold pointer to structure */ - if (ht) - ht->refcnt++; - #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update * so we must keep them in tact. 
When the node is later destroyed @@ -839,6 +835,10 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, return NULL; } + /* bump reference count as long as we hold pointer to structure */ + if (ht) + ht->refcnt++; + return new; } -- Gitee From aa8ab17792740416d50918eaf63bc38ed591fbc2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Jul 2022 17:38:36 +0800 Subject: [PATCH 039/132] l3mdev: l3mdev_master_upper_ifindex_by_index_rcu should be using netdev_master_upper_dev_get_rcu stable inclusion from stable-v5.10.113 commit 078d839f11acfa9bf5b549d2189abda9f2a943a0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=078d839f11acfa9bf5b549d2189abda9f2a943a0 -------------------------------- [ Upstream commit 83daab06252ee5d0e1f4373ff28b79304945fc19 ] Next patch uses l3mdev_master_upper_ifindex_by_index_rcu which throws a splat with debug kernels: [13783.087570] ------------[ cut here ]------------ [13783.093974] RTNL: assertion failed at net/core/dev.c (6702) [13783.100761] WARNING: CPU: 3 PID: 51132 at net/core/dev.c:6702 netdev_master_upper_dev_get+0x16a/0x1a0 [13783.184226] CPU: 3 PID: 51132 Comm: kworker/3:3 Not tainted 5.17.0-custom-100090-g6f963aafb1cc #682 [13783.194788] Hardware name: Mellanox Technologies Ltd. MSN2010/SA002610, BIOS 5.6.5 08/24/2017 [13783.204755] Workqueue: mld mld_ifc_work [ipv6] [13783.210338] RIP: 0010:netdev_master_upper_dev_get+0x16a/0x1a0 [13783.217209] Code: 0f 85 e3 fe ff ff e8 65 ac ec fe ba 2e 1a 00 00 48 c7 c6 60 6f 38 83 48 c7 c7 c0 70 38 83 c6 05 5e b5 d7 01 01 e8 c6 29 52 00 <0f> 0b e9 b8 fe ff ff e8 5a 6c 35 ff e9 1c ff ff ff 48 89 ef e8 7d [13783.238659] RSP: 0018:ffffc9000b37f5a8 EFLAGS: 00010286 [13783.244995] RAX: 0000000000000000 RBX: ffff88812ee5c000 RCX: 0000000000000000 [13783.253379] RDX: ffff88811ce09d40 RSI: ffffffff812d0fcd RDI: fffff5200166fea7 [13783.261769] RBP: 0000000000000000 R08: 0000000000000001 R09: ffff8882375f4287 [13783.270138] R10: ffffed1046ebe850 R11: 0000000000000001 R12: dffffc0000000000 [13783.278510] R13: 0000000000000275 R14: ffffc9000b37f688 R15: ffff8881273b4af8 [13783.286870] FS: 0000000000000000(0000) GS:ffff888237400000(0000) knlGS:0000000000000000 [13783.296352] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [13783.303177] CR2: 00007ff25fc9b2e8 CR3: 0000000174d23000 CR4: 00000000001006e0 [13783.311546] Call Trace: [13783.314660] [13783.317553] l3mdev_master_upper_ifindex_by_index_rcu+0x43/0xe0 ... Change l3mdev_master_upper_ifindex_by_index_rcu to use netdev_master_upper_dev_get_rcu. Fixes: 6a6d6681ac1a ("l3mdev: add function to retreive upper master") Signed-off-by: Ido Schimmel Signed-off-by: David Ahern Cc: Alexis Bauvin Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/l3mdev/l3mdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 864326f150e2..f2c3a61ad134 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -147,7 +147,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) dev = dev_get_by_index_rcu(net, ifindex); while (dev && !netif_is_l3_master(dev)) - dev = netdev_master_upper_dev_get(dev); + dev = netdev_master_upper_dev_get_rcu(dev); return dev ? 
dev->ifindex : 0; } -- Gitee From 3bd107cbe765f64dbde8b4282a01282e5a9be749 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:37 +0800 Subject: [PATCH 040/132] ipv6: make ip6_rt_gc_expire an atomic_t stable inclusion from stable-v5.10.113 commit 49516e6ed91434d022a800321a8bc7d8054f62ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=49516e6ed91434d022a800321a8bc7d8054f62ac -------------------------------- [ Upstream commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db ] Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1] There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field. Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway. [1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 value changed: 0x00000bb3 -> 0x00000ba9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Conflicts: include/net/netns/ipv6.h Acked-by: Xie XiuQi --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index b2a28201f4fd..35e0c4a6c71f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -79,8 +79,8 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a44ad9637e8a..4ef59dc515e5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3187,6 +3187,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = 
net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -3197,13 +3198,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -6358,7 +6359,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: -- Gitee From 6007bb2484ce25d53b3df5fb1dba2aea01cef119 Mon Sep 17 00:00:00 2001 From: Xu Jia Date: Tue, 26 Jul 2022 17:38:38 +0800 Subject: [PATCH 041/132] ipv6: fix kabi for ip6_rt_gc_expire in struct netns_ipv6 hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH -------------------------------- Making ip6_rt_gc_expire atomic breaks the KABI of struct netns_ipv6. This patch uses KABI_REPLACE to fix it. Signed-off-by: Xu Jia Signed-off-by: Zheng Zengkai Reviewed-by: Yue Haibing Acked-by: Xie XiuQi --- include/net/netns/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 35e0c4a6c71f..31d0cf3c7377 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -79,7 +79,7 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - atomic_t ip6_rt_gc_expire; + KABI_REPLACE(unsigned int ip6_rt_gc_expire, atomic_t ip6_rt_gc_expire) unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; -- Gitee From d7434e9cbc0f809add85594b97bcd8731050cfe9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2022 17:38:39 +0800 Subject: [PATCH 042/132] netlink: reset network and mac headers in netlink_dump() stable inclusion from stable-v5.10.113 commit 3d55b195747c1259c994b868a509beae947a7c6e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3d55b195747c1259c994b868a509beae947a7c6e -------------------------------- [ Upstream commit 99c07327ae11e24886d552dddbe4537bfca2765d ] netlink_dump() is allocating an skb, reserves space in it but forgets to reset network header. This allows a BPF program, invoked later from sk_filter() to access uninitialized kernel memory from the reserved space. Theorically mac header reset could be omitted, because it is set to a special initial value. bpf_internal_load_pointer_neg_helper calls skb_mac_header() without checking skb_mac_header_was_set(). Relying on skb->len not being too big seems fragile. We also could add a sanity check in bpf_internal_load_pointer_neg_helper() to avoid surprises in the future. 
syzbot report was: BUG: KMSAN: uninit-value in ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637 ___bpf_prog_run+0xa22b/0xb420 kernel/bpf/core.c:1637 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756 bpf_prog_run_save_cb include/linux/filter.h:770 [inline] sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150 sk_filter include/linux/filter.h:905 [inline] netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was stored to memory at: ___bpf_prog_run+0x96c/0xb420 kernel/bpf/core.c:1558 __bpf_prog_run32+0x121/0x180 kernel/bpf/core.c:1796 bpf_dispatcher_nop_func include/linux/bpf.h:784 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] __bpf_prog_run_save_cb+0x168/0x580 include/linux/filter.h:756 bpf_prog_run_save_cb include/linux/filter.h:770 [inline] sk_filter_trim_cap+0x3bc/0x8c0 net/core/filter.c:150 sk_filter include/linux/filter.h:905 [inline] netlink_dump+0xe0c/0x16c0 net/netlink/af_netlink.c:2276 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_post_alloc_hook mm/slab.h:737 [inline] slab_alloc_node mm/slub.c:3244 [inline] __kmalloc_node_track_caller+0xde3/0x14f0 mm/slub.c:4972 kmalloc_reserve net/core/skbuff.c:354 [inline] __alloc_skb+0x545/0xf90 net/core/skbuff.c:426 alloc_skb include/linux/skbuff.h:1158 [inline] netlink_dump+0x30f/0x16c0 net/netlink/af_netlink.c:2242 netlink_recvmsg+0x1129/0x1c80 net/netlink/af_netlink.c:2002 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_read_iter+0x5a9/0x630 net/socket.c:1039 do_iter_readv_writev+0xa7f/0xc70 do_iter_read+0x52c/0x14c0 fs/read_write.c:786 vfs_readv fs/read_write.c:906 [inline] do_readv+0x432/0x800 fs/read_write.c:943 __do_sys_readv fs/read_write.c:1034 [inline] __se_sys_readv fs/read_write.c:1031 [inline] __x64_sys_readv+0xe5/0x120 fs/read_write.c:1031 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x44/0xae CPU: 0 PID: 3470 Comm: syz-executor751 Not tainted 5.17.0-syzkaller #0 Hardware name: Google Google 
Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: db65a3aaf29e ("netlink: Trim skb to alloc size to avoid MSG_TRUNC") Fixes: 9063e21fb026 ("netlink: autosize skb lengthes") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220415181442.551228-1-eric.dumazet@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/netlink/af_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f37916156ca5..cbfb601c4ee9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2276,6 +2276,13 @@ static int netlink_dump(struct sock *sk) * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); + + /* Make sure malicious BPF programs can not read unitialized memory + * from skb->head -> skb->data + */ + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { -- Gitee From ceaf8ddd59c3450db33261efc14a28835bafb9ce Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 26 Jul 2022 17:38:40 +0800 Subject: [PATCH 043/132] net: stmmac: Use readl_poll_timeout_atomic() in atomic state stable inclusion from stable-v5.10.113 commit f593f49fcd174ebe0ebeebecf46fadc134593ddb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f593f49fcd174ebe0ebeebecf46fadc134593ddb -------------------------------- [ Upstream commit 234901de2bc6847eaa0aeb4aba62c31ffb8d3ad6 ] The init_systime() may be invoked in atomic state. We have observed the following call trace when running "phc_ctl /dev/ptp0 set" on a Intel Agilex board. BUG: sleeping function called from invalid context at drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c:74 in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 381, name: phc_ctl preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 Preemption disabled at: [] stmmac_set_time+0x34/0x8c CPU: 2 PID: 381 Comm: phc_ctl Not tainted 5.18.0-rc2-next-20220414-yocto-standard+ #567 Hardware name: SoCFPGA Agilex SoCDK (DT) Call trace: dump_backtrace.part.0+0xc4/0xd0 show_stack+0x24/0x40 dump_stack_lvl+0x7c/0xa0 dump_stack+0x18/0x34 __might_resched+0x154/0x1c0 __might_sleep+0x58/0x90 init_systime+0x78/0x120 stmmac_set_time+0x64/0x8c ptp_clock_settime+0x60/0x9c pc_clock_settime+0x6c/0xc0 __arm64_sys_clock_settime+0x88/0xf0 invoke_syscall+0x5c/0x130 el0_svc_common.constprop.0+0x4c/0x100 do_el0_svc+0x7c/0xa0 el0_svc+0x58/0xcc el0t_64_sync_handler+0xa4/0x130 el0t_64_sync+0x18c/0x190 So we should use readl_poll_timeout_atomic() here instead of readl_poll_timeout(). Also adjust the delay time to 10us to fix a "__bad_udelay" build error reported by "kernel test robot ". I have tested this on Intel Agilex and NXP S32G boards, there is no delay needed at all. So the 10us delay should be long enough for most cases. Fixes: ff8ed737860e ("net: stmmac: use readl_poll_timeout() function in init_systime()") Signed-off-by: Kevin Hao Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 07b1b8374cd2..53efcc9c40e2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -68,9 +68,9 @@ static int init_systime(void __iomem *ioaddr, u32 sec, u32 nsec) writel(value, ioaddr + PTP_TCR); /* wait for present system time initialize to complete */ - return readl_poll_timeout(ioaddr + PTP_TCR, value, + return readl_poll_timeout_atomic(ioaddr + PTP_TCR, value, !(value & PTP_TCR_TSINIT), - 10000, 100000); + 10, 100000); } static int config_addend(void __iomem *ioaddr, u32 addend) -- Gitee From 1d394f14128d12cf5b6313d895622aad5fd5a5ba Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Jul 2022 17:38:41 +0800 Subject: [PATCH 044/132] dmaengine: idxd: add RO check for wq max_batch_size write stable inclusion from stable-v5.10.113 commit 9a3c026dc3a59fa23c63d90752f07538a3c75646 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a3c026dc3a59fa23c63d90752f07538a3c75646 -------------------------------- [ Upstream commit 66903461ffed0b66fc3e0200082d4e09365aacdc ] Block wq_max_batch_size_store() when the device is configured as read-only and not configurable. Fixes: e7184b159dd3 ("dmaengine: idxd: add support for configurable max wq batch size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971493551.2201159.1942042593642155209.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 7b41cdff1a2c..5bf4b4be64e4 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1132,6 +1132,9 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu u64 batch_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; -- Gitee From c324bed6bcd3c12392271dddbbbd03c23acddd8d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 26 Jul 2022 17:38:42 +0800 Subject: [PATCH 045/132] dmaengine: idxd: add RO check for wq max_transfer_size write stable inclusion from stable-v5.10.113 commit 520aab8b723cb2bffef957844b634af04e96428d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=520aab8b723cb2bffef957844b634af04e96428d -------------------------------- [ Upstream commit 505a2d1032ae656b0a8c736be110255503941cde ] Block wq_max_transfer_size_store() when the device is configured as read-only and not configurable. 
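A minimal sketch of the guard being added, using placeholder names rather than the idxd structures: the configurable check runs before any other state check, so a read-only device rejects the write outright.

#include <stdbool.h>
#include <stdio.h>

struct dev_state {
	bool configurable;      /* device may be reconfigured from sysfs */
	bool wq_disabled;       /* queue state */
};

static int max_xfer_store(const struct dev_state *d, unsigned long val)
{
	if (!d->configurable)
		return -1;      /* -EPERM in the driver: read-only device */
	if (!d->wq_disabled)
		return -1;      /* -EPERM: queue must be disabled first */
	printf("accepted max transfer size %lu\n", val);
	return 0;
}

int main(void)
{
	struct dev_state ro = { .configurable = false, .wq_disabled = true };

	printf("store on read-only device: %d\n", max_xfer_store(&ro, 2048UL));
	return 0;
}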
Fixes: d7aad5550eca ("dmaengine: idxd: add support for configurable max wq xfer size") Reported-by: Bernice Zhang Tested-by: Bernice Zhang Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164971488154.2200913.10706665404118545941.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/idxd/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 5bf4b4be64e4..51af0dfc3c63 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1098,6 +1098,9 @@ static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attr u64 xfer_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; -- Gitee From aec8147faa831c7bd51628f95d992afe0942c107 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 26 Jul 2022 17:38:43 +0800 Subject: [PATCH 046/132] selftests: mlxsw: vxlan_flooding: Prevent flooding of unwanted packets stable inclusion from stable-v5.10.113 commit 3bf8ca35017024fa1cad55344f798cd5cd131c16 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3bf8ca35017024fa1cad55344f798cd5cd131c16 -------------------------------- [ Upstream commit 044011fdf162c5dd61c02841930c8f438a9adadb ] The test verifies that packets are correctly flooded by the bridge and the VXLAN device by matching on the encapsulated packets at the other end. However, if packets other than those generated by the test also ingress the bridge (e.g., MLD packets), they will be flooded as well and interfere with the expected count. Make the test more robust by making sure that only the packets generated by the test can ingress the bridge. Drop all the rest using tc filters on the egress of 'br0' and 'h1'. In the software data path, the problem can be solved by matching on the inner destination MAC or dropping unwanted packets at the egress of the VXLAN device, but this is not currently supported by mlxsw. Fixes: 94d302deae25 ("selftests: mlxsw: Add a test for VxLAN flooding") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../drivers/net/mlxsw/vxlan_flooding.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh index fedcb7b35af9..af5ea50ed5c0 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh @@ -172,6 +172,17 @@ flooding_filters_add() local lsb local i + # Prevent unwanted packets from entering the bridge and interfering + # with the test. 
+ tc qdisc add dev br0 clsact + tc filter add dev br0 egress protocol all pref 1 handle 1 \ + matchall skip_hw action drop + tc qdisc add dev $h1 clsact + tc filter add dev $h1 egress protocol all pref 1 handle 1 \ + flower skip_hw dst_mac de:ad:be:ef:13:37 action pass + tc filter add dev $h1 egress protocol all pref 2 handle 2 \ + matchall skip_hw action drop + tc qdisc add dev $rp2 clsact for i in $(eval echo {1..$num_remotes}); do @@ -194,6 +205,12 @@ flooding_filters_del() done tc qdisc del dev $rp2 clsact + + tc filter del dev $h1 egress protocol all pref 2 handle 2 matchall + tc filter del dev $h1 egress protocol all pref 1 handle 1 flower + tc qdisc del dev $h1 clsact + tc filter del dev br0 egress protocol all pref 1 handle 1 matchall + tc qdisc del dev br0 clsact } flooding_check_packets() -- Gitee From a260b077acedec14a79fb4dc4e06619ac344af7f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 26 Jul 2022 17:38:44 +0800 Subject: [PATCH 047/132] arm64/mm: Remove [PUD|PMD]_TABLE_BIT from [pud|pmd]_bad() stable inclusion from stable-v5.10.113 commit 18ff7a2efa4e14a6bd86aad31a6a8cebdf0dbabc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=18ff7a2efa4e14a6bd86aad31a6a8cebdf0dbabc -------------------------------- [ Upstream commit e377ab82311af95c99648c6424a6b888a0ccb102 ] Semantics wise, [pud|pmd]_bad() have always implied that a given [PUD|PMD] entry does not have a pointer to the next level page table. This had been made clear in the commit a1c76574f345 ("arm64: mm: use *_sect to check for section maps"). Hence explicitly check for a table entry rather than just testing a single bit. This basically redefines [pud|pmd]_bad() in terms of [pud|pmd]_table() making the semantics clear. 
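To see why testing the full two-bit descriptor type differs from testing a single bit, here is a standalone sketch; the constants mirror the arm64 encoding (0b11 table, 0b01 section), but the code is illustrative, not the kernel definitions.

#include <stdio.h>
#include <stdint.h>

#define TYPE_MASK  0x3ULL
#define TYPE_TABLE 0x3ULL
#define TABLE_BIT  0x2ULL

static int bad_old(uint64_t pmd) { return !(pmd & TABLE_BIT); }                 /* one bit */
static int bad_new(uint64_t pmd) { return (pmd & TYPE_MASK) != TYPE_TABLE; }    /* full type */

int main(void)
{
	uint64_t entry = TABLE_BIT;     /* bit 1 set, yet not a valid table descriptor */

	/* old test claims the entry is fine, new test flags it as bad */
	printf("old pmd_bad() = %d, new pmd_bad() = %d\n", bad_old(entry), bad_new(entry));
	return 0;
}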
Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1620644871-26280-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/include/asm/pgtable.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index daed33697d98..19d18bca6958 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -516,13 +516,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) - #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) #define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_bad(pmd) (!pmd_table(pmd)) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 static inline bool pud_sect(pud_t pud) { return false; } @@ -606,7 +605,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) +#define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) #define pud_leaf(pud) pud_sect(pud) #define pud_valid(pud) pte_valid(pud_pte(pud)) -- Gitee From 286861f72d40a9d40e51210f620f52c090483b8a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 26 Jul 2022 17:38:45 +0800 Subject: [PATCH 048/132] arm64: mm: fix p?d_leaf() stable inclusion from stable-v5.10.113 commit 052e4a661f90d27586b4364764dcfb2c022b00ee category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=052e4a661f90d27586b4364764dcfb2c022b00ee -------------------------------- [ Upstream commit 23bc8f69f0eceecbb87c3801d2e48827d2dca92b ] The pmd_leaf() is used to test a leaf mapped PMD, however, it misses the PROT_NONE mapped PMD on arm64. Fix it. A real world issue [1] caused by this was reported by Qian Cai. Also fix pud_leaf(). 
Link: https://patchwork.kernel.org/comment/24798260/ [1] Fixes: 8aa82df3c123 ("arm64: mm: add p?d_leaf() definitions") Reported-by: Qian Cai Signed-off-by: Muchun Song Link: https://lore.kernel.org/r/20220422060033.48711-1-songmuchun@bytedance.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 19d18bca6958..ab2443900f4e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -520,7 +520,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) -#define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) #define pmd_bad(pmd) (!pmd_table(pmd)) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 @@ -607,7 +607,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) -#define pud_leaf(pud) pud_sect(pud) +#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #define pud_valid(pud) pte_valid(pud_pte(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) -- Gitee From be31be301f9aa6457bb1cd1ac8ba374db3ecba79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 26 Jul 2022 17:38:46 +0800 Subject: [PATCH 049/132] ARM: vexpress/spc: Avoid negative array index when !SMP stable inclusion from stable-v5.10.113 commit d513ea9b7ef822b638857c7114d26af108e4bab1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=d513ea9b7ef822b638857c7114d26af108e4bab1 -------------------------------- [ Upstream commit b3f1dd52c991d79118f35e6d1bf4d7cb09882e38 ] When building multi_v7_defconfig+CONFIG_SMP=n, -Warray-bounds exposes a couple negative array index accesses: arch/arm/mach-vexpress/spc.c: In function 've_spc_clk_init': arch/arm/mach-vexpress/spc.c:583:21: warning: array subscript -1 is below array bounds of 'bool[2]' {aka '_Bool[2]'} [-Warray-bounds] 583 | if (init_opp_table[cluster]) | ~~~~~~~~~~~~~~^~~~~~~~~ arch/arm/mach-vexpress/spc.c:556:7: note: while referencing 'init_opp_table' 556 | bool init_opp_table[MAX_CLUSTERS] = { false }; | ^~~~~~~~~~~~~~ arch/arm/mach-vexpress/spc.c:592:18: warning: array subscript -1 is below array bounds of 'bool[2]' {aka '_Bool[2]'} [-Warray-bounds] 592 | init_opp_table[cluster] = true; | ~~~~~~~~~~~~~~^~~~~~~~~ arch/arm/mach-vexpress/spc.c:556:7: note: while referencing 'init_opp_table' 556 | bool init_opp_table[MAX_CLUSTERS] = { false }; | ^~~~~~~~~~~~~~ Skip this logic when built !SMP. 
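A standalone sketch of the added guard, with get_cluster() standing in for topology_physical_package_id(): a lookup that may return a negative value has to be range-checked before it is used as an array index.

#include <stdbool.h>
#include <stdio.h>

#define MAX_CLUSTERS 2

static int get_cluster(int cpu)
{
	(void)cpu;
	return -1;              /* what the lookup can report when !SMP */
}

int main(void)
{
	bool init_opp_table[MAX_CLUSTERS] = { false };
	int cluster = get_cluster(0);

	/* Fixed pattern: reject negative indices before touching the array. */
	if (cluster < 0 || init_opp_table[cluster]) {
		puts("skipping: no valid cluster index");
		return 0;
	}

	init_opp_table[cluster] = true;
	return 0;
}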
Link: https://lore.kernel.org/r/20220331190443.851661-1-keescook@chromium.org Cc: Liviu Dudau Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Acked-by: Liviu Dudau Signed-off-by: Kees Cook Signed-off-by: Sudeep Holla Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm/mach-vexpress/spc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-vexpress/spc.c b/arch/arm/mach-vexpress/spc.c index 1da11bdb1dfb..1c6500c4e6a1 100644 --- a/arch/arm/mach-vexpress/spc.c +++ b/arch/arm/mach-vexpress/spc.c @@ -580,7 +580,7 @@ static int __init ve_spc_clk_init(void) } cluster = topology_physical_package_id(cpu_dev->id); - if (init_opp_table[cluster]) + if (cluster < 0 || init_opp_table[cluster]) continue; if (ve_init_opp_table(cpu_dev)) -- Gitee From c8a5122e572911c692240f7a894883779217a520 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Tue, 26 Jul 2022 17:38:47 +0800 Subject: [PATCH 050/132] reset: tegra-bpmp: Restore Handle errors in BPMP response stable inclusion from stable-v5.10.113 commit cb17b56a9b4de46c057691b524c692c10e8d9d51 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cb17b56a9b4de46c057691b524c692c10e8d9d51 -------------------------------- [ Upstream commit d1da1052ffad63aa5181b69f20a6952e31f339c2 ] This reverts following commit 69125b4b9440 ("reset: tegra-bpmp: Revert Handle errors in BPMP response"). The Tegra194 HDA reset failure is fixed by commit d278dc9151a0 ("ALSA: hda/tegra: Fix Tegra194 HDA reset failure"). The temporary revert of original commit c045ceb5a145 ("reset: tegra-bpmp: Handle errors in BPMP response") can be removed now. 
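The restored logic checks two failure levels: the transfer itself and the status code carried in the firmware reply. A minimal sketch with placeholder names (do_transfer() stands in for tegra_bpmp_transfer()):

#include <stdio.h>

struct reply {
	int ret;                /* status reported back by the firmware */
};

static int do_transfer(struct reply *r)
{
	r->ret = -5;            /* transport worked, firmware rejected the request */
	return 0;
}

static int reset_common(void)
{
	struct reply r;
	int err = do_transfer(&r);

	if (err)
		return err;     /* transport-level failure */
	if (r.ret)
		return -22;     /* firmware-level failure (-EINVAL in the driver) */
	return 0;
}

int main(void)
{
	printf("reset_common() = %d\n", reset_common());
	return 0;
}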
Signed-off-by: Sameer Pujar Tested-by: Jon Hunter Reviewed-by: Jon Hunter Acked-by: Thierry Reding Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/1641995806-15245-1-git-send-email-spujar@nvidia.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/reset/tegra/reset-bpmp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/reset/tegra/reset-bpmp.c b/drivers/reset/tegra/reset-bpmp.c index 24d3395964cc..4c5bba52b105 100644 --- a/drivers/reset/tegra/reset-bpmp.c +++ b/drivers/reset/tegra/reset-bpmp.c @@ -20,6 +20,7 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, struct tegra_bpmp *bpmp = to_tegra_bpmp(rstc); struct mrq_reset_request request; struct tegra_bpmp_message msg; + int err; memset(&request, 0, sizeof(request)); request.cmd = command; @@ -30,7 +31,13 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, msg.tx.data = &request; msg.tx.size = sizeof(request); - return tegra_bpmp_transfer(bpmp, &msg); + err = tegra_bpmp_transfer(bpmp, &msg); + if (err) + return err; + if (msg.rx.ret) + return -EINVAL; + + return 0; } static int tegra_bpmp_reset_module(struct reset_controller_dev *rstc, -- Gitee From f0894049e9ec662a3fbb60d84f3124663a1efb9b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 26 Jul 2022 17:38:48 +0800 Subject: [PATCH 051/132] platform/x86: samsung-laptop: Fix an unsigned comparison which can never be negative stable inclusion from stable-v5.10.113 commit 490815f0b50e42238a22ea4b2dc64105f5a59e6f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=490815f0b50e42238a22ea4b2dc64105f5a59e6f -------------------------------- [ Upstream commit 0284d4d1be753f648f28b77bdfbe6a959212af5c ] Eliminate the follow smatch warnings: drivers/platform/x86/samsung-laptop.c:1124 kbd_led_set() warn: unsigned 'value' is never less than zero. 
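The removed branch was dead code because the comparison can never be true for an unsigned type, as this standalone snippet shows (compilers with -Wtype-limits flag exactly this):

#include <stdio.h>

int main(void)
{
	unsigned int value = (unsigned int)-5;  /* wraps to a large positive value */

	if (value < 0)                          /* always false for unsigned types */
		puts("never reached");
	else
		printf("value = %u; the < 0 test is meaningless\n", value);
	return 0;
}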
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220322061830.105579-1-jiapeng.chong@linux.alibaba.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/platform/x86/samsung-laptop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c index d5cec6e35bb8..0e456c39a603 100644 --- a/drivers/platform/x86/samsung-laptop.c +++ b/drivers/platform/x86/samsung-laptop.c @@ -1121,8 +1121,6 @@ static void kbd_led_set(struct led_classdev *led_cdev, if (value > samsung->kbd_led.max_brightness) value = samsung->kbd_led.max_brightness; - else if (value < 0) - value = 0; samsung->kbd_led_wk = value; queue_work(samsung->led_workqueue, &samsung->kbd_led_work); -- Gitee From 45f7650be7ed083d437c26a46a5148ee50c5641b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:49 +0800 Subject: [PATCH 052/132] ALSA: usb-audio: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit cd227ac03f2aa918ee3672fe2a5960dd0e7be331 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cd227ac03f2aa918ee3672fe2a5960dd0e7be331 -------------------------------- [ Upstream commit 1ef8715975de8bd481abbd0839ed4f49d9e5b0ff ] Fix: sound/usb/midi.c: In function ‘snd_usbmidi_out_endpoint_create’: sound/usb/midi.c:1389:2: error: case label does not reduce to an integer constant case USB_ID(0xfc08, 0x0101): /* Unknown vendor Cable */ ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. [ A slight correction with parentheses around the argument by tiwai ] Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220405151517.29753-3-bp@alien8.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/usb/usbaudio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index e54a98f46549..d8e31ee03b9d 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -8,7 +8,7 @@ */ /* handling of USB vendor/product ID pairs as 32-bit numbers */ -#define USB_ID(vendor, product) (((vendor) << 16) | (product)) +#define USB_ID(vendor, product) (((unsigned int)(vendor) << 16) | (product)) #define USB_ID_VENDOR(id) ((id) >> 16) #define USB_ID_PRODUCT(id) ((u16)(id)) -- Gitee From bade8d4a2dbd0d43173f01e1a0ec525814c13f44 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 26 Jul 2022 17:38:50 +0800 Subject: [PATCH 053/132] arm64: dts: imx: Fix imx8*-var-som touchscreen property sizes stable inclusion from stable-v5.10.113 commit 8e7ea11364758d43e577b7835b8e98f27927d56c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8e7ea11364758d43e577b7835b8e98f27927d56c -------------------------------- [ Upstream commit 1bc12d301594eafde0a8529d28d459af81053b3a ] The common touchscreen properties are all 32-bit, not 16-bit. These properties must not be too important as they are all ignored in case of an error reading them. 
Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/Yk3moe6Hz8ELM0iS@robh.at.kernel.org' Signed-off-by: Arnd Bergmann Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi | 8 ++++---- arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi index 49082529764f..0fac1f3f7f47 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi @@ -89,12 +89,12 @@ touchscreen@0 { pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <125>; - touchscreen-size-x = /bits/ 16 <4008>; + touchscreen-size-x = <4008>; ti,y-min = /bits/ 16 <282>; - touchscreen-size-y = /bits/ 16 <3864>; + touchscreen-size-y = <3864>; ti,x-plate-ohms = /bits/ 16 <180>; - touchscreen-max-pressure = /bits/ 16 <255>; - touchscreen-average-samples = /bits/ 16 <10>; + touchscreen-max-pressure = <255>; + touchscreen-average-samples = <10>; ti,debounce-tol = /bits/ 16 <3>; ti,debounce-rep = /bits/ 16 <1>; ti,settle-delay-usec = /bits/ 16 <150>; diff --git a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi index 7f356edf9f91..f6287f174355 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi @@ -70,12 +70,12 @@ touchscreen@0 { pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <125>; - touchscreen-size-x = /bits/ 16 <4008>; + touchscreen-size-x = <4008>; ti,y-min = /bits/ 16 <282>; - touchscreen-size-y = /bits/ 16 <3864>; + touchscreen-size-y = <3864>; ti,x-plate-ohms = /bits/ 16 <180>; - touchscreen-max-pressure = /bits/ 16 <255>; - touchscreen-average-samples = /bits/ 16 <10>; + touchscreen-max-pressure = <255>; + touchscreen-average-samples = <10>; ti,debounce-tol = /bits/ 16 <3>; ti,debounce-rep = /bits/ 16 <1>; ti,settle-delay-usec = /bits/ 16 <150>; -- Gitee From b9a0a6d062e89d410cd33f03b17ed7840e096128 Mon Sep 17 00:00:00 2001 From: Hongbin Wang Date: Tue, 26 Jul 2022 17:38:51 +0800 Subject: [PATCH 054/132] vxlan: fix error return code in vxlan_fdb_append stable inclusion from stable-v5.10.113 commit e129c55153c8aba74e4c73451399f12dacf6347d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e129c55153c8aba74e4c73451399f12dacf6347d -------------------------------- [ Upstream commit 7cea5560bf656b84f9ed01c0cc829d4eecd0640b ] When kmalloc and dst_cache_init failed, should return ENOMEM rather than ENOBUFS. Signed-off-by: Hongbin Wang Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c362a6ac94c4..73953b0141ba 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -710,11 +710,11 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) - return -ENOBUFS; + return -ENOMEM; if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { kfree(rd); - return -ENOBUFS; + return -ENOMEM; } rd->remote_ip = *ip; -- Gitee From 62f154bc19641ca00a8b926c5d28d3d1271eea1c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Jul 2022 17:38:52 +0800 Subject: [PATCH 055/132] cifs: Check the IOCB_DIRECT flag, not O_DIRECT stable inclusion from stable-v5.10.113 commit 5bef9fc38ffa3e0a712fcd1e07f4f123ba160242 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5bef9fc38ffa3e0a712fcd1e07f4f123ba160242 -------------------------------- [ Upstream commit 994fd530a512597ffcd713b0f6d5bc916c5698f0 ] Use the IOCB_DIRECT indicator flag on the I/O context rather than checking to see if the file was opened O_DIRECT. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index aa5a4d759ca2..370188b2a55d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -898,7 +898,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); -- Gitee From 652cce7aeaa6bf232f67c210d878f48733c34045 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 26 Jul 2022 17:38:53 +0800 Subject: [PATCH 056/132] net: atlantic: Avoid out-of-bounds indexing stable inclusion from stable-v5.10.113 commit 0de9c104d04aed5e464bd788abb423ed333980c7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0de9c104d04aed5e464bd788abb423ed333980c7 -------------------------------- [ Upstream commit 8d3a6c37d50d5a0504c126c932cc749e6dd9c78f ] UBSAN warnings are observed on atlantic driver: [ 294.432996] UBSAN: array-index-out-of-bounds in /build/linux-Qow4fL/linux-5.15.0/drivers/net/ethernet/aquantia/atlantic/aq_nic.c:484:48 [ 294.433695] index 8 is out of range for type 'aq_vec_s *[8]' The ring is dereferenced right before breaking out the loop, to prevent that from happening, only use the index in the loop to fix the issue. 
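[ Illustrative sketch, not part of the applied diff, with names
  simplified from the driver. The old idiom evaluates the third for()
  clause before re-checking the bound, so the element one past the end
  of the array is read on the final pass; the fix indexes inside the
  loop body instead:

	/* before: vec[count] is dereferenced once i reaches count */
	for (i = 0U, v = vec[0]; count > i; ++i, v = vec[i])
		use(v);

	/* after: check the bound first, then dereference */
	for (i = 0U; count > i; ++i) {
		v = vec[i];
		use(v);
	}
]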
BugLink: https://bugs.launchpad.net/bugs/1958770 Tested-by: Mario Limonciello Signed-off-by: Kai-Heng Feng Reviewed-by: Igor Russkikh Link: https://lore.kernel.org/r/20220408022204.16815-1-kai.heng.feng@canonical.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- .../net/ethernet/aquantia/atlantic/aq_nic.c | 8 +++---- .../net/ethernet/aquantia/atlantic/aq_vec.c | 24 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 0cf8ae8aeac8..2fb4126ae8d8 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -480,8 +480,8 @@ int aq_nic_start(struct aq_nic_s *self) if (err < 0) goto err_exit; - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { + for (i = 0U; self->aq_vecs > i; ++i) { + aq_vec = self->aq_vec[i]; err = aq_vec_start(aq_vec); if (err < 0) goto err_exit; @@ -511,8 +511,8 @@ int aq_nic_start(struct aq_nic_s *self) mod_timer(&self->polling_timer, jiffies + AQ_CFG_POLLING_TIMER_INTERVAL); } else { - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { + for (i = 0U; self->aq_vecs > i; ++i) { + aq_vec = self->aq_vec[i]; err = aq_pci_func_alloc_irq(self, i, self->ndev->name, aq_vec_isr, aq_vec, aq_vec_get_affinity_mask(aq_vec)); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index f4774cf051c9..6ab1f3212d24 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -43,8 +43,8 @@ static int aq_vec_poll(struct napi_struct *napi, int budget) if (!self) { err = -EINVAL; } else { - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; u64_stats_update_begin(&ring[AQ_VEC_RX_ID].stats.rx.syncp); ring[AQ_VEC_RX_ID].stats.rx.polls++; u64_stats_update_end(&ring[AQ_VEC_RX_ID].stats.rx.syncp); @@ -182,8 +182,8 @@ int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops, self->aq_hw_ops = aq_hw_ops; self->aq_hw = aq_hw; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; err = aq_ring_init(&ring[AQ_VEC_TX_ID], ATL_RING_TX); if (err < 0) goto err_exit; @@ -224,8 +224,8 @@ int aq_vec_start(struct aq_vec_s *self) unsigned int i = 0U; int err = 0; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; err = self->aq_hw_ops->hw_ring_tx_start(self->aq_hw, &ring[AQ_VEC_TX_ID]); if (err < 0) @@ -248,8 +248,8 @@ void aq_vec_stop(struct aq_vec_s *self) struct aq_ring_s *ring = NULL; unsigned int i = 0U; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; self->aq_hw_ops->hw_ring_tx_stop(self->aq_hw, &ring[AQ_VEC_TX_ID]); @@ -268,8 +268,8 @@ void aq_vec_deinit(struct aq_vec_s *self) if (!self) goto err_exit; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]); aq_ring_rx_deinit(&ring[AQ_VEC_RX_ID]); } @@ -297,8 
+297,8 @@ void aq_vec_ring_free(struct aq_vec_s *self) if (!self) goto err_exit; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; aq_ring_free(&ring[AQ_VEC_TX_ID]); if (i < self->rx_rings) aq_ring_free(&ring[AQ_VEC_RX_ID]); -- Gitee From e10c83e7cbb797c3b2d7dbe6c704fa84cebf6fc6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:54 +0800 Subject: [PATCH 057/132] mt76: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 202748f441488e08c730736bf5def1ae08f16811 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=202748f441488e08c730736bf5def1ae08f16811 -------------------------------- [ Upstream commit dbc2b1764734857d68425468ffa8486e97ab89df ] Fix: drivers/net/wireless/mediatek/mt76/mt76x2/pci.c: In function ‘mt76x2e_probe’: ././include/linux/compiler_types.h:352:38: error: call to ‘__compiletime_assert_946’ \ declared with attribute error: FIELD_PREP: mask is not constant _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Signed-off-by: Borislav Petkov Cc: Felix Fietkau Cc: Lorenzo Bianconi Cc: Ryder Lee Cc: Shayne Chen Cc: Sean Wang Cc: Kalle Valo Cc: "David S. Miller" Cc: Jakub Kicinski Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220405151517.29753-9-bp@alien8.de Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/wireless/mediatek/mt76/mt76x2/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c index ecaf85b483ac..e57e49a722dc 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c @@ -80,7 +80,7 @@ mt76x2e_probe(struct pci_dev *pdev, const struct pci_device_id *id) mt76_rmw_field(dev, 0x15a10, 0x1f << 16, 0x9); /* RG_SSUSB_G1_CDR_BIC_LTR = 0xf */ - mt76_rmw_field(dev, 0x15a0c, 0xf << 28, 0xf); + mt76_rmw_field(dev, 0x15a0c, 0xfU << 28, 0xf); /* RG_SSUSB_CDR_BR_PE1D = 0x3 */ mt76_rmw_field(dev, 0x15c58, 0x3 << 6, 0x3); -- Gitee From d89078f497c0d04cb4ec777eca7bc4a451c68b00 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jul 2022 17:38:55 +0800 Subject: [PATCH 058/132] brcmfmac: sdio: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit b3afe5a7fd7548fa3a818f8dd20e22db4b8db019 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b3afe5a7fd7548fa3a818f8dd20e22db4b8db019 -------------------------------- [ Upstream commit 6fb3a5868b2117611f41e421e10e6a8c2a13039a ] Fix: drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c: In function ‘brcmf_sdio_drivestrengthinit’: drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:3798:2: error: case label does not reduce to an integer constant case 
SDIOD_DRVSTR_KEY(BRCM_CC_43143_CHIP_ID, 17): ^~~~ drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:3809:2: error: case label does not reduce to an integer constant case SDIOD_DRVSTR_KEY(BRCM_CC_43362_CHIP_ID, 13): ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Signed-off-by: Borislav Petkov Cc: Arend van Spriel Cc: Franky Lin Cc: Hante Meuleman Cc: Kalle Valo Cc: "David S. Miller" Cc: Jakub Kicinski Cc: brcm80211-dev-list.pdl@broadcom.com Cc: netdev@vger.kernel.org Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/Ykx0iRlvtBnKqtbG@zn.tnic Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 6d5d5c39c635..9929e90866f0 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -557,7 +557,7 @@ enum brcmf_sdio_frmtype { BRCMF_SDIO_FT_SUB, }; -#define SDIOD_DRVSTR_KEY(chip, pmu) (((chip) << 16) | (pmu)) +#define SDIOD_DRVSTR_KEY(chip, pmu) (((unsigned int)(chip) << 16) | (pmu)) /* SDIO Pad drive strength to select value mappings */ struct sdiod_drive_str { -- Gitee From d1c00ee05775fa5b9a20a1392fe710e785fc68c9 Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Tue, 26 Jul 2022 17:38:56 +0800 Subject: [PATCH 059/132] dpaa_eth: Fix missing of_node_put in dpaa_get_ts_info() stable inclusion from stable-v5.10.113 commit 8d71edabb0abe6c8e3e1601e73f63f81783b6cc0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8d71edabb0abe6c8e3e1601e73f63f81783b6cc0 -------------------------------- [ Upstream commit 1a7eb80d170c28be2928433702256fe2a0bd1e0f ] Both of of_get_parent() and of_parse_phandle() return node pointer with refcount incremented, use of_node_put() on it to decrease refcount when done. Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index 763d2c7b5fb1..5750f9a56393 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -489,11 +489,15 @@ static int dpaa_get_ts_info(struct net_device *net_dev, info->phc_index = -1; fman_node = of_get_parent(mac_node); - if (fman_node) + if (fman_node) { ptp_node = of_parse_phandle(fman_node, "ptimer-handle", 0); + of_node_put(fman_node); + } - if (ptp_node) + if (ptp_node) { ptp_dev = of_find_device_by_node(ptp_node); + of_node_put(ptp_node); + } if (ptp_dev) ptp = platform_get_drvdata(ptp_dev); -- Gitee From 63fc6600e0de84218fd9587b1037e42025fb27c9 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 26 Jul 2022 17:38:57 +0800 Subject: [PATCH 060/132] drm/msm/mdp5: check the return of kzalloc() stable inclusion from stable-v5.10.113 commit 9581e07b549bd911ce3af12b57ea2646b4995ebf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9581e07b549bd911ce3af12b57ea2646b4995ebf -------------------------------- [ Upstream commit 047ae665577776b7feb11bd4f81f46627cff95e7 ] kzalloc() is a memory allocation function which can return NULL when some internal memory errors happen. So it is better to check it to prevent potential wrong memory access. Besides, since mdp5_plane_reset() is void type, so we should better set `plane-state` to NULL after releasing it. Signed-off-by: Xiaoke Wang Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/481055/ Link: https://lore.kernel.org/r/tencent_8E2A1C78140EE1784AB2FF4B2088CC0AB908@qq.com Signed-off-by: Dmitry Baryshkov Signed-off-by: Rob Clark Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c index 83423092de2f..da0799333970 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c @@ -179,7 +179,10 @@ static void mdp5_plane_reset(struct drm_plane *plane) drm_framebuffer_put(plane->state->fb); kfree(to_mdp5_plane_state(plane->state)); + plane->state = NULL; mdp5_state = kzalloc(sizeof(*mdp5_state), GFP_KERNEL); + if (!mdp5_state) + return; /* assign default blend parameters */ mdp5_state->alpha = 255; -- Gitee From 6f2cb02318c126edf72a06c12938b3bc93a7de1a Mon Sep 17 00:00:00 2001 From: Tomas Melin Date: Tue, 26 Jul 2022 17:38:58 +0800 Subject: [PATCH 061/132] net: macb: Restart tx only if queue pointer is lagging stable inclusion from stable-v5.10.113 commit a284cca3d81ae5e5f5f23c09b8298339188953a5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a284cca3d81ae5e5f5f23c09b8298339188953a5 -------------------------------- [ Upstream commit 5ad7f18cd82cee8e773d40cc7a1465a526f2615c ] commit 4298388574da ("net: macb: restart tx after tx used bit read") added support for restarting transmission. 
Restarting tx does not work in case controller asserts TXUBR interrupt and TQBP is already at the end of the tx queue. In that situation, restarting tx will immediately cause assertion of another TXUBR interrupt. The driver will end up in an infinite interrupt loop which it cannot break out of. For cases where TQBP is at the end of the tx queue, instead only clear TX_USED interrupt. As more data gets pushed to the queue, transmission will resume. This issue was observed on a Xilinx Zynq-7000 based board. During stress test of the network interface, driver would get stuck on interrupt loop within seconds or minutes causing CPU to stall. Signed-off-by: Tomas Melin Tested-by: Claudiu Beznea Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220407161659.14532-1-tomas.melin@vaisala.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/cadence/macb_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 78c6d133f54f..3244f69555f7 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1531,6 +1531,7 @@ static void macb_tx_restart(struct macb_queue *queue) unsigned int head = queue->tx_head; unsigned int tail = queue->tx_tail; struct macb *bp = queue->bp; + unsigned int head_idx, tbqp; if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) queue_writel(queue, ISR, MACB_BIT(TXUBR)); @@ -1538,6 +1539,13 @@ static void macb_tx_restart(struct macb_queue *queue) if (head == tail) return; + tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp); + tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp)); + head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head)); + + if (tbqp == head_idx) + return; + macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); } -- Gitee From 0389f1ff99ad7375ff5980b964ab00267a72af25 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 26 Jul 2022 17:38:59 +0800 Subject: [PATCH 062/132] scsi: qedi: Fix failed disconnect handling stable inclusion from stable-v5.10.113 commit bf28bba3041055759ce6afe5e61bc4fd709ea6e4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=bf28bba3041055759ce6afe5e61bc4fd709ea6e4 -------------------------------- [ Upstream commit 857b06527f707f5df634b854898a191b5c1d0272 ] We set the qedi_ep state to EP_STATE_OFLDCONN_START when the ep is created. Then in qedi_set_path we kick off the offload work. If userspace times out the connection and calls ep_disconnect, qedi will only flush the offload work if the qedi_ep state has transitioned away from EP_STATE_OFLDCONN_START. If we can't connect we will not have transitioned state and will leave the offload work running, and we will free the qedi_ep from under it. This patch just has us init the work when we create the ep, then always flush it. Link: https://lore.kernel.org/r/20220408001314.5014-10-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Acked-by: Manish Rangankar Signed-off-by: Mike Christie Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/scsi/qedi/qedi_iscsi.c | 69 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index ba9a22e55e32..8003b3519b95 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -828,6 +828,37 @@ static int qedi_task_xmit(struct iscsi_task *task) return qedi_iscsi_send_ioreq(task); } +static void qedi_offload_work(struct work_struct *work) +{ + struct qedi_endpoint *qedi_ep = + container_of(work, struct qedi_endpoint, offload_work); + struct qedi_ctx *qedi; + int wait_delay = 5 * HZ; + int ret; + + qedi = qedi_ep->qedi; + + ret = qedi_iscsi_offload_conn(qedi_ep); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", + qedi_ep->iscsi_cid, qedi_ep, ret); + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + return; + } + + ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, + (qedi_ep->state == + EP_STATE_OFLDCONN_COMPL), + wait_delay); + if (ret <= 0 || qedi_ep->state != EP_STATE_OFLDCONN_COMPL) { + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + QEDI_ERR(&qedi->dbg_ctx, + "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", + qedi_ep->iscsi_cid, qedi_ep); + } +} + static struct iscsi_endpoint * qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, int non_blocking) @@ -876,6 +907,7 @@ qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, } qedi_ep = ep->dd_data; memset(qedi_ep, 0, sizeof(struct qedi_endpoint)); + INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); qedi_ep->state = EP_STATE_IDLE; qedi_ep->iscsi_cid = (u32)-1; qedi_ep->qedi = qedi; @@ -1026,12 +1058,11 @@ static void qedi_ep_disconnect(struct iscsi_endpoint *ep) qedi_ep = ep->dd_data; qedi = qedi_ep->qedi; + flush_work(&qedi_ep->offload_work); + if (qedi_ep->state == EP_STATE_OFLDCONN_START) goto ep_exit_recover; - if (qedi_ep->state != EP_STATE_OFLDCONN_NONE) - flush_work(&qedi_ep->offload_work); - if (qedi_ep->conn) { qedi_conn = qedi_ep->conn; conn = qedi_conn->cls_conn->dd_data; @@ -1196,37 +1227,6 @@ static int qedi_data_avail(struct qedi_ctx *qedi, u16 vlanid) return rc; } -static void qedi_offload_work(struct work_struct *work) -{ - struct qedi_endpoint *qedi_ep = - container_of(work, struct qedi_endpoint, offload_work); - struct qedi_ctx *qedi; - int wait_delay = 5 * HZ; - int ret; - - qedi = qedi_ep->qedi; - - ret = qedi_iscsi_offload_conn(qedi_ep); - if (ret) { - QEDI_ERR(&qedi->dbg_ctx, - "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", - qedi_ep->iscsi_cid, qedi_ep, ret); - qedi_ep->state = EP_STATE_OFLDCONN_FAILED; - return; - } - - ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, - (qedi_ep->state == - EP_STATE_OFLDCONN_COMPL), - wait_delay); - if ((ret <= 0) || (qedi_ep->state != EP_STATE_OFLDCONN_COMPL)) { - qedi_ep->state = EP_STATE_OFLDCONN_FAILED; - QEDI_ERR(&qedi->dbg_ctx, - "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", - qedi_ep->iscsi_cid, qedi_ep); - } -} - static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) { struct qedi_ctx *qedi; @@ -1342,7 +1342,6 @@ static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) qedi_ep->dst_addr, qedi_ep->dst_port); } - INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); queue_work(qedi->offload_thread, &qedi_ep->offload_work); ret = 0; -- Gitee From 7bec28503270e64c6bd754f2d882beeef7c3278a Mon Sep 17 
00:00:00 2001 From: Mikulas Patocka Date: Tue, 26 Jul 2022 17:39:00 +0800 Subject: [PATCH 063/132] stat: fix inconsistency between struct stat and struct compat_stat stable inclusion from stable-v5.10.113 commit 76101c8e0c31938e59bbcb4beed47c8d8b89d39a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=76101c8e0c31938e59bbcb4beed47c8d8b89d39a -------------------------------- [ Upstream commit 932aba1e169090357a77af18850a10c256b50819 ] struct stat (defined in arch/x86/include/uapi/asm/stat.h) has 32-bit st_dev and st_rdev; struct compat_stat (defined in arch/x86/include/asm/compat.h) has 16-bit st_dev and st_rdev followed by a 16-bit padding. This patch fixes struct compat_stat to match struct stat. [ Historical note: the old x86 'struct stat' did have that 16-bit field that the compat layer had kept around, but it was changes back in 2003 by "struct stat - support larger dev_t": https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=e95b2065677fe32512a597a79db94b77b90c968d and back in those days, the x86_64 port was still new, and separate from the i386 code, and had already picked up the old version with a 16-bit st_dev field ] Note that we can't change compat_dev_t because it is used by compat_loop_info. Also, if the st_dev and st_rdev values are 32-bit, we don't have to use old_valid_dev to test if the value fits into them. This fixes -EOVERFLOW on filesystems that are on NVMe because NVMe uses the major number 259. Signed-off-by: Mikulas Patocka Cc: Andreas Schwab Cc: Matthew Wilcox Cc: Christoph Hellwig Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/x86/include/asm/compat.h | 6 ++---- fs/stat.c | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 0e327a01f50f..46a067bd7e0b 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -29,15 +29,13 @@ typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { - compat_dev_t st_dev; - u16 __pad1; + u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; - compat_dev_t st_rdev; - u16 __pad2; + u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; diff --git a/fs/stat.c b/fs/stat.c index 1196af4d1ea0..04550c0ba540 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -306,9 +306,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -317,7 +314,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -325,7 +324,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; 
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -335,7 +334,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -616,11 +615,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -630,7 +631,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; -- Gitee From 47590400345a095df3f6c386c59924849fc0fe28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jul 2022 17:39:01 +0800 Subject: [PATCH 064/132] nvme: add a quirk to disable namespace identifiers stable inclusion from stable-v5.10.113 commit 316bd86c2261fffa37577c3d35721a9337248e54 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=316bd86c2261fffa37577c3d35721a9337248e54 -------------------------------- [ Upstream commit 00ff400e6deee00f7b15e200205b2708b63b8cf6 ] Add a quirk to disable using and exporting namespace identifiers for controllers where they are broken beyond repair. The most directly visible problem with non-unique namespace identifiers is that they break the /dev/disk/by-id/ links, with the link for a supposedly unique identifier now pointing to one of multiple possible namespaces that share the same ID, and a somewhat random selection of which one actually shows up. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/nvme/host/core.c | 24 ++++++++++++++++++------ drivers/nvme/host/nvme.h | 5 +++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dcc047f01a07..274635c0c02a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1250,6 +1250,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_EUI64_LEN; memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); return NVME_NIDT_EUI64_LEN; case NVME_NIDT_NGUID: @@ -1258,6 +1260,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_NGUID_LEN; memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); return NVME_NIDT_NGUID_LEN; case NVME_NIDT_UUID: @@ -1266,6 +1270,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_UUID_LEN; uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; case NVME_NIDT_CSI: @@ -1361,12 +1367,18 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, if ((*id)->ncap == 0) /* namespace not allocated or attached */ goto out_free_id; - if (ctrl->vs >= NVME_VS(1, 1, 0) && - !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); - if (ctrl->vs >= NVME_VS(1, 2, 0) && - !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) { + dev_info(ctrl->device, + "Ignoring bogus Namespace Identifiers\n"); + } else { + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + } return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 94cee2c566d3..11d3cc2890f9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -151,6 +151,11 @@ enum nvme_quirks { * encoding the generation sequence number. */ NVME_QUIRK_SKIP_CID_GEN = (1 << 17), + + /* + * Reports garbage in the namespace identifiers (eui64, nguid, uuid). + */ + NVME_QUIRK_BOGUS_NID = (1 << 18), }; /* -- Gitee From 983009806cf6c01a28ac7890c08a83cc38e75bc8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jul 2022 17:39:02 +0800 Subject: [PATCH 065/132] nvme-pci: disable namespace identifiers for Qemu controllers stable inclusion from stable-v5.10.113 commit 7ec6e06ee405756f90226b946790a5a7a65795b7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7ec6e06ee405756f90226b946790a5a7a65795b7 -------------------------------- [ Upstream commit 66dd346b84d79fde20832ed691a54f4881eac20d ] Qemu unconditionally reports a UUID, which depending on the qemu version is either all-null (which is incorrect but harmless) or contains a single bit set for all controllers. 
In addition it can also optionally report a eui64 which needs to be manually set. Disable namespace identifiers for Qemu controlles entirely even if in some cases they could be set correctly through manual intervention. Reported-by: Luis Chamberlain Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/nvme/host/pci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f435ab0809fb..e0a3d03198a2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3212,7 +3212,10 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS | - NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + NVME_QUIRK_DISABLE_WRITE_ZEROES | + NVME_QUIRK_BOGUS_NID, }, + { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ -- Gitee From 3d0cf64034acd4253f9d7e4e54c0ece407faa42c Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Tue, 26 Jul 2022 17:39:03 +0800 Subject: [PATCH 066/132] EDAC/synopsys: Read the error count from the correct register stable inclusion from stable-v5.10.113 commit 50cbc583fa838a63f8447251b88da569c3d36ba6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=50cbc583fa838a63f8447251b88da569c3d36ba6 -------------------------------- commit e2932d1f6f055b2af2114c7e64a26dc1b5593d0c upstream. Currently, the error count is read wrongly from the status register. Read the count from the proper error count register (ERRCNT). [ bp: Massage. 
] Fixes: b500b4a029d5 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller") Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov Acked-by: Michal Simek Cc: Link: https://lore.kernel.org/r/20220414102813.4468-1-shubhrajyoti.datta@xilinx.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/edac/synopsys_edac.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 92906b56b1a2..fea44dc0484b 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -163,6 +163,11 @@ #define ECC_STAT_CECNT_SHIFT 8 #define ECC_STAT_BITNUM_MASK 0x7F +/* ECC error count register definitions */ +#define ECC_ERRCNT_UECNT_MASK 0xFFFF0000 +#define ECC_ERRCNT_UECNT_SHIFT 16 +#define ECC_ERRCNT_CECNT_MASK 0xFFFF + /* DDR QOS Interrupt register definitions */ #define DDR_QOS_IRQ_STAT_OFST 0x20200 #define DDR_QOSUE_MASK 0x4 @@ -418,15 +423,16 @@ static int zynqmp_get_error_info(struct synps_edac_priv *priv) base = priv->baseaddr; p = &priv->stat; + regval = readl(base + ECC_ERRCNT_OFST); + p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK; + p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT; + if (!p->ce_cnt) + goto ue_err; + regval = readl(base + ECC_STAT_OFST); if (!regval) return 1; - p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT; - p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT; - if (!p->ce_cnt) - goto ue_err; - p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK); regval = readl(base + ECC_CEADDR0_OFST); -- Gitee From eb179e5cfb436d79cb432f855a26e7596e24531e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 26 Jul 2022 17:39:04 +0800 Subject: [PATCH 067/132] mm, hugetlb: allow for "high" userspace addresses stable inclusion from stable-v5.10.113 commit 6b932920b96fc3002352fe8225ec63a1cd1717ec category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6b932920b96fc3002352fe8225ec63a1cd1717ec -------------------------------- commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream. This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") for hugetlb. This patch adds support for "high" userspace addresses that are optionally supported on the system and have to be requested via a hint mechanism ("high" addr parameter to mmap). Architectures such as powerpc and x86 achieve this by making changes to their architectural versions of hugetlb_get_unmapped_area() function. However, arm64 uses the generic version of that function. So take into account arch_get_mmap_base() and arch_get_mmap_end() in hugetlb_get_unmapped_area(). To allow that, move those two macros out of mm/mmap.c into include/linux/sched/mm.h If these macros are not defined in architectural code then they default to (TASK_SIZE) and (base) so should not introduce any behavioural changes to architectures that do not define them. For the time being, only ARM64 is affected by this change. Catalin (ARM64) said "We should have fixed hugetlb_get_unmapped_area() as well when we added support for 52-bit VA. The reason for commit f6795053dac8 was to prevent normal mmap() from returning addresses above 48-bit by default as some user-space had hard assumptions about this. It's a slight ABI change if you do this for hugetlb_get_unmapped_area() but I doubt anyone would notice. 
It's more likely that the current behaviour would cause issues, so I'd rather have them consistent. Basically when arm64 gained support for 52-bit addresses we did not want user-space calling mmap() to suddenly get such high addresses, otherwise we could have inadvertently broken some programs (similar behaviour to x86 here). Hence we added commit f6795053dac8. But we missed hugetlbfs which could still get such high mmap() addresses. So in theory that's a potential regression that should have bee addressed at the same time as commit f6795053dac8 (and before arm64 enabled 52-bit addresses)" Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") Signed-off-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Cc: [5.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Conflicts: fs/hugetlbfs/inode.c Acked-by: Xie XiuQi --- fs/hugetlbfs/inode.c | 9 +++++---- include/linux/sched/mm.h | 8 ++++++++ mm/mmap.c | 8 -------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6f2943465bff..8a87d1b43387 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -252,7 +252,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -272,7 +272,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -291,7 +291,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags); @@ -309,6 +309,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -328,7 +329,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index dc1f4dcd9a82..e3e5e149b00e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -106,6 +106,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/mm/mmap.c b/mm/mmap.c index 
5ad32537604a..5489d70db84e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2404,14 +2404,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * -- Gitee From ccc63440f2c0b4d4473bfecbaf1a19e3c14afce7 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 26 Jul 2022 17:39:05 +0800 Subject: [PATCH 068/132] mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 9ca66d79143980260be615b964b8dc1504a5d0c6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9ca66d79143980260be615b964b8dc1504a5d0c6 -------------------------------- commit 319561669a59d8e9206ab311ae5433ef92fd79d1 upstream. In some cases it is possible for mmu_interval_notifier_remove() to race with mn_tree_inv_end() allowing it to return while the notifier data structure is still in use. Consider the following sequence: CPU0 - mn_tree_inv_end() CPU1 - mmu_interval_notifier_remove() ----------------------------------- ------------------------------------ spin_lock(subscriptions->lock); seq = subscriptions->invalidate_seq; spin_lock(subscriptions->lock); spin_unlock(subscriptions->lock); subscriptions->invalidate_seq++; wait_event(invalidate_seq != seq); return; interval_tree_remove(interval_sub); kfree(interval_sub); spin_unlock(subscriptions->lock); wake_up_all(); As the wait_event() condition is true it will return immediately. This can lead to use-after-free type errors if the caller frees the data structure containing the interval notifier subscription while it is still on a deferred list. Fix this by taking the appropriate lock when reading invalidate_seq to ensure proper synchronisation. I observed this whilst running stress testing during some development. You do have to be pretty unlucky, but it leads to the usual problems of use-after-free (memory corruption, kernel crash, difficult to diagnose WARN_ON, etc). 
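[ Illustrative sketch, not part of the applied diff, using shortened
  names; the real helper is mmu_interval_seq_released() in the hunk
  below. The waiter may only return after the writer has finished the
  deferred tree updates it performs under the same lock, so the
  wait_event() condition itself has to take that lock rather than rely
  on a plain READ_ONCE():

	static bool seq_released(struct subscriptions *s, unsigned long seq)
	{
		bool ret;

		spin_lock(&s->lock);
		ret = s->invalidate_seq != seq;
		spin_unlock(&s->lock);
		return ret;
	}

	/* in the remove path */
	wait_event(s->wq, seq_released(s, seq));
]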
Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier") Signed-off-by: Alistair Popple Signed-off-by: Jason Gunthorpe Cc: Christian König Cc: John Hubbard Cc: Ralph Campbell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- mm/mmu_notifier.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 07f42a7a6065..9165ca619c8c 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1043,6 +1043,18 @@ int mmu_interval_notifier_insert_locked( } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + /** * mmu_interval_notifier_remove - Remove a interval notifier * @interval_sub: Interval subscription to unregister @@ -1090,7 +1102,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) wait_event(subscriptions->wq, - READ_ONCE(subscriptions->invalidate_seq) != seq); + mmu_interval_seq_released(subscriptions, seq)); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); -- Gitee From 0affb0e98c917096fe752a041443b815bb2b61ba Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Tue, 26 Jul 2022 17:39:06 +0800 Subject: [PATCH 069/132] ata: pata_marvell: Check the 'bmdma_addr' beforing reading stable inclusion from stable-v5.10.113 commit cf23a960c5c62ddd8283a855a742977ee983a62a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cf23a960c5c62ddd8283a855a742977ee983a62a -------------------------------- commit aafa9f958342db36c17ac2a7f1b841032c96feb4 upstream. 
Before detecting the cable type on the dma bar, the driver should check whether the 'bmdma_addr' is zero, which means the adapter does not support DMA, otherwise we will get the following error: [ 5.146634] Bad IO access at port 0x1 (return inb(port)) [ 5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60 [ 5.150856] RIP: 0010:ioread8+0x4a/0x60 [ 5.160238] Call Trace: [ 5.160470] [ 5.160674] marvell_cable_detect+0x6e/0xc0 [pata_marvell] [ 5.161728] ata_eh_recover+0x3520/0x6cc0 [ 5.168075] ata_do_eh+0x49/0x3c0 Signed-off-by: Zheyu Ma Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/ata/pata_marvell.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c index b066809ba9a1..c56f4043b0cc 100644 --- a/drivers/ata/pata_marvell.c +++ b/drivers/ata/pata_marvell.c @@ -83,6 +83,8 @@ static int marvell_cable_detect(struct ata_port *ap) switch(ap->port_no) { case 0: + if (!ap->ioaddr.bmdma_addr) + return ATA_CBL_PATA_UNK; if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1) return ATA_CBL_PATA40; return ATA_CBL_PATA80; -- Gitee From d5e4b0822bc88bcd32070ab02176509d7a9eb06a Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 26 Jul 2022 17:39:07 +0800 Subject: [PATCH 070/132] dma: at_xdmac: fix a missing check on list iterator stable inclusion from stable-v5.10.113 commit 358a3846f6a950b8db803262e74081ad4ab235d4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=358a3846f6a950b8db803262e74081ad4ab235d4 -------------------------------- commit 206680c4e46b62fd8909385e0874a36952595b85 upstream. The bug is here: __func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue); The list iterator 'desc' will point to a bogus position containing HEAD if the list is empty or no element is found. To avoid dev_dbg() prints a invalid address, use a new variable 'iter' as the list iterator, while use the origin variable 'desc' as a dedicated pointer to point to the found element. Cc: stable@vger.kernel.org Fixes: 82e2424635f4c ("dmaengine: xdmac: fix print warning on dma_addr_t variable") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220327061154.4867-1-xiam0nd.tong@gmail.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/dma/at_xdmac.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 90afba0b36fe..47552db6b8dc 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1390,7 +1390,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, { struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan); struct at_xdmac *atxdmac = to_at_xdmac(atchan->chan.device); - struct at_xdmac_desc *desc, *_desc; + struct at_xdmac_desc *desc, *_desc, *iter; struct list_head *descs_list; enum dma_status ret; int residue, retry; @@ -1505,11 +1505,13 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, * microblock. 
*/ descs_list = &desc->descs_list; - list_for_each_entry_safe(desc, _desc, descs_list, desc_node) { - dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg); - residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth; - if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda) + list_for_each_entry_safe(iter, _desc, descs_list, desc_node) { + dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg); + residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth; + if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) { + desc = iter; break; + } } residue += cur_ubc << dwidth; -- Gitee From 2d19822cbd21ac865ee8a7dc3376e640a457ee1f Mon Sep 17 00:00:00 2001 From: Manuel Ullmann Date: Tue, 26 Jul 2022 17:39:08 +0800 Subject: [PATCH 071/132] net: atlantic: invert deep par in pm functions, preventing null derefs stable inclusion from stable-v5.10.113 commit ba2716da233618c3f361b8ece818a0e91de7a8f0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ba2716da233618c3f361b8ece818a0e91de7a8f0 -------------------------------- commit cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c upstream. This will reset deeply on freeze and thaw instead of suspend and resume and prevent null pointer dereferences of the uninitialized ring 0 buffer while thawing. The impact is an indefinitely hanging kernel. You can't switch consoles after this and the only possible user interaction is SysRq. BUG: kernel NULL pointer dereference RIP: 0010:aq_ring_rx_fill+0xcf/0x210 [atlantic] aq_vec_init+0x85/0xe0 [atlantic] aq_nic_init+0xf7/0x1d0 [atlantic] atl_resume_common+0x4f/0x100 [atlantic] pci_pm_thaw+0x42/0xa0 resolves in aq_ring.o to ``` 0000000000000ae0 : { /* ... */ baf: 48 8b 43 08 mov 0x8(%rbx),%rax buff->flags = 0U; /* buff is NULL */ ``` The bug has been present since the introduction of the new pm code in 8aaa112a57c1 ("net: atlantic: refactoring pm logic") and was hidden until 8ce84271697a ("net: atlantic: changes for multi-TC support"), which refactored the aq_vec_{free,alloc} functions into aq_vec_{,ring}_{free,alloc}, but is technically not wrong. The original functions just always reinitialized the buffers on S3/S4. If the interface is down before freezing, the bug does not occur. It does not matter, whether the initrd contains and loads the module before thawing. So the fix is to invert the boolean parameter deep in all pm function calls, which was clearly intended to be set like that. First report was on Github [1], which you have to guess from the resume logs in the posted dmesg snippet. Recently I posted one on Bugzilla [2], since I did not have an AQC device so far. #regzbot introduced: 8ce84271697a #regzbot from: koo5 #regzbot monitor: https://github.com/Aquantia/AQtion/issues/32 Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic") Link: https://github.com/Aquantia/AQtion/issues/32 [1] Link: https://bugzilla.kernel.org/show_bug.cgi?id=215798 [2] Cc: stable@vger.kernel.org Reported-by: koo5 Signed-off-by: Manuel Ullmann Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 1826253f97dc..bdfd462c74db 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -450,22 +450,22 @@ static int atl_resume_common(struct device *dev, bool deep) static int aq_pm_freeze(struct device *dev) { - return aq_suspend_common(dev, false); + return aq_suspend_common(dev, true); } static int aq_pm_suspend_poweroff(struct device *dev) { - return aq_suspend_common(dev, true); + return aq_suspend_common(dev, false); } static int aq_pm_thaw(struct device *dev) { - return atl_resume_common(dev, false); + return atl_resume_common(dev, true); } static int aq_pm_resume_restore(struct device *dev) { - return atl_resume_common(dev, true); + return atl_resume_common(dev, false); } static const struct dev_pm_ops aq_pm_ops = { -- Gitee From e85467a78402f38cdbc4a15730731610dd8aa195 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 26 Jul 2022 17:39:09 +0800 Subject: [PATCH 072/132] xtensa: patch_text: Fixup last cpu should be master stable inclusion from stable-v5.10.113 commit f399ab11dd6cc0a6927029371fbe2ec027181598 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f399ab11dd6cc0a6927029371fbe2ec027181598 -------------------------------- commit ee69d4be8fd064cd08270b4808d2dfece3614ee0 upstream. These patch_text implementations are using stop_machine_cpuslocked infrastructure with atomic cpu_count. The original idea: When the master CPU patch_text, the others should wait for it. But current implementation is using the first CPU as master, which couldn't guarantee the remaining CPUs are waiting. This patch changes the last CPU as the master to solve the potential risk. 
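[ Illustrative sketch, not part of the applied diff: the handler runs on
  every CPU under stop_machine, and only the last CPU to arrive may
  patch, because at that point every other CPU has already entered the
  handler and is no longer executing the text being modified. Roughly
  (the waiter branch is paraphrased and simplified, it is not shown in
  the hunk below):

	static int patch_text_stop_machine(void *data)
	{
		struct patch *patch = data;

		if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) {
			/* last to arrive: everyone else is parked below */
			local_patch_text(patch->addr, patch->data, patch->sz);
			atomic_inc(&patch->cpu_count);
		} else {
			/* wait until the master signals completion */
			while (atomic_read(&patch->cpu_count) <= num_online_cpus())
				cpu_relax();
		}
		return 0;
	}
]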
Fixes: 64711f9a47d4 ("xtensa: implement jump_label support") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Max Filippov Reviewed-by: Masami Hiramatsu Cc: Message-Id: <20220407073323.743224-4-guoren@kernel.org> Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/xtensa/kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/kernel/jump_label.c b/arch/xtensa/kernel/jump_label.c index 0dde21e0d3de..ad1841cecdfb 100644 --- a/arch/xtensa/kernel/jump_label.c +++ b/arch/xtensa/kernel/jump_label.c @@ -40,7 +40,7 @@ static int patch_text_stop_machine(void *data) { struct patch *patch = data; - if (atomic_inc_return(&patch->cpu_count) == 1) { + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { local_patch_text(patch->addr, patch->data, patch->sz); atomic_inc(&patch->cpu_count); } else { -- Gitee From 90fa436f8a1afa90f01292c96ed7813fe10371d5 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 26 Jul 2022 17:39:10 +0800 Subject: [PATCH 073/132] xtensa: fix a7 clobbering in coprocessor context load/store stable inclusion from stable-v5.10.113 commit 19f6dcb1f0f0f8523976c8aa1800856c9b4f35c3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19f6dcb1f0f0f8523976c8aa1800856c9b4f35c3 -------------------------------- commit 839769c35477d4acc2369e45000ca7b0b6af39a7 upstream. Fast coprocessor exception handler saves a3..a6, but coprocessor context load/store code uses a4..a7 as temporaries, potentially clobbering a7. 'Potentially' because coprocessor state load/store macros may not use all four temporary registers (and neither FPU nor HiFi macros do). Use a3..a6 as intended. Cc: stable@vger.kernel.org Fixes: c658eac628aa ("[XTENSA] Add support for configurable registers and coprocessors") Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/xtensa/kernel/coprocessor.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index 45cc0ae0af6f..c7b9f12896f2 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -29,7 +29,7 @@ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lsave_cp_regs_cp##x: \ - xchal_cp##x##_store a2 a4 a5 a6 a7; \ + xchal_cp##x##_store a2 a3 a4 a5 a6; \ jx a0; \ .endif @@ -46,7 +46,7 @@ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lload_cp_regs_cp##x: \ - xchal_cp##x##_load a2 a4 a5 a6 a7; \ + xchal_cp##x##_load a2 a3 a4 a5 a6; \ jx a0; \ .endif -- Gitee From 490085c27ad981285d7f65b677a78bd578084e98 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Tue, 26 Jul 2022 17:39:11 +0800 Subject: [PATCH 074/132] openvswitch: fix OOB access in reserve_sfa_size() stable inclusion from stable-v5.10.113 commit 0837ff17d052b7d755d5086208c3445867aaff82 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0837ff17d052b7d755d5086208c3445867aaff82 -------------------------------- commit cefa91b2332d7009bc0be5d951d6cbbf349f90f8 upstream. 
Given a sufficiently large number of actions, while copying and reserving memory for a new action of a new flow, if next_offset is greater than MAX_ACTIONS_BUFSIZE, the function reserve_sfa_size() does not return -EMSGSIZE as expected, but it allocates MAX_ACTIONS_BUFSIZE bytes increasing actions_len by req_size. This can then lead to an OOB write access, especially when further actions need to be copied. Fix it by rearranging the flow action size check. KASAN splat below: Acked-by: Xie XiuQi ================================================================== BUG: KASAN: slab-out-of-bounds in reserve_sfa_size+0x1ba/0x380 [openvswitch] Write of size 65360 at addr ffff888147e4001c by task handler15/836 CPU: 1 PID: 836 Comm: handler15 Not tainted 5.18.0-rc1+ #27 ... Call Trace: dump_stack_lvl+0x45/0x5a print_report.cold+0x5e/0x5db ? __lock_text_start+0x8/0x8 ? reserve_sfa_size+0x1ba/0x380 [openvswitch] kasan_report+0xb5/0x130 ? reserve_sfa_size+0x1ba/0x380 [openvswitch] kasan_check_range+0xf5/0x1d0 memcpy+0x39/0x60 reserve_sfa_size+0x1ba/0x380 [openvswitch] __add_action+0x24/0x120 [openvswitch] ovs_nla_add_action+0xe/0x20 [openvswitch] ovs_ct_copy_action+0x29d/0x1130 [openvswitch] ? __kernel_text_address+0xe/0x30 ? unwind_get_return_address+0x56/0xa0 ? create_prof_cpu_mask+0x20/0x20 ? ovs_ct_verify+0xf0/0xf0 [openvswitch] ? prep_compound_page+0x198/0x2a0 ? __kasan_check_byte+0x10/0x40 ? kasan_unpoison+0x40/0x70 ? ksize+0x44/0x60 ? reserve_sfa_size+0x75/0x380 [openvswitch] __ovs_nla_copy_actions+0xc26/0x2070 [openvswitch] ? __zone_watermark_ok+0x420/0x420 ? validate_set.constprop.0+0xc90/0xc90 [openvswitch] ? __alloc_pages+0x1a9/0x3e0 ? __alloc_pages_slowpath.constprop.0+0x1da0/0x1da0 ? unwind_next_frame+0x991/0x1e40 ? __mod_node_page_state+0x99/0x120 ? __mod_lruvec_page_state+0x2e3/0x470 ? __kasan_kmalloc_large+0x90/0xe0 ovs_nla_copy_actions+0x1b4/0x2c0 [openvswitch] ovs_flow_cmd_new+0x3cd/0xb10 [openvswitch] ... Cc: stable@vger.kernel.org Fixes: f28cd2af22a0 ("openvswitch: fix flow actions reallocation") Signed-off-by: Paolo Valerio Acked-by: Eelco Chaudron Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 98a7e6f64ab0..293a798e89f4 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2436,7 +2436,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) { OVS_NLERR(log, "Flow action size exceeds max %u", MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); -- Gitee From 140cdaa1ebf8c1c01a60b1f56522154c58dc71ce Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 26 Jul 2022 17:39:12 +0800 Subject: [PATCH 075/132] gpio: Request interrupts after IRQ is initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.113 commit 54e6180c8c2d71b6f1fab0d1420f65299b26953e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=54e6180c8c2d71b6f1fab0d1420f65299b26953e -------------------------------- commit 06fb4ecfeac7e00d6704fa5ed19299f2fefb3cc9 upstream. Commit 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") attempted to fix a race condition that lead to a NULL pointer, but in the process caused a regression for _AEI/_EVT declared GPIOs. This manifests in messages showing deferred probing while trying to allocate IRQs like so: amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x0000 to IRQ, err -517 amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x002C to IRQ, err -517 amd_gpio AMDI0030:00: Failed to translate GPIO pin 0x003D to IRQ, err -517 [ .. more of the same .. ] The code for walking _AEI doesn't handle deferred probing and so this leads to non-functional GPIO interrupts. Fix this issue by moving the call to `acpi_gpiochip_request_interrupts` to occur after gc->irc.initialized is set. 
Fixes: 5467801f1fcb ("gpio: Restrict usage of GPIO chip irq members before initialization") Link: https://lore.kernel.org/linux-gpio/BL1PR12MB51577A77F000A008AA694675E2EF9@BL1PR12MB5157.namprd12.prod.outlook.com/ Link: https://bugzilla.suse.com/show_bug.cgi?id=1198697 Link: https://bugzilla.kernel.org/show_bug.cgi?id=215850 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1979 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1976 Reported-by: Mario Limonciello Signed-off-by: Mario Limonciello Reviewed-by: Shreeya Patel Tested-By: Samuel Čavoj Tested-By: lukeluk498@gmail.com Link: Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Reviewed-and-tested-by: Takashi Iwai Cc: Shreeya Patel Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpio/gpiolib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index d18078748200..59d8affad343 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1612,8 +1612,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gpiochip_set_irq_hooks(gc); - acpi_gpiochip_request_interrupts(gc); - /* * Using barrier() here to prevent compiler from reordering * gc->irq.initialized before initialization of above @@ -1623,6 +1621,8 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gc->irq.initialized = true; + acpi_gpiochip_request_interrupts(gc); + return 0; } -- Gitee From 9d7cda5f4e3dc4ab3acf27cd75d78b156a0cc099 Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 26 Jul 2022 17:39:13 +0800 Subject: [PATCH 076/132] ASoC: soc-dapm: fix two incorrect uses of list iterator stable inclusion from stable-v5.10.113 commit 43a2a3734aa3d25ef11b90f14f18bf31fb0a7feb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=43a2a3734aa3d25ef11b90f14f18bf31fb0a7feb -------------------------------- commit f730a46b931d894816af34a0ff8e4ad51565b39f upstream. These two bug are here: list_for_each_entry_safe_continue(w, n, list, power_list); list_for_each_entry_safe_continue(w, n, list, power_list); After the list_for_each_entry_safe_continue() exits, the list iterator will always be a bogus pointer which point to an invalid struct objdect containing HEAD member. The funciton poniter 'w->event' will be a invalid value which can lead to a control-flow hijack if the 'w' can be controlled. The original intention was to continue the outer list_for_each_entry_safe() loop with the same entry if w->event is NULL, but misunderstanding the meaning of list_for_each_entry_safe_continue(). So just add a 'continue;' to fix the bug. 
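To make the hazard concrete, a generic sketch (not the DAPM code itself; struct widget and run_widgets() are invented for illustration, the list helpers are the standard <linux/list.h> ones):

struct widget {
	struct list_head node;
	int (*event)(struct widget *w);
};

static void run_widgets(struct list_head *head)
{
	struct widget *w, *n;

	list_for_each_entry_safe(w, n, head, node) {
		if (!w->event)
			continue;	/* the fix: skip just this element */
		w->event(w);
	}
	/*
	 * Once any list_for_each_entry*() loop runs to completion, the
	 * cursor is container_of(head, struct widget, node): it points
	 * at the memory around the list head, not at a real element, so
	 * reading w->event here (which is what restarting the iteration
	 * via list_for_each_entry_safe_continue() amounted to) would
	 * dereference a bogus object.
	 */
}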
Cc: stable@vger.kernel.org Fixes: 163cac061c973 ("ASoC: Factor out DAPM sequence execution") Signed-off-by: Xiaomeng Tong Link: https://lore.kernel.org/r/20220329012134.9375-1-xiam0nd.tong@gmail.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- sound/soc/soc-dapm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 2924d89bf0da..417732bdf286 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1683,8 +1683,7 @@ static void dapm_seq_run(struct snd_soc_card *card, switch (w->id) { case snd_soc_dapm_pre: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, @@ -1696,8 +1695,7 @@ static void dapm_seq_run(struct snd_soc_card *card, case snd_soc_dapm_post: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, -- Gitee From 2e5f28ee0f8c9876e8af1a9347aee57594f8478d Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 26 Jul 2022 17:39:14 +0800 Subject: [PATCH 077/132] e1000e: Fix possible overflow in LTR decoding stable inclusion from stable-v5.10.113 commit 7082650eb8265b583d0aea0515bcc6f65b0e8755 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7082650eb8265b583d0aea0515bcc6f65b0e8755 -------------------------------- commit 04ebaa1cfddae5f240cc7404f009133bb0389a47 upstream. When we decode the latency and the max_latency, u16 value may not fit the required size and could lead to the wrong LTR representation. Scaling is represented as: scale 0 - 1 (2^(5*0)) = 2^0 scale 1 - 32 (2^(5 *1))= 2^5 scale 2 - 1024 (2^(5 *2)) =2^10 scale 3 - 32768 (2^(5 *3)) =2^15 scale 4 - 1048576 (2^(5 *4)) = 2^20 scale 5 - 33554432 (2^(5 *4)) = 2^25 scale 4 and scale 5 required 20 and 25 bits respectively. scale 6 reserved. Replace the u16 type with the u32 type and allow corrected LTR representation. 
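As a standalone illustration of the overflow (not the driver code; the 10-bit value / 3-bit scale split follows the generic PCIe LTR encoding, the driver's own mask and shift names differ):

static u32 decode_ltr(u16 enc)
{
	u32 value = enc & 0x3ff;		/* 10-bit latency value */
	u32 scale = (enc >> 10) & 0x7;		/* 3-bit scale selector */

	/*
	 * Decoded latency = value * 2^(5 * scale).  Scale 4 alone gives
	 * 1 * 2^20 = 1048576 and scale 5 gives 2^25, both far above the
	 * 65535 a u16 can hold, so the decoded variables must be u32.
	 */
	return value << (5 * scale);
}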
Cc: stable@vger.kernel.org Fixes: 44a13a5d99c7 ("e1000e: Fix the max snoop/no-snoop latency for 10M") Reported-by: James Hutchinson Link: https://bugzilla.kernel.org/show_bug.cgi?id=215689 Suggested-by: Dima Ruinskiy Signed-off-by: Sasha Neftin Tested-by: Naama Meir Tested-by: James Hutchinson Signed-off-by: Tony Nguyen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 15b1503d5b6c..1f51252b465a 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1006,8 +1006,8 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) { u32 reg = link << (E1000_LTRV_REQ_SHIFT + E1000_LTRV_NOSNOOP_SHIFT) | link << E1000_LTRV_REQ_SHIFT | E1000_LTRV_SEND; - u16 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ - u16 lat_enc_d = 0; /* latency decoded */ + u32 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ + u32 lat_enc_d = 0; /* latency decoded */ u16 lat_enc = 0; /* latency encoded */ if (link) { -- Gitee From d51577c959b12c0ff173292fd7ebc9090ed20a29 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Tue, 26 Jul 2022 17:39:15 +0800 Subject: [PATCH 078/132] ARC: entry: fix syscall_trace_exit argument stable inclusion from stable-v5.10.113 commit 5580b974a84b30f6da90a3a562ea0dbfb0038110 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5580b974a84b30f6da90a3a562ea0dbfb0038110 -------------------------------- commit b1c6ecfdd06907554518ec384ce8e99889d15193 upstream. Function syscall_trace_exit expects pointer to pt_regs. However r0 is also used to keep syscall return value. Restore pointer to pt_regs before calling syscall_trace_exit. Cc: Signed-off-by: Sergey Matyukevich Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/arc/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index ae656bfc31c3..301ade4d0b94 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -199,6 +199,7 @@ tracesys_exit: st r0, [sp, PT_r0] ; sys call return value in pt_regs ;POST Sys Call Ptrace Hook + mov r0, sp ; pt_regs needed bl @syscall_trace_exit b ret_from_exception ; NOT ret_from_system_call at is saves r0 which ; we'd done before calling post hook above -- Gitee From 1e6c493076318775eed92b3e3fe887ab4dfde197 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 26 Jul 2022 17:39:16 +0800 Subject: [PATCH 079/132] arm_pmu: Validate single/group leader events stable inclusion from stable-v5.10.113 commit c55327bc3712ca96dc05bc4c6aab805b6fa8bb74 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c55327bc3712ca96dc05bc4c6aab805b6fa8bb74 -------------------------------- commit e5c23779f93d45e39a52758ca593bd7e62e9b4be upstream. In the case where there is only a cycle counter available (i.e. PMCR_EL0.N is 0) and an event other than CPU cycles is opened, the open should fail as the event can never possibly be scheduled. However, the event validation when an event is opened is skipped when the group leader is opened. 
Fix this by always validating the group leader events. Reported-by: Al Grant Cc: Will Deacon Cc: Mark Rutland Signed-off-by: Rob Herring Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20220408203330.4014015-1-robh@kernel.org Cc: Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/perf/arm_pmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index fe075d9f95e2..c87faafbdba2 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -398,6 +398,9 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; + if (event == leader) + return 0; + for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; @@ -487,12 +490,7 @@ __hw_perf_event_init(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } - if (event->group_leader != event) { - if (validate_group(event) != 0) - return -EINVAL; - } - - return 0; + return validate_group(event); } static int armpmu_event_init(struct perf_event *event) -- Gitee From 8109e1614b619b8cb7791a76ca932a6e9da496d8 Mon Sep 17 00:00:00 2001 From: kuyo chang Date: Tue, 26 Jul 2022 17:39:17 +0800 Subject: [PATCH 080/132] sched/pelt: Fix attach_entity_load_avg() corner case stable inclusion from stable-v5.10.113 commit 88fcfd6ee6c5a617e712b346e9c15fc3057e532e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=88fcfd6ee6c5a617e712b346e9c15fc3057e532e -------------------------------- [ Upstream commit 40f5aa4c5eaebfeaca4566217cb9c468e28ed682 ] The warning in cfs_rq_is_decayed() triggered: SCHED_WARN_ON(cfs_rq->avg.load_avg || cfs_rq->avg.util_avg || cfs_rq->avg.runnable_avg) There exists a corner case in attach_entity_load_avg() which will cause load_sum to be zero while load_avg will not be. Consider se_weight is 88761 as per the sched_prio_to_weight[] table. Further assume the get_pelt_divider() is 47742, this gives: se->avg.load_avg is 1. However, calculating load_sum: se->avg.load_sum = div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); se->avg.load_sum = 1*47742/88761 = 0. Then enqueue_load_avg() adds this to the cfs_rq totals: cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; Resulting in load_avg being 1 with load_sum is 0, which will trigger the WARN. 
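The corner case reduces to plain integer arithmetic; a standalone rework of the numbers quoted above (illustrative only, not the scheduler code):

static void pelt_attach_example(void)
{
	u64 load_avg = 1, divider = 47742, weight = 88761;

	/* old code: scale the average back into a sum, which rounds to 0 */
	u64 old_sum = div_u64(load_avg * divider, weight);	/* 47742 / 88761 = 0 */

	/* fixed code: derive the sum first, then clamp it to at least 1 */
	u64 new_sum = load_avg * divider;			/* 47742 */
	new_sum = (weight < new_sum) ? div_u64(new_sum, weight) : 1;

	/*
	 * old_sum == 0 while load_avg == 1 is exactly the inconsistency
	 * that trips SCHED_WARN_ON() in cfs_rq_is_decayed(); the fixed
	 * computation leaves new_sum == 1, keeping the pair consistent.
	 */
}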
Fixes: f207934fb79d ("sched/fair: Align PELT windows between cfs_rq and its se") Signed-off-by: kuyo chang [peterz: massage changelog] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20220414090229.342-1-kuyo.chang@mediatek.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50d457979db6..09f002f1fe5d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3771,11 +3771,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; -- Gitee From 47edbfbf9e4c8b189615ea817ffc52e0dc011aed Mon Sep 17 00:00:00 2001 From: Zhipeng Xie Date: Tue, 26 Jul 2022 17:39:18 +0800 Subject: [PATCH 081/132] perf/core: Fix perf_mmap fail when CONFIG_PERF_USE_VMALLOC enabled stable inclusion from stable-v5.10.113 commit 51d9cbbb0f5a175e5b4c4a25d2ec995363304860 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=51d9cbbb0f5a175e5b4c4a25d2ec995363304860 -------------------------------- [ Upstream commit 60490e7966659b26d74bf1fa4aa8693d9a94ca88 ] This problem can be reproduced with CONFIG_PERF_USE_VMALLOC enabled on both x86_64 and aarch64 arch when using sysdig -B(using ebpf)[1]. sysdig -B works fine after rebuilding the kernel with CONFIG_PERF_USE_VMALLOC disabled. I tracked it down to the if condition event->rb->nr_pages != nr_pages in perf_mmap is true when CONFIG_PERF_USE_VMALLOC is enabled where event->rb->nr_pages = 1 and nr_pages = 2048 resulting perf_mmap to return -EINVAL. This is because when CONFIG_PERF_USE_VMALLOC is enabled, rb->nr_pages is always equal to 1. Arch with CONFIG_PERF_USE_VMALLOC enabled by default: arc/arm/csky/mips/sh/sparc/xtensa Arch with CONFIG_PERF_USE_VMALLOC disabled by default: x86_64/aarch64/... 
Fix this problem by using data_page_nr() [1] https://github.com/draios/sysdig Fixes: 906010b2134e ("perf_event: Provide vmalloc() based mmap() backing") Signed-off-by: Zhipeng Xie Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220209145417.6495-1-xiezhipeng1@huawei.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- kernel/events/core.c | 2 +- kernel/events/internal.h | 5 +++++ kernel/events/ring_buffer.c | 5 ----- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4bd9dd6c3b72..68dc8a8e7990 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6174,7 +6174,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 228801e20788..aa23ffdaf819 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ef91ae75ca56..4032cd475000 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -856,11 +856,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { -- Gitee From 5f70f8f020405ec27343ea6c8c7f580922443b4a Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Tue, 26 Jul 2022 17:39:19 +0800 Subject: [PATCH 082/132] drm/panel/raspberrypi-touchscreen: Avoid NULL deref if not initialised stable inclusion from stable-v5.10.113 commit 231381f5211620cec836b921f1c7a2cf702b3e8a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=231381f5211620cec836b921f1c7a2cf702b3e8a -------------------------------- [ Upstream commit f92055ae0acb035891e988ce345d6b81a0316423 ] If a call to rpi_touchscreen_i2c_write from rpi_touchscreen_probe fails before mipi_dsi_device_register_full is called, then in trying to log the error message if uses ts->dsi->dev when it is still NULL. Use ts->i2c->dev instead, which is initialised earlier in probe. 
Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.") Signed-off-by: Dave Stevenson Signed-off-by: Stefan Wahren Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-2-stefan.wahren@i2se.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index bbdd086be7f5..90487df62480 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -229,7 +229,7 @@ static void rpi_touchscreen_i2c_write(struct rpi_touchscreen *ts, ret = i2c_smbus_write_byte_data(ts->i2c, reg, val); if (ret) - dev_err(&ts->dsi->dev, "I2C write failed: %d\n", ret); + dev_err(&ts->i2c->dev, "I2C write failed: %d\n", ret); } static int rpi_touchscreen_write(struct rpi_touchscreen *ts, u16 reg, u32 val) -- Gitee From 27f402d6441a30ef1541f3bca32429aceb1ce8de Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Tue, 26 Jul 2022 17:39:20 +0800 Subject: [PATCH 083/132] drm/panel/raspberrypi-touchscreen: Initialise the bridge in prepare stable inclusion from stable-v5.10.113 commit 405d98427416849cf37c84c0c70bd5008b686a1e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=405d98427416849cf37c84c0c70bd5008b686a1e -------------------------------- [ Upstream commit 5f18c0782b99e26121efa93d20b76c19e17aa1dd ] The panel has a prepare call which is before video starts, and an enable call which is after. The Toshiba bridge should be configured before video, so move the relevant power and initialisation calls to prepare. Fixes: 2f733d6194bd ("drm/panel: Add support for the Raspberry Pi 7" Touchscreen.") Signed-off-by: Dave Stevenson Signed-off-by: Stefan Wahren Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220415162513.42190-3-stefan.wahren@i2se.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index 90487df62480..4b92c6341490 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -265,7 +265,7 @@ static int rpi_touchscreen_noop(struct drm_panel *panel) return 0; } -static int rpi_touchscreen_enable(struct drm_panel *panel) +static int rpi_touchscreen_prepare(struct drm_panel *panel) { struct rpi_touchscreen *ts = panel_to_ts(panel); int i; @@ -295,6 +295,13 @@ static int rpi_touchscreen_enable(struct drm_panel *panel) rpi_touchscreen_write(ts, DSI_STARTDSI, 0x01); msleep(100); + return 0; +} + +static int rpi_touchscreen_enable(struct drm_panel *panel) +{ + struct rpi_touchscreen *ts = panel_to_ts(panel); + /* Turn on the backlight. 
*/ rpi_touchscreen_i2c_write(ts, REG_PWM, 255); @@ -349,7 +356,7 @@ static int rpi_touchscreen_get_modes(struct drm_panel *panel, static const struct drm_panel_funcs rpi_touchscreen_funcs = { .disable = rpi_touchscreen_disable, .unprepare = rpi_touchscreen_noop, - .prepare = rpi_touchscreen_noop, + .prepare = rpi_touchscreen_prepare, .enable = rpi_touchscreen_enable, .get_modes = rpi_touchscreen_get_modes, }; -- Gitee From d4248c33b8a3403ca22cce4319412763975d9b9f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 26 Jul 2022 17:39:21 +0800 Subject: [PATCH 084/132] KVM: PPC: Fix TCE handling for VFIO stable inclusion from stable-v5.10.113 commit f8f8b3124b899867a18a3f63e538c791e21252ac category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f8f8b3124b899867a18a3f63e538c791e21252ac -------------------------------- [ Upstream commit 26a62b750a4e6364b0393562f66759b1494c3a01 ] The LoPAPR spec defines a guest visible IOMMU with a variable page size. Currently QEMU advertises 4K, 64K, 2M, 16MB pages, a Linux VM picks the biggest (16MB). In the case of a passed though PCI device, there is a hardware IOMMU which does not support all pages sizes from the above - P8 cannot do 2MB and P9 cannot do 16MB. So for each emulated 16M IOMMU page we may create several smaller mappings ("TCEs") in the hardware IOMMU. The code wrongly uses the emulated TCE index instead of hardware TCE index in error handling. The problem is easier to see on POWER8 with multi-level TCE tables (when only the first level is preallocated) as hash mode uses real mode TCE hypercalls handlers. The kernel starts using indirect tables when VMs get bigger than 128GB (depends on the max page order). The very first real mode hcall is going to fail with H_TOO_HARD as in the real mode we cannot allocate memory for TCEs (we can in the virtual mode) but on the way out the code attempts to clear hardware TCEs using emulated TCE indexes which corrupts random kernel memory because it_offset==1<<59 is subtracted from those indexes and the resulting index is out of the TCE table bounds. This fixes kvmppc_clear_tce() to use the correct TCE indexes. While at it, this fixes TCE cache invalidation which uses emulated TCE indexes instead of the hardware ones. This went unnoticed as 64bit DMA is used these days and VMs map all RAM in one go and only then do DMA and this is when the TCE cache gets populated. Potentially this could slow down mapping, however normally 16MB emulated pages are backed by 64K hardware pages so it is one write to the "TCE Kill" per 256 updates which is not that bad considering the size of the cache (1024 TCEs or so). 
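The index mismatch is easiest to see with concrete numbers; a small illustrative helper (guest_to_hw_tce() is invented for this example, only the shift arithmetic mirrors the code below):

/*
 * A 16MB emulated IOMMU page (shift 24) backed by 64KB hardware pages
 * (shift 16) means each guest-visible TCE covers 1 << (24 - 16) = 256
 * hardware TCEs.
 */
static unsigned long guest_to_hw_tce(unsigned long entry,
				     unsigned int guest_shift,
				     unsigned int hw_shift,
				     unsigned long *subpages)
{
	*subpages = 1UL << (guest_shift - hw_shift);	/* 256             */
	return entry << (guest_shift - hw_shift);	/* entry 5 -> 1280  */
}

/*
 * On failure, hardware TCEs 1280..1535 must be cleared and their cache
 * entries killed; using the emulated index 5 instead (further offset by
 * it_offset, e.g. 1 << 59) lands outside the hardware table, which is
 * the corruption this patch fixes.
 */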
Fixes: ca1fc489cfa0 ("KVM: PPC: Book3S: Allow backing bigger guest IOMMU pages with smaller physical pages") Signed-off-by: Alexey Kardashevskiy Tested-by: David Gibson Reviewed-by: Frederic Barrat Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220420050840.328223-1-aik@ozlabs.ru Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/powerpc/kvm/book3s_64_vio.c | 45 +++++++++++++++-------------- arch/powerpc/kvm/book3s_64_vio_hv.c | 44 ++++++++++++++-------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e..c640053ab03f 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -421,13 +421,19 @@ static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, tbl[idx % TCES_PER_PAGE] = tce; } -static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); + + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, @@ -486,6 +492,8 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -545,6 +553,8 @@ static long kvmppc_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -591,10 +601,9 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill(stit->tbl, entry, 1); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry); goto unlock_exit; } } @@ -670,13 +679,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ if (get_user(tce, tces + i)) { ret = H_TOO_HARD; - goto invalidate_exit; + goto unlock_exit; } tce = be64_to_cpu(tce); if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -685,19 +694,15 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, entry, npages); - unlock_exit: srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -736,20 +741,16 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE(1); - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> 
stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index e5ba96c41f3f..57af53a6a2d8 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -247,13 +247,19 @@ static void iommu_tce_kill_rm(struct iommu_table *tbl, tbl->it_ops->tce_kill(tbl, entry, pages, true); } -static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); + + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -316,6 +322,8 @@ static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -379,6 +387,8 @@ static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -424,10 +434,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill_rm(stit->tbl, entry, 1); - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry); return ret; } } @@ -569,7 +577,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ua = 0; if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -578,19 +586,15 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_rm_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, entry, npages); - unlock_exit: if (!prereg) arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); @@ -632,20 +636,16 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } -- Gitee From c0d651a41c25108ae1ada52d0b1f1a3e3020d53f Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 26 Jul 2022 17:39:22 +0800 Subject: [PATCH 085/132] drm/vc4: Use pm_runtime_resume_and_get 
to fix pm_runtime_get_sync() usage stable inclusion from stable-v5.10.113 commit 0a2cef65b32919af7df4df979c5eede5f7825f17 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0a2cef65b32919af7df4df979c5eede5f7825f17 -------------------------------- [ Upstream commit 3d0b93d92a2790337aa9d18cb332d02356a24126 ] If the device is already in a runtime PM enabled state pm_runtime_get_sync() will return 1. Also, we need to call pm_runtime_put_noidle() when pm_runtime_get_sync() fails, so use pm_runtime_resume_and_get() instead. this function will handle this. Fixes: 4078f5757144 ("drm/vc4: Add DSI driver") Signed-off-by: Miaoqian Lin Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220420135008.2757-1-linmq006@gmail.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/gpu/drm/vc4/vc4_dsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_dsi.c b/drivers/gpu/drm/vc4/vc4_dsi.c index eaf276978ee7..ad84b56f4091 100644 --- a/drivers/gpu/drm/vc4/vc4_dsi.c +++ b/drivers/gpu/drm/vc4/vc4_dsi.c @@ -835,7 +835,7 @@ static void vc4_dsi_encoder_enable(struct drm_encoder *encoder) unsigned long phy_clock; int ret; - ret = pm_runtime_get_sync(dev); + ret = pm_runtime_resume_and_get(dev); if (ret) { DRM_ERROR("Failed to runtime PM enable on DSI%d\n", dsi->port); return; -- Gitee From a0820bde54b9a6f88f03400828dd5f0fdfa2051f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Tue, 26 Jul 2022 17:39:23 +0800 Subject: [PATCH 086/132] powerpc/perf: Fix power9 event alternatives stable inclusion from stable-v5.10.113 commit e012f9d1af54ca3c24ca0e9ec03a1a212972771c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e012f9d1af54ca3c24ca0e9ec03a1a212972771c -------------------------------- [ Upstream commit 0dcad700bb2776e3886fe0a645a4bf13b1e747cd ] When scheduling a group of events, there are constraint checks done to make sure all events can go in a group. Example, one of the criteria is that events in a group cannot use the same PMC. But platform specific PMU supports alternative event for some of the event codes. During perf_event_open(), if any event group doesn't match constraint check criteria, further lookup is done to find alternative event. By current design, the array of alternatives events in PMU code is expected to be sorted by column 0. This is because in find_alternative() the return criteria is based on event code comparison. ie. "event < ev_alt[i][0])". This optimisation is there since find_alternative() can be called multiple times. In power9 PMU code, the alternative event array is not sorted properly and hence there is breakage in finding alternative events. To work with existing logic, fix the alternative event array to be sorted by column 0 for power9-pmu.c Results: With alternative events, multiplexing can be avoided. That is, for example, in power9 PM_LD_MISS_L1 (0x3e054) has alternative event, PM_LD_MISS_L1_ALT (0x400f0). This is an identical event which can be programmed in a different PMC. Before: # perf stat -e r3e054,r300fc Performance counter stats for 'system wide': 1057860 r3e054 (50.21%) 379 r300fc (49.79%) 0.944329741 seconds time elapsed Since both the events are using PMC3 in this case, they are multiplexed here. 
After: # perf stat -e r3e054,r300fc Performance counter stats for 'system wide': 1006948 r3e054 182 r300fc Fixes: 91e0bd1e6251 ("powerpc/perf: Add PM_LD_MISS_L1 and PM_BR_2PATH to power9 event list") Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220419114828.89843-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- arch/powerpc/perf/power9-pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 2a57e93a79dc..7245355bee28 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -133,11 +133,11 @@ int p9_dd22_bl_ev[] = { /* Table of alternatives, sorted by column 0 */ static const unsigned int power9_event_alternatives[][MAX_ALT] = { - { PM_INST_DISP, PM_INST_DISP_ALT }, - { PM_RUN_CYC_ALT, PM_RUN_CYC }, - { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, - { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, { PM_BR_2PATH, PM_BR_2PATH_ALT }, + { PM_INST_DISP, PM_INST_DISP_ALT }, + { PM_RUN_CYC_ALT, PM_RUN_CYC }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[]) -- Gitee From ce66add4b99dde3edf99491d41ea364b6005683c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 26 Jul 2022 17:39:24 +0800 Subject: [PATCH 087/132] perf report: Set PERF_SAMPLE_DATA_SRC bit for Arm SPE event stable inclusion from stable-v5.10.113 commit 19590bbc691d81f03d2a24a3ec30c399ebe071e0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19590bbc691d81f03d2a24a3ec30c399ebe071e0 -------------------------------- [ Upstream commit ccb17caecfbd542f49a2a79ae088136ba8bfb794 ] Since commit bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") "perf mem report" and "perf report --mem-mode" don't report result if the PERF_SAMPLE_DATA_SRC bit is missed in sample type. The commit ffab487052054162 ("perf: arm-spe: Fix perf report --mem-mode") partially fixes the issue. It adds PERF_SAMPLE_DATA_SRC bit for Arm SPE event, this allows the perf data file generated by kernel v5.18-rc1 or later version can be reported properly. On the other hand, perf tool still fails to be backward compatibility for a data file recorded by an older version's perf which contains Arm SPE trace data. This patch is a workaround in reporting phase, when detects ARM SPE PMU event and without PERF_SAMPLE_DATA_SRC bit, it will force to set the bit in the sample type and give a warning info. 
Fixes: bb30acae4c4dacfa ("perf report: Bail out --mem-mode if mem info is not available") Reviewed-by: James Clark Signed-off-by: Leo Yan Tested-by: German Gomez Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20220414123201.842754-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- tools/perf/builtin-report.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 91cab5cdfbc1..b55ee073c2f7 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -340,6 +340,7 @@ static int report__setup_sample_type(struct report *rep) struct perf_session *session = rep->session; u64 sample_type = evlist__combined_sample_type(session->evlist); bool is_pipe = perf_data__is_pipe(session->data); + struct evsel *evsel; if (session->itrace_synth_opts->callchain || session->itrace_synth_opts->add_callchain || @@ -394,6 +395,19 @@ static int report__setup_sample_type(struct report *rep) } if (sort__mode == SORT_MODE__MEMORY) { + /* + * FIXUP: prior to kernel 5.18, Arm SPE missed to set + * PERF_SAMPLE_DATA_SRC bit in sample type. For backward + * compatibility, set the bit if it's an old perf data file. + */ + evlist__for_each_entry(session->evlist, evsel) { + if (strstr(evsel->name, "arm_spe") && + !(sample_type & PERF_SAMPLE_DATA_SRC)) { + evsel->core.attr.sample_type |= PERF_SAMPLE_DATA_SRC; + sample_type |= PERF_SAMPLE_DATA_SRC; + } + } + if (!is_pipe && !(sample_type & PERF_SAMPLE_DATA_SRC)) { ui__error("Selected --mem-mode but no mem data. " "Did you call perf record without -d?\n"); -- Gitee From 97cada6b51f5737aba63d7d3f69cf413a3c30a1b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 26 Jul 2022 17:39:25 +0800 Subject: [PATCH 088/132] ext4: fix fallocate to use file_modified to update permissions consistently stable inclusion from stable-v5.10.113 commit f6038d43b25bba1cd50d2a77e207f6550aee9954 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f6038d43b25bba1cd50d2a77e207f6550aee9954 -------------------------------- commit ad5cd4f4ee4d5fcdb1bfb7a0c073072961e70783 upstream. Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero, collapse, insert range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Signed-off-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20220308185043.GA117678@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 32 +++++++++++++++++++++++++------- fs/ext4/inode.c | 7 ++++++- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 277f89d5de03..df43cda8fc2f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2908,7 +2908,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); -extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4323186bae78..9d06695c04ab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4510,9 +4510,9 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, return ret > 0 ? ret2 : ret; } -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) @@ -4583,6 +4583,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4707,17 +4711,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto exit; if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(inode, offset, len); + ret = ext4_punch_hole(file, offset, len); goto exit; } if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(inode, offset, len); + ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(inode, offset, len); + ret = ext4_insert_range(file, offset, len); goto exit; } @@ -4753,6 +4757,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out; + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; @@ -5255,8 +5263,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. 
*/ -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; handle_t *handle; @@ -5307,6 +5316,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. @@ -5401,8 +5414,9 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * by len bytes. * Returns 0 on success, error otherwise. */ -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; handle_t *handle; struct ext4_ext_path *path; @@ -5458,6 +5472,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e85c238edd85..69fcf82f2c81 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3972,8 +3972,9 @@ int ext4_break_layouts(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; @@ -4026,6 +4027,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. -- Gitee From aaf9e2faf5bb4ea97264d9f45f6c6e35f73f17e8 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 26 Jul 2022 17:39:26 +0800 Subject: [PATCH 089/132] ext4: limit length to bitmap_maxbytes - blocksize in punch_hole stable inclusion from stable-v5.10.113 commit 22c450d39f8922ae26de459cf4f83b2b294f207e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=22c450d39f8922ae26de459cf4f83b2b294f207e -------------------------------- commit 2da376228a2427501feb9d15815a45dbdbdd753e upstream. Syzbot found an issue [1] in ext4_fallocate(). The C reproducer [2] calls fallocate(), passing size 0xffeffeff000ul, and offset 0x1000000ul, which, when added together exceed the bitmap_maxbytes for the inode. This triggers a BUG in ext4_ind_remove_space(). According to the comments in this function the 'end' parameter needs to be one block after the last block to be removed. In the case when the BUG is triggered it points to the last block. Modify the ext4_punch_hole() function and add constraint that caps the length to satisfy the one before laster block requirement. 
LINK: [1] https://syzkaller.appspot.com/bug?id=b80bd9cf348aac724a4f4dff251800106d721331 LINK: [2] https://syzkaller.appspot.com/text?tag=ReproC&x=14ba0238700000 Fixes: a4bb6b64e39a ("ext4: enable "punch hole" functionality") Reported-by: syzbot+7a806094edd5d07ba029@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Link: https://lore.kernel.org/r/20220331200515.153214-1-tadeusz.struk@linaro.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/inode.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 69fcf82f2c81..f4e0c7cc4820 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3978,7 +3978,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset; + loff_t first_block_offset, last_block_offset, max_length; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; @@ -4012,6 +4013,14 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } + /* + * For punch hole the length + offset needs to be within one block + * before last range. Adjust the length if it goes beyond that limit. + */ + max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; + if (offset + length > max_length) + length = max_length - offset; + if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* -- Gitee From cf4de22f4fd1ee1f6367a12848686f71d1293010 Mon Sep 17 00:00:00 2001 From: "wangjianjian (C)" Date: Tue, 26 Jul 2022 17:39:27 +0800 Subject: [PATCH 090/132] ext4, doc: fix incorrect h_reserved size stable inclusion from stable-v5.10.113 commit 0c54b093766becb9c83317232c93290cf612b8ff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0c54b093766becb9c83317232c93290cf612b8ff -------------------------------- commit 7102ffe4c166ca0f5e35137e9f9de83768c2d27d upstream. According to document and code, ext4_xattr_header's size is 32 bytes, so h_reserved size should be 3. Signed-off-by: Wang Jianjian Link: https://lore.kernel.org/r/92fcc3a6-7d77-8c09-4126-377fcb4c46a5@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- Documentation/filesystems/ext4/attributes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/ext4/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 54386a010a8d..871d2da7a0a9 100644 --- a/Documentation/filesystems/ext4/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst @@ -76,7 +76,7 @@ The beginning of an extended attribute block is in - Checksum of the extended attribute block. * - 0x14 - \_\_u32 - - h\_reserved[2] + - h\_reserved[3] - Zero. 
The checksum is calculated against the FS UUID, the 64-bit block number -- Gitee From 27d40cc66fb275b8c38be0766f63cd14118d4423 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 26 Jul 2022 17:39:28 +0800 Subject: [PATCH 091/132] ext4: fix overhead calculation to account for the reserved gdt blocks stable inclusion from stable-v5.10.113 commit 4789149b9ea2a1893c62d816742f1a76514fc901 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4789149b9ea2a1893c62d816742f1a76514fc901 -------------------------------- commit 10b01ee92df52c8d7200afead4d5e5f55a5c58b1 upstream. The kernel calculation was underestimating the overhead by not taking into account the reserved gdt blocks. With this change, the overhead calculated by the kernel matches the overhead calculation in mke2fs. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 26c938360895..34e4e00445da 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3912,9 +3912,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + -- Gitee From fd9c2ff88c62fbb32351bc8165f473a54e91817c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 26 Jul 2022 17:39:29 +0800 Subject: [PATCH 092/132] ext4: force overhead calculation if the s_overhead_cluster makes no sense stable inclusion from stable-v5.10.113 commit e1e96e37272156d691203a3725b876787f38c8f2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e1e96e37272156d691203a3725b876787f38c8f2 -------------------------------- commit 85d825dbf4899a69407338bae462a59aa9a37326 upstream. If the file system does not use bigalloc, calculating the overhead is cheap, so force the recalculation of the overhead so we don't have to trust the precalculated overhead in the superblock. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- fs/ext4/super.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 34e4e00445da..706a159817ee 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4977,9 +4977,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Get the # of file system overhead blocks from the * superblock if present. 
*/ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; -- Gitee From f120da4c1f854aadb526a8fd86e3caf0c9fdbe5e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 26 Jul 2022 17:39:30 +0800 Subject: [PATCH 093/132] can: isotp: stop timeout monitoring when no first frame was sent stable inclusion from stable-v5.10.113 commit 50aac44273600cb0ae1efd010bb1de7701444a41 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=50aac44273600cb0ae1efd010bb1de7701444a41 -------------------------------- commit d73497081710c876c3c61444445512989e102152 upstream. The first attempt to fix a the 'impossible' WARN_ON_ONCE(1) in isotp_tx_timer_handler() focussed on the identical CAN IDs created by the syzbot reproducer and lead to upstream fix/commit 3ea566422cbd ("can: isotp: sanitize CAN ID checks in isotp_bind()"). But this did not catch the root cause of the wrong tx.state in the tx_timer handler. In the isotp 'first frame' case a timeout monitoring needs to be started before the 'first frame' is send. But when this sending failed the timeout monitoring for this specific frame has to be disabled too. Otherwise the tx_timer is fired with the 'warn me' tx.state of ISOTP_IDLE. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Link: https://lore.kernel.org/all/20220405175112.2682-1-socketcan@hartkopp.net Reported-by: syzbot+2339c27f5c66c652843e@syzkaller.appspotmail.com Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- net/can/isotp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index 9a4a9c5a9f24..c515bbd46c67 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -864,6 +864,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 
1 : 0; + s64 hrtimer_sec = 0; int off; int err; @@ -962,7 +963,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) isotp_create_fframe(cf, so, ae); /* start timeout for FC */ - hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + hrtimer_sec = 1; + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), + HRTIMER_MODE_REL_SOFT); } /* send the first or only CAN frame */ @@ -975,6 +978,11 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %d\n", __func__, err); + + /* no transmission -> no timeout monitoring */ + if (hrtimer_sec) + hrtimer_cancel(&so->txtimer); + goto err_out_drop; } -- Gitee From 0c5ab04a2f48e4ea5814c339410bc679062b41ea Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 26 Jul 2022 17:39:31 +0800 Subject: [PATCH 094/132] spi: atmel-quadspi: Fix the buswidth adjustment between spi-mem and controller stable inclusion from stable-v5.10.113 commit dccee748af17fc087ff5017152e532ef8e18c8c0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=dccee748af17fc087ff5017152e532ef8e18c8c0 -------------------------------- commit 8c235cc25087495c4288d94f547e9d3061004991 upstream. Use the spi_mem_default_supports_op() core helper in order to take into account the buswidth specified by the user in device tree. Cc: Fixes: 0e6aae08e9ae ("spi: Add QuadSPI driver for Atmel SAMA5D2") Signed-off-by: Tudor Ambarus Link: https://lore.kernel.org/r/20220406133604.455356-1-tudor.ambarus@microchip.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/spi/atmel-quadspi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 1e63fd4821f9..8aa89d93db11 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -277,6 +277,9 @@ static int atmel_qspi_find_mode(const struct spi_mem_op *op) static bool atmel_qspi_supports_op(struct spi_mem *mem, const struct spi_mem_op *op) { + if (!spi_mem_default_supports_op(mem, op)) + return false; + if (atmel_qspi_find_mode(op) < 0) return false; -- Gitee From dd76b5fe66faa3615f8d284d1fd07f1cccabf125 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 26 Jul 2022 17:39:32 +0800 Subject: [PATCH 095/132] staging: ion: Prevent incorrect reference counting behavour stable inclusion from stable-v5.10.113 commit fea24b07edfc348c67a019b6e17b39c0698e631f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=fea24b07edfc348c67a019b6e17b39c0698e631f -------------------------------- Supply additional check in order to prevent unexpected results. 
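Concretely, the added check keeps buffer->kmap_cnt from wrapping past INT_MAX when an existing kernel mapping is reused. Reduced to the bare pattern, and with made-up names rather than the ion driver's own, it looks like this:

	#include <limits.h>
	#include <stddef.h>

	struct demo_buffer {
		int kmap_cnt;	/* users currently holding the kernel mapping */
		void *vaddr;
	};

	/* Hand out the existing mapping, refusing to overflow the counter. */
	static void *demo_kmap_get(struct demo_buffer *buffer)
	{
		if (buffer->kmap_cnt) {
			if (buffer->kmap_cnt == INT_MAX)
				return NULL;	/* the driver returns ERR_PTR(-EOVERFLOW) here */
			buffer->kmap_cnt++;
			return buffer->vaddr;
		}
		/* first user: the real code creates the mapping at this point */
		return NULL;
	}

Saturating at INT_MAX means a runaway caller gets an error instead of silently wrapping the count negative and confusing the later kunmap/put paths.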
Fixes: b892bf75b2034 ("ion: Switch ion to use dma-buf") Suggested-by: Dan Carpenter Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/staging/android/ion/ion.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index e1fe03ceb7f1..e6d4a3ee6cda 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -114,6 +114,9 @@ static void *ion_buffer_kmap_get(struct ion_buffer *buffer) void *vaddr; if (buffer->kmap_cnt) { + if (buffer->kmap_cnt == INT_MAX) + return ERR_PTR(-EOVERFLOW); + buffer->kmap_cnt++; return buffer->vaddr; } -- Gitee From 9a1fc0d8feaf050654872afaa9e66323f12e4a11 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 26 Jul 2022 17:39:33 +0800 Subject: [PATCH 096/132] block/compat_ioctl: fix range check in BLKGETSIZE stable inclusion from stable-v5.10.113 commit 8bedbc8f7f35f533000b347644b6bf1f62524676 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=8bedbc8f7f35f533000b347644b6bf1f62524676 -------------------------------- commit ccf16413e520164eb718cf8b22a30438da80ff23 upstream. kernel ulong and compat_ulong_t may not be same width. Use type directly to eliminate mismatches. This would result in truncation rather than EFBIG for 32bit mode for large disks. Reviewed-by: Bart Van Assche Signed-off-by: Khazhismel Kumykov Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220414224056.2875681-1-khazhy@google.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- block/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/ioctl.c b/block/ioctl.c index 8171858dc8a9..7c578f84a4fd 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -683,7 +683,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if ((size >> 9) > ~(compat_ulong_t)0) return -EFBIG; return compat_put_ulong(argp, size >> 9); -- Gitee From 156b297acb77278b3f11077a9e8f92e81c87aebe Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 26 Jul 2022 17:39:34 +0800 Subject: [PATCH 097/132] Revert "net: micrel: fix KS8851_MLL Kconfig" stable inclusion from stable-v5.10.113 commit 7992fdb045fbc7cb0e34eba464b73044585c0638 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ISAH Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7992fdb045fbc7cb0e34eba464b73044585c0638 -------------------------------- This reverts commit 1ff5359afa5ec0dd09fe76183dc4fa24b50e4125 which is commit c3efcedd272aa6dd5929e20cf902a52ddaa1197a upstream. The upstream commit c3efcedd272a ("net: micrel: fix KS8851_MLL Kconfig") depends on e5f31552674e ("ethernet: fix PTP_1588_CLOCK dependencies") which is not part of Linux 5.10.y . Revert the aforementioned commit to prevent breakage in 5.10.y . Signed-off-by: Marek Vasut Cc: David S. 
Miller Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Randy Dunlap Cc: Sasha Levin Cc: # 5.10.x Signed-off-by: Greg Kroah-Hartman Signed-off-by: Zheng Zengkai Acked-by: Xie XiuQi --- drivers/net/ethernet/micrel/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig index 9ceb7e1fb169..42bc014136fe 100644 --- a/drivers/net/ethernet/micrel/Kconfig +++ b/drivers/net/ethernet/micrel/Kconfig @@ -37,7 +37,6 @@ config KS8851 config KS8851_MLL tristate "Micrel KS8851 MLL" depends on HAS_IOMEM - depends on PTP_1588_CLOCK_OPTIONAL select MII select CRC32 select EEPROM_93CX6 -- Gitee From 03a99552eb74a789de7de2a13b0ea9b31dcf2dfa Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 28 Jul 2022 18:06:31 +0800 Subject: [PATCH 098/132] bpf: Don't redirect packets with invalid pkt_len mainline inclusion from mainline-v5.19-rc6 commit fd1894224407c484f652ad456e1ce423e89bb3eb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HWKR CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fd1894224407 -------------------------------- Syzbot found an issue [1]: fq_codel_drop() try to drop a flow whitout any skbs, that is, the flow->head is null. The root cause, as the [2] says, is because that bpf_prog_test_run_skb() run a bpf prog which redirects empty skbs. So we should determine whether the length of the packet modified by bpf prog or others like bpf_prog_test is valid before forwarding it directly. LINK: [1] https://syzkaller.appspot.com/bug?id=0b84da80c2917757915afa89f7738a9d16ec96c5 LINK: [2] https://www.spinics.net/lists/netdev/msg777503.html Reported-by: syzbot+7a12909485b94426aceb@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220715115559.139691-1-shaozhengchao@huawei.com Signed-off-by: Alexei Starovoitov Reviewed-by: Wei Yongjun Reviewed-by: Yue Haibing Signed-off-by: Zheng Zengkai --- include/linux/skbuff.h | 8 ++++++++ net/bpf/test_run.c | 3 +++ net/core/dev.c | 1 + 3 files changed, 12 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 68efccc15a87..72cfb047fd2c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2251,6 +2251,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_assert_len(struct sk_buff *skb) +{ +#ifdef CONFIG_DEBUG_NET + if (WARN_ONCE(!skb->len, "%s\n", __func__)) + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); +#endif /* CONFIG_DEBUG_NET */ +} + /* * Add data to an sk_buff */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 99712d35e535..f266a9453c8e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -398,6 +398,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + if (!skb->len) + return -EINVAL; + if (!__skb) return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 12089c484b30..8e4de36eede8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4094,6 +4094,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) bool again = false; skb_reset_mac_header(skb); + skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); -- Gitee From 92561effc05f39fbf33d80fd0f9e5978fb7b3a71 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: 
Thu, 28 Jul 2022 18:06:32 +0800 Subject: [PATCH 099/132] ucounts: add missing data type changes mainline inclusion from mainline-v5.14-rc6 commit f153c2246783ba210493054d99c66353f56423c9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5IDIC CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f153c2246783ba210493054d99c66353f56423c9 -------------------------------- commit f9c82a4ea89c3 ("Increase size of ucounts to atomic_long_t") changed the data type of ucounts/ucounts_max to long, but missed to adjust a few other places. This is noticeable on big endian platforms from user space because the /proc/sys/user/max_*_names files all contain 0. v4 - Made the min and max constants long so the sysctl values are actually settable on little endian machines. -- EWB Fixes: f9c82a4ea89c ("Increase size of ucounts to atomic_long_t") Signed-off-by: Sven Schnelle Tested-by: Nathan Chancellor Tested-by: Linux Kernel Functional Testing Acked-by: Alexey Gladkov v1: https://lkml.kernel.org/r/20210721115800.910778-1-svens@linux.ibm.com v2: https://lkml.kernel.org/r/20210721125233.1041429-1-svens@linux.ibm.com v3: https://lkml.kernel.org/r/20210730062854.3601635-1-svens@linux.ibm.com Link: https://lkml.kernel.org/r/8735rijqlv.fsf_-_@disp2133 Signed-off-by: Eric W. Biederman Conflict: fs/notify/fanotify/fanotify_user.c Signed-off-by: Li Nan Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/notify/inotify/inotify_user.c | 17 +++++++++++------ kernel/ucount.c | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5f6c6bf65909..f13b64729d08 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -46,22 +46,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include +static long it_zero = 0; +static long it_int_max = INT_MAX; + struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_queued_events", diff --git a/kernel/ucount.c b/kernel/ucount.c index dff1d9b739d2..1f5825b674d8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -52,14 +52,17 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; -#define UCOUNT_ENTRY(name) \ - { \ - .procname = name, \ - .maxlen = sizeof(int), \ - .mode = 0644, \ - .proc_handler = proc_dointvec_minmax, \ - .extra1 = SYSCTL_ZERO, \ - .extra2 = SYSCTL_INT_MAX, \ +static long ue_zero = 0; +static long ue_int_max = INT_MAX; + +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(long), \ + .mode = 0644, \ + .proc_handler = proc_doulongvec_minmax, \ + .extra1 = &ue_zero, \ + .extra2 = &ue_int_max, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), -- Gitee From a51eb65ba1c5c5fcf7c026732d5ace1f9cbaf8f9 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Thu, 28 Jul 
2022 18:06:33 +0800 Subject: [PATCH 100/132] block: Fix warning in bd_link_disk_holder() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Warning reports as follows: WARNING: CPU: 3 PID: 674 at fs/block_dev.c:1272 bd_link_disk_holder+0xcd/0x270 Modules linked in: null_blk(+) CPU: 3 PID: 674 Comm: dmsetup Not tainted 5.10.0-16691-gf6076432827d-dirty #158 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 RIP: 0010:bd_link_disk_holder+0xcd/0x270 Code: 69 73 ee 00 44 89 e8 5b 48 83 05 c5 bf 6d 0c 01 5d 41 5c 41 5d 41 5e 41 8 RSP: 0018:ffffc9000049bbb8 EFLAGS: 00010202 RAX: ffff888104e39038 RBX: ffff888104185000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffaa085692 RDI: 0000000000000000 RBP: ffff88810cc2ae00 R08: ffffffffa853659b R09: 0000000000000000 R10: ffffc9000049bbb0 R11: 720030626c6c756e R12: ffff88810e800000 R13: ffff88810e800090 R14: ffff888103570c98 R15: ffff888103570c80 FS: 00007fb49dc13dc0(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff994ebde70 CR3: 000000010d54a000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: dm_get_table_device+0x175/0x300 dm_get_device+0x238/0x360 linear_ctr+0xee/0x170 dm_table_add_target+0x199/0x4b0 table_load+0x18c/0x480 ? table_clear+0x190/0x190 ctl_ioctl+0x21d/0x640 ? check_preemption_disabled+0x140/0x150 dm_ctl_ioctl+0x12/0x20 __se_sys_ioctl+0xb1/0x100 __x64_sys_ioctl+0x1e/0x30 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This can reproduce by concurrent operations: 1. modprobe null_blk 2. echo -e "0 10000 linear /dev/nullb0 0" > table dmsetup create xxx table t1: create disk a | t2: dm setup | device_add_disk | dev->devt = devt | | dm_get_table_device | open_table_device | blkdev_get_by_dev -> succeed | bd_link_disk_holder | -> holder_dir is still NULL register_disk -> create holder_dir kobject_create_and_add device_add_disk() will set devt before creating holder_dir, which leaves a window that dm_get_table_device() can find the disk by devt while it's holder_dir is NULL. So move GENHD_FL_UP in blk_register_queue() to avoid this warning and fix a NULL-ptr in __blk_mq_sched_bio_merge(). Signed-off-by: Luo Meng Reviewed-by: Jason Yan Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 6 ++++++ block/genhd.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0a4fcbda8ab4..aff53c3ae836 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -910,6 +910,12 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); + + /* + * Set the flag at last, so that block devcie can't be opened + * before it's registration is done. 
+ */ + disk->flags |= GENHD_FL_UP; ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/genhd.c b/block/genhd.c index 8b37fcfa10d1..9d91f880ea95 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -799,8 +799,6 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, WARN_ON(!disk->minors && !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); - disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); if (retval) { WARN_ON(1); -- Gitee From d7c2ddc8456d91991dbc87da304742d60d577b41 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 28 Jul 2022 18:06:34 +0800 Subject: [PATCH 101/132] block: fix that part scan is disabled in device_add_disk() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Patch ("block: Fix warning in bd_link_disk_holder()") moves the setting of flag 'GENHD_FL_UP' behind blkdev_get, which will disabled part scan: devcie_add_disk register_disk blkdev_get __blkdev_get bdev_get_gendisk get_gendisk -> failed because 'GENHD_FL_UP' is not set And this will cause tests block/017, block/018 and scsi/004 to fail. Fix the problem by moving part scan as well. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 33 +++++++++++++++++++++++++++++++++ block/genhd.c | 27 --------------------------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index aff53c3ae836..f7cd16cec0ed 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -821,6 +821,38 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void disk_init_partition(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_scan_partitions(disk); + + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -916,6 +948,7 @@ int blk_register_queue(struct gendisk *disk) * before it's registration is done. 
*/ disk->flags |= GENHD_FL_UP; + disk_init_partition(disk); ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/genhd.c b/block/genhd.c index 9d91f880ea95..021c9c2d7231 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -687,25 +687,10 @@ static int exact_lock(dev_t devt, void *data) return 0; } -static void disk_scan_partitions(struct gendisk *disk) -{ - struct block_device *bdev; - - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; - - set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); -} - static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct hd_struct *part; int err; ddev->parent = parent; @@ -743,18 +728,6 @@ static void register_disk(struct device *parent, struct gendisk *disk, if (disk->flags & GENHD_FL_HIDDEN) return; - disk_scan_partitions(disk); - - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); - if (disk->queue->backing_dev_info->dev) { err = sysfs_create_link(&ddev->kobj, &disk->queue->backing_dev_info->dev->kobj, -- Gitee From 67b9d2775f9adafd9a4f5d8a7cc05f9cce884f1a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 28 Jul 2022 18:06:35 +0800 Subject: [PATCH 102/132] block: prevent lockdep false positive warning about 'bd_mutex' hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA -------------------------------- Patch ("block: fix that part scan is disabled in device_add_disk()") confuse lockdep to produce following warning: ===================================================== WARNING: possible circular locking dependency detected 4.18.0+ #2 Tainted: G ---------r- - ------------------------------------------------------ syz-executor.0/4652 is trying to acquire lock: 00000000ad5f5a19 (&mddev->open_mutex){+.+.}, at: md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626 but task is already holding lock: 000000005c3a3fea (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&bdev->bd_mutex){+.+.}: __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 disk_init_partition home/install/linux-rh-3-10/block/blk-sysfs.c:972 [inline] blk_register_queue+0x5ed/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1055 __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729 sd_probe_async+0x447/0x852 home/install/linux-rh-3-10/drivers/scsi/sd.c:3249 async_run_entry_fn+0xe1/0x700 home/install/linux-rh-3-10/kernel/async.c:127 process_one_work+0x9cf/0x1940 home/install/linux-rh-3-10/kernel/workqueue.c:2175 worker_thread+0x91/0xc50 home/install/linux-rh-3-10/kernel/workqueue.c:2321 kthread+0x33a/0x400 home/install/linux-rh-3-10/kernel/kthread.c:257 ret_from_fork+0x3a/0x50 home/install/linux-rh-3-10/arch/x86/entry/entry_64.S:355 -> #1 (&q->sysfs_dir_lock){+.+.}: __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 blk_register_queue+0x143/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1010 __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729 add_disk home/install/linux-rh-3-10/./include/linux/genhd.h:447 [inline] md_alloc+0xb06/0x10d0 home/install/linux-rh-3-10/drivers/md/md.c:5525 md_probe+0x32/0x60 home/install/linux-rh-3-10/drivers/md/md.c:5554 kobj_lookup+0x2d2/0x450 home/install/linux-rh-3-10/drivers/base/map.c:152 get_gendisk+0x3b/0x360 home/install/linux-rh-3-10/block/genhd.c:860 bdev_get_gendisk home/install/linux-rh-3-10/fs/block_dev.c:1181 [inline] __blkdev_get+0x3b6/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1578 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923 do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777 do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline] path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578 do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613 do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf -> #0 (&mddev->open_mutex){+.+.}: lock_acquire+0x10b/0x3a0 home/install/linux-rh-3-10/kernel/locking/lockdep.c:3868 __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626 __blkdev_get+0x2dc/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1599 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923 do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777 do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline] path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578 do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613 do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf other info 
that might help us debug this: Chain exists of: &mddev->open_mutex --> &q->sysfs_dir_lock --> &bdev->bd_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&bdev->bd_mutex); lock(&q->sysfs_dir_lock); lock(&bdev->bd_mutex); lock(&mddev->open_mutex); *** DEADLOCK *** Since 'bd_mutex' and 'sysfs_dir_lock' is different is for each device, deadlock between md_open() and sd_probe_async() is impossible. However, lockdep is treating 'bd_mutex' and 'sysfs_dir_lock' from different devices the same, and patch "block: fix that part scan is disabled in device_add_disk()" is holding 'bd_mutex' inside 'sysfs_dir_lock', which causes the false positive warning. Fix the false positive warning by don't grab 'bd_mutex' inside 'sysfs_dir_lock'. Signed-off-by: Yu Kuai Reviewed-by: Jason Yan Signed-off-by: Zheng Zengkai --- block/blk-sysfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f7cd16cec0ed..548d758365c6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -948,10 +948,17 @@ int blk_register_queue(struct gendisk *disk) * before it's registration is done. */ disk->flags |= GENHD_FL_UP; - disk_init_partition(disk); ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); + /* + * Init partitions after releasing 'sysfs_dir_lock', otherwise lockdep + * will be confused because it will treat 'bd_mutex' from different + * devices as the same lock. + */ + if (!ret) + disk_init_partition(disk); + return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); -- Gitee From 77eabe0b93e9dd68b400d52c9caa9282ca6094df Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 28 Jul 2022 18:06:36 +0800 Subject: [PATCH 103/132] inotify: show inotify mask flags in proc fdinfo mainline inclusion from mainline-v5.19-rc1 commit a32e697cda27679a0327ae2cafdad8c7170f548f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5IHD1 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=a32e697cda27679a0327ae2cafdad8c7170f548f -------------------------------- The inotify mask flags IN_ONESHOT and IN_EXCL_UNLINK are not "internal to kernel" and should be exposed in procfs fdinfo so CRIU can restore them. Fixes: 6933599697c9 ("inotify: hide internal kernel bits from fdinfo") Link: https://lore.kernel.org/r/20220422120327.3459282-2-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Signed-off-by: Li Nan Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/notify/fdinfo.c | 11 ++--------- fs/notify/inotify/inotify.h | 12 ++++++++++++ fs/notify/inotify/inotify_user.c | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index f0d6b54be412..765b50aeadd2 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,16 +83,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { - /* - * IN_ALL_EVENTS represents all of the mask bits - * that we expose to userspace. There is at - * least one bit (FS_EVENT_ON_CHILD) which is - * used only internally to the kernel. 
- */ - u32 mask = mark->mask & IN_ALL_EVENTS; - seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", + seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mask, mark->ignored_mask); + inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 2007e3711916..8f00151eb731 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -22,6 +22,18 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct inotify_event_info, fse); } +/* + * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to + * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ +#define INOTIFY_USER_MASK (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK) + +static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark) +{ + return fsn_mark->mask & INOTIFY_USER_MASK; +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f13b64729d08..3986f1877457 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -93,7 +93,7 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) mask |= FS_EVENT_ON_CHILD; /* mask off the flags used to open the fd */ - mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + mask |= (arg & INOTIFY_USER_MASK); return mask; } -- Gitee From c248fb22ed21b7bf4308bfcdf962d11b4f73f62c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:37 +0800 Subject: [PATCH 104/132] fbcon: Disallow setting font bigger than screen size stable inclusion from stable-v5.10.130 commit b727561ddc9360de9631af2d970d8ffed676a750 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b727561ddc9360de9631af2d970d8ffed676a750 -------------------------------- commit 65a01e601dbba8b7a51a2677811f70f783766682 upstream. Prevent that users set a font size which is bigger than the physical screen. It's unlikely this may happen (because screens are usually much larger than the fonts and each font char is limited to 32x32 pixels), but it may happen on smaller screens/LCD displays. Signed-off-by: Helge Deller Reviewed-by: Daniel Vetter Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbcon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index f102519ccefb..8d81e9321cf7 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2510,6 +2510,11 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font, if (charcount != 256 && charcount != 512) return -EINVAL; + /* font bigger than screen resolution ? 
*/ + if (w > FBCON_SWAP(info->var.rotate, info->var.xres, info->var.yres) || + h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres)) + return -EINVAL; + /* Make sure drawing engine can handle the font */ if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || !(info->pixmap.blit_y & (1 << (font->height - 1)))) -- Gitee From e664e9804a489f76e72be7601f53654b7d29014f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:38 +0800 Subject: [PATCH 105/132] fbcon: Prevent that screen size is smaller than font size stable inclusion from stable-v5.10.130 commit cecb806c766c78e1be62b6b7b1483ef59bbaeabe category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cecb806c766c78e1be62b6b7b1483ef59bbaeabe -------------------------------- commit e64242caef18b4a5840b0e7a9bff37abd4f4f933 upstream. We need to prevent that users configure a screen size which is smaller than the currently selected font size. Otherwise rendering chars on the screen will access memory outside the graphics memory region. This patch adds a new function fbcon_modechange_possible() which implements this check and which later may be extended with other checks if necessary. The new function is called from the FBIOPUT_VSCREENINFO ioctl handler in fbmem.c, which will return -EINVAL if userspace asked for a too small screen size. Signed-off-by: Helge Deller Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbcon.c | 28 ++++++++++++++++++++++++++++ drivers/video/fbdev/core/fbmem.c | 4 +++- include/linux/fbcon.h | 4 ++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 8d81e9321cf7..b4260a830e78 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2776,6 +2776,34 @@ void fbcon_update_vcs(struct fb_info *info, bool all) } EXPORT_SYMBOL(fbcon_update_vcs); +/* let fbcon check if it supports a new screen resolution */ +int fbcon_modechange_possible(struct fb_info *info, struct fb_var_screeninfo *var) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + unsigned int i; + + WARN_CONSOLE_UNLOCKED(); + + if (!ops) + return 0; + + /* prevent setting a screen size which is smaller than font size */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (!vc || vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[i]] != info) + continue; + + if (vc->vc_font.width > FBCON_SWAP(var->rotate, var->xres, var->yres) || + vc->vc_font.height > FBCON_SWAP(var->rotate, var->yres, var->xres)) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fbcon_modechange_possible); + int fbcon_mode_deleted(struct fb_info *info, struct fb_videomode *mode) { diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 00939ca2065a..d2cffaf3fda8 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1109,7 +1109,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EFAULT; console_lock(); lock_fb_info(info); - ret = fb_set_var(info, &var); + ret = fbcon_modechange_possible(info, &var); + if (!ret) + ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); 
unlock_fb_info(info); diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h index ff5596dd30f8..2382dec6d6ab 100644 --- a/include/linux/fbcon.h +++ b/include/linux/fbcon.h @@ -15,6 +15,8 @@ void fbcon_new_modelist(struct fb_info *info); void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps); void fbcon_fb_blanked(struct fb_info *info, int blank); +int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var); void fbcon_update_vcs(struct fb_info *info, bool all); void fbcon_remap_all(struct fb_info *info); int fbcon_set_con2fb_map_ioctl(void __user *argp); @@ -33,6 +35,8 @@ static inline void fbcon_new_modelist(struct fb_info *info) {} static inline void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps) {} static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {} +static inline int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var) { return 0; } static inline void fbcon_update_vcs(struct fb_info *info, bool all) {} static inline void fbcon_remap_all(struct fb_info *info) {} static inline int fbcon_set_con2fb_map_ioctl(void __user *argp) { return 0; } -- Gitee From 9ba05bbca9335251bfdaa95a344034c31969c8a0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Jul 2022 18:06:39 +0800 Subject: [PATCH 106/132] fbmem: Check virtual screen sizes in fb_set_var() stable inclusion from stable-v5.10.130 commit b81212828ad19ab3eccf00626cd04099215060bf category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b81212828ad19ab3eccf00626cd04099215060bf -------------------------------- commit 6c11df58fd1ac0aefcb3b227f72769272b939e56 upstream. Verify that the fbdev or drm driver correctly adjusted the virtual screen sizes. On failure report the failing driver and reject the screen size change. Signed-off-by: Helge Deller Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Greg Kroah-Hartman Signed-off-by: Chen Jun Reviewed-by: Xiu Jianfeng Reviewed-by: Weilong Chen Signed-off-by: Zheng Zengkai --- drivers/video/fbdev/core/fbmem.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index d2cffaf3fda8..3b3ccb235522 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1019,6 +1019,16 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) if (ret) return ret; + /* verify that virtual resolution >= physical resolution */ + if (var->xres_virtual < var->xres || + var->yres_virtual < var->yres) { + pr_warn("WARNING: fbcon: Driver '%s' missed to adjust virtual screen size (%ux%u vs. 
%ux%u)\n", + info->fix.id, + var->xres_virtual, var->yres_virtual, + var->xres, var->yres); + return -EINVAL; + } + if ((var->activate & FB_ACTIVATE_MASK) != FB_ACTIVATE_NOW) return 0; -- Gitee From 89e9ad6dfaef94bddf0aa0dcd1e2c0bd69dcbb57 Mon Sep 17 00:00:00 2001 From: Eric Snowberg Date: Thu, 28 Jul 2022 18:06:40 +0800 Subject: [PATCH 107/132] lockdown: Fix kexec lockdown bypass with ima policy mainline inclusion from mainline-v5.19-rc8 commit 543ce63b664e2c2f9533d089a4664b559c3e6b5b category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5I0FP CVE: CVE-2022-21505 Reference: https://seclists.org/oss-sec/2022/q3/57 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=543ce63b664e2c2f9533d089a4664b559c3e6b5b -------------------------------- The lockdown LSM is primarily used in conjunction with UEFI Secure Boot. This LSM may also be used on machines without UEFI. It can also be enabled when UEFI Secure Boot is disabled. One of lockdown's features is to prevent kexec from loading untrusted kernels. Lockdown can be enabled through a bootparam or after the kernel has booted through securityfs. If IMA appraisal is used with the "ima_appraise=log" boot param, lockdown can be defeated with kexec on any machine when Secure Boot is disabled or unavailable. IMA prevents setting "ima_appraise=log" from the boot param when Secure Boot is enabled, but this does not cover cases where lockdown is used without Secure Boot. To defeat lockdown, boot without Secure Boot and add ima_appraise=log to the kernel command line; then: $ echo "integrity" > /sys/kernel/security/lockdown $ echo "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig" > \ /sys/kernel/security/ima/policy $ kexec -ls unsigned-kernel Add a call to verify ima appraisal is set to "enforce" whenever lockdown is enabled. This fixes CVE-2022-21505. Cc: stable@vger.kernel.org Fixes: 29d3c1c8dfe7 ("kexec: Allow kexec_file() with appropriate IMA policy when locked down") Signed-off-by: Eric Snowberg Acked-by: Mimi Zohar Reviewed-by: John Haxby Signed-off-by: Linus Torvalds Signed-off-by: GUO Zihua Reviewed-by: Xiu Jianfeng Reviewed-by: Wang Weiyang Signed-off-by: Zheng Zengkai --- security/integrity/ima/ima_policy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 5a06050729a7..b1ab4b3d99fb 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1900,6 +1900,10 @@ bool ima_appraise_signature(enum kernel_read_file_id id) if (id >= READING_MAX_ID) return false; + if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) + && security_locked_down(LOCKDOWN_KEXEC)) + return false; + func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); -- Gitee From 6658b33b26920a82df3203c6ecefd00b5820e921 Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Thu, 28 Jul 2022 18:06:41 +0800 Subject: [PATCH 108/132] ubifs: Fix the issue that UBIFS be read-only due to truncate in the encrypted directory hulk inclusion category: bugfix bugzilla: 187163, https://gitee.com/openeuler/kernel/issues/I5GBC4 CVE: NA -------------------------------- The ubifs_compress() function does not compress the data When the data length is short than 128 bytes or the compressed data length is not ideal.It cause that the compressed length of the truncated data in the truncate_data_node() function may be greater than the length of the raw data read from the flash. 
The above two lengths are transferred to the ubifs_encrypt() function as parameters. This may lead to assertion fails and then the file system becomes read-only. This patch use the actual length of the data in the memory as the input parameter for assert comparison, which avoids the problem. Signed-off-by: ZhaoLong Wang Reviewed-by: zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/ubifs/crypto.c | 11 +++++++++++ fs/ubifs/journal.c | 29 +++++++++++++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..1dc22cf29c65 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } +/** + * ubifs_encrypt - Encrypt data. + * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of data to be compressed + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypt a possibly-compressed data in the data node. + * The encrypted data length will store in @out_len. + */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 72586512e51f..ee9888087983 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1472,23 +1472,25 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. 
*/ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in } if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out; - out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1549,7 +1551,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, struct ubifs_ino_node *ino; struct ubifs_trun_node *trun; struct ubifs_data_node *dn; - int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int err, dlen, len, lnum, offs, bit, sz, dn_size, sync = IS_SYNC(inode); struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1564,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); - sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; + + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE; - sz += ubifs_auth_node_sz(c); + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c); ino = kmalloc(sz, GFP_NOFS); if (!ino) @@ -1596,15 +1601,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; } if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; } -- Gitee From baf97eb4ed59819bb8e2d59570057de458a351d1 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Thu, 28 Jul 2022 18:06:42 +0800 Subject: [PATCH 109/132] ubifs: Fix AA deadlock when setting xattr for encrypted file hulk inclusion category: bugfix bugzilla: 187250, https://gitee.com/openeuler/kernel/issues/I5HSMS CVE: NA ------------------------------------------------- Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. 
[ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng Reviewed-by: Zhang Yi Signed-off-by: Zheng Zengkai --- fs/ubifs/dir.c | 25 ++++++++++++++----------- fs/ubifs/ubifs.h | 2 +- fs/ubifs/xattr.c | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f5777f59a101..acd7e83a35e4 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and * initializes it. Returns new inode in case of success and an error code in * case of failure. 
*/ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; - err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } } switch (mode & S_IFMT) { @@ -308,7 +311,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -369,7 +372,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -461,7 +464,7 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, return err; } - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -1002,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1089,7 +1092,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1171,7 +1174,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 9a4a3191ed07..15cfee7c1125 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1996,7 +1996,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 258b97939ba2..81efe638acd5 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; -- Gitee From 156e60a42ffffe594f7cf38df51ee07b7f0fc99c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:06:43 +0800 Subject: [PATCH 110/132] Revert "mm/dynamic_hugetlb: disable dynamic hugetlb if hugetlb_vmemmap is enabled" hulk 
inclusion category: bugfix bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA -------------------------------- Will disable hugetlb_vmemmap when dynamic hugetlb is enabled in later patch. This reverts commit c7ae7c0dd37aa112f9cb3e23878f4c5fe67f86bc. Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/dynamic_hugetlb.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index eb9b528b73de..f8ebc8ab7d60 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1133,17 +1133,6 @@ void __init dynamic_hugetlb_init(void) if (!enable_dhugetlb) return; - /* - * The dynamic_hugetlb feature need to split and merge pages frequently. - * hugetlb_vmemmap will affects the perforemance of page split and merge. - * If want to use dynamic hugetlb, please disable hugetlb_vmemmap. - */ - if (hugetlb_free_vmemmap_enabled) { - enable_dhugetlb = false; - pr_info("Please set hugetlb_free_vmemmap=off if want to enable dynamic hugetlb\n"); - return; - } - count = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT); size = sizeof(struct dhugetlb_pagelist) + count * sizeof(struct dhugetlb_pool *); dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL); -- Gitee From 2706bcc0550ce47604e18a68f6bd8d0f78f0136d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Jul 2022 18:06:44 +0800 Subject: [PATCH 111/132] jump_label: Provide CONFIG-driven build state defaults mainline inclusion from mainline-v5.13-rc1 commit 0d66ccc1627013c95f1e7ef10b95b8451cd7834e category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0d66ccc1627013c95f1e7ef10b95b8451cd7834e -------------------------------- As shown in the comment in jump_label.h, choosing the initial state of static branches changes the assembly layout. If the condition is expected to be likely it's inline, and if unlikely it is out of line via a jump. A few places in the kernel use (or could be using) a CONFIG to choose the default state, which would give a small performance benefit to their compile-time declared default. Provide the infrastructure to do this. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210401232347.2791257-2-keescook@chromium.org Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/jump_label.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7e1dce5670fc..d01c23025af0 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -388,6 +388,21 @@ struct static_key_false { [0 ... 
(count) - 1] = STATIC_KEY_FALSE_INIT, \ } +#define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) +#define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) +#define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) + +#define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) +#define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) +#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) + +#define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) +#define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) +#define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) + extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ @@ -488,6 +503,10 @@ extern bool ____wrong_branch_error(void); #endif /* CONFIG_JUMP_LABEL */ +#define static_branch_maybe(config, x) \ + (IS_ENABLED(config) ? static_branch_likely(x) \ + : static_branch_unlikely(x)) + /* * Advanced usage; refcount, branch is enabled when: count != 0 */ -- Gitee From 2722aa58c0fe3bd8c73b80786b1120e317c41365 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Jul 2022 18:06:45 +0800 Subject: [PATCH 112/132] mm: make compound_head const-preserving mainline inclusion from mainline-v5.14-rc1 commit 0f2317e34e2c7b97efd4600122115410795ebeea category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0f2317e34e2c7b97efd4600122115410795ebeea -------------------------------- If you pass a const pointer to compound_head(), you get a const pointer back; if you pass a mutable pointer, you get a mutable pointer back. Also remove an unnecessary forward definition of struct page; we're about to dereference page->compound_head, so it must already have been defined. 
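The const-preservation comes from the typeof() cast in the new macro: _compound_head() itself takes a const struct page pointer and returns a plain unsigned long, and compound_head() casts that value back to whatever pointer type the caller handed in. The same trick in a stand-alone, user-space form (demo names only, not the kernel's types or helpers):

	struct page { unsigned long compound_head; };

	/* helper accepts const and returns the address as an integer */
	static inline unsigned long _demo_head(const struct page *page)
	{
		return (unsigned long)page;	/* stand-in for the real head lookup */
	}

	/* typeof(page) re-applies the caller's own pointer type, const included */
	#define demo_head(page) ((typeof(page))_demo_head(page))

	int main(void)
	{
		struct page p = { 0 };
		const struct page *cp = &p;
		struct page *mp = &p;

		const struct page *ch = demo_head(cp);	/* const in  -> const out */
		struct page *mh = demo_head(mp);	/* mutable in -> mutable out */

		return (ch == cp && mh == mp) ? 0 : 1;
	}

Assigning the result of a const call to a non-const pointer now draws the usual "discards const qualifier" warning, which is the behaviour described above.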
Link: https://lkml.kernel.org/r/20210416231531.2521383-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/page-flags.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 65e1cbe1d1ce..b40b4e0ada31 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,17 +194,17 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -struct page; /* forward declaration */ - -static inline struct page *compound_head(struct page *page) +static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) - return (struct page *) (head - 1); - return page; + return head - 1; + return (unsigned long)page; } +#define compound_head(page) ((typeof(page))_compound_head(page)) + static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; -- Gitee From fe0aed0230fbd8246fb028d126548fa69b4f9bf4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:46 +0800 Subject: [PATCH 113/132] mm: hugetlb: free the 2nd vmemmap page associated with each HugeTLB page mainline inclusion from mainline-v5.18-rc1 commit e7d324850bfcb30df563d144c0363cc44595277d category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e7d324850bfcb30df563d144c0363cc44595277d -------------------------------- Patch series "Free the 2nd vmemmap page associated with each HugeTLB page", v7. This series can minimize the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB. It is a nice gain. Comments and reviews are welcome. Thanks. The main implementation and details can refer to the commit log of patch 1. In this series, I have changed the following four helpers, the following table shows the impact of the overhead of those helpers. +------------------+-----------------------+ | APIs | head page | tail page | +------------------+-----------+-----------+ | PageHead() | Y | N | +------------------+-----------+-----------+ | PageTail() | Y | N | +------------------+-----------+-----------+ | PageCompound() | N | N | +------------------+-----------+-----------+ | compound_head() | Y | N | +------------------+-----------+-----------+ Y: Overhead is increased. N: Overhead is _NOT_ increased. It shows that the overhead of those helpers on a tail page don't change between "hugetlb_free_vmemmap=on" and "hugetlb_free_vmemmap=off". But the overhead on a head page will be increased when "hugetlb_free_vmemmap=on" (except PageCompound()). So I believe that Matthew Wilcox's folio series will help with this. The users of PageHead() and PageTail() are much less than compound_head() and most users of PageTail() are VM_BUG_ON(), so I have done some tests about the overhead of compound_head() on head pages. I have tested the overhead of calling compound_head() on a head page, which is 2.11ns (Measure the call time of 10 million times compound_head(), and then average). 
For a head page whose address is not aligned with PAGE_SIZE or a non-compound page, the overhead of compound_head() is 2.54ns which is increased by 20%. For a head page whose address is aligned with PAGE_SIZE, the overhead of compound_head() is 2.97ns which is increased by 40%. Most pages are the former. I do not think the overhead is significant since the overhead of compound_head() itself is low. This patch (of 5): This patch minimizes the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB (2MB type). After the feature of "Free sonme vmemmap pages of HugeTLB page" is enabled, the mapping of the vmemmap addresses associated with a 2MB HugeTLB page becomes the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | -------------> | 1 | | | +-----------+ +-----------+ | | | 2 | ----------------^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | 3 | ------------------+ | | | | | | +-----------+ | | | | | | | 4 | --------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | ----------------------+ | | | | +-----------+ | | | | | 6 | ------------------------+ | | | +-----------+ | | | | 7 | --------------------------+ | | +-----------+ | | | | | | +-----------+ As we can see, the 2nd vmemmap page frame (indexed by 1) is reused and remaped. However, the 2nd vmemmap page frame is also can be freed to the buddy allocator, then we can change the mapping from the figure above to the figure below. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+---> PG_head | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | | 2 | -----------------+ | | | | | | | +-----------+ | | | | | | | | 3 | -------------------+ | | | | | | +-----------+ | | | | | | | 4 | ---------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | -----------------------+ | | | | +-----------+ | | | | | 6 | -------------------------+ | | | +-----------+ | | | | 7 | ---------------------------+ | | +-----------+ | | | | | | +-----------+ After we do this, all tail vmemmap pages (1-7) are mapped to the head vmemmap page frame (0). In other words, there are more than one page struct with PG_head associated with each HugeTLB page. We __know__ that there is only one head page struct, the tail page structs with PG_head are fake head page structs. We need an approach to distinguish between those two different types of page structs so that compound_head(), PageHead() and PageTail() can work properly if the parameter is the tail page struct but with PG_head. The following code snippet describes how to distinguish between real and fake head page struct. if (test_bit(PG_head, &page->flags)) { unsigned long head = READ_ONCE(page[1].compound_head); if (head & 1) { if (head == (unsigned long)page + 1) ==> head page struct else ==> tail page struct } else ==> head page struct } We can safely access the field of the @page[1] with PG_head because the @page is a compound page composed with at least two contiguous pages. 
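A purely illustrative walk-through with made-up numbers (assuming the common 64-byte struct page and a vmemmap base address B for this HugeTLB): the struct pages stored in vmemmap page 1 start at virtual address B + 4096. After the remapping above, a read at B + 4096 returns the contents of struct page 0, so PG_head appears set there, i.e. it is a fake head. Its neighbour at B + 4096 + 64 returns the contents of struct page 1, whose compound_head is B | 1. Since B + 1 != (B + 4096) + 1, the snippet above classifies B + 4096 as a tail page struct and compound_head() on it resolves to B. For the real head at B, page[1].compound_head is again B + 1, which equals (unsigned long)page + 1, so it is correctly reported as a head page struct.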
[songmuchun@bytedance.com: restore lost comment changes] Link: https://lkml.kernel.org/r/20211101031651.75851-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20211101031651.75851-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Mike Kravetz Cc: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Chen Huang Cc: Bodeddula Balasubramaniam Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Xiongchun Duan Cc: Fam Zheng Cc: Qi Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: include/linux/page-flags.h Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- .../admin-guide/kernel-parameters.txt | 2 +- include/linux/page-flags.h | 73 ++++++++++++++++++- mm/hugetlb_vmemmap.c | 62 +++++++++------- mm/sparse-vmemmap.c | 21 ++++++ 4 files changed, 125 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b04cf8fbab4..e0957b73f63d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1596,7 +1596,7 @@ [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more - memory (6 * PAGE_SIZE for each 2MB hugetlb page). + memory (7 * PAGE_SIZE for each 2MB hugetlb page). Format: { on | off (default) } on: enable the feature diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b40b4e0ada31..7b31a81be7e1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,25 +194,82 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +extern bool hugetlb_free_vmemmap_enabled; + +/* + * If the feature of freeing some vmemmap pages associated with each HugeTLB + * page is enabled, the head vmemmap page frame is reused and all of the tail + * vmemmap addresses map to the head vmemmap page frame (furture details can + * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other + * words, there are more than one page struct with PG_head associated with each + * HugeTLB page. We __know__ that there is only one head page struct, the tail + * page structs with PG_head are fake head page structs. We need an approach + * to distinguish between those two different types of page structs so that + * compound_head() can return the real head page struct when the parameter is + * the tail page struct but with PG_head. + * + * The page_fixed_fake_head() returns the real head page struct if the @page is + * fake page head, otherwise, returns @page which can either be a true page + * head or tail. + */ +static __always_inline const struct page *page_fixed_fake_head(const struct page *page) +{ + if (!hugetlb_free_vmemmap_enabled) + return page; + + /* + * Only addresses aligned with PAGE_SIZE of struct page may be fake head + * struct page. The alignment check aims to avoid access the fields ( + * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * cold cacheline in some cases. + */ + if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && + test_bit(PG_head, &page->flags)) { + /* + * We can safely access the field of the @page[1] with PG_head + * because the @page is a compound page composed with at least + * two contiguous pages. 
+ */ + unsigned long head = READ_ONCE(page[1].compound_head); + + if (likely(head & 1)) + return (const struct page *)(head - 1); + } + return page; +} +#else +static inline const struct page *page_fixed_fake_head(const struct page *page) +{ + return page; +} +#endif + +static __always_inline int page_is_fake_head(struct page *page) +{ + return page_fixed_fake_head(page) != page; +} + static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; - return (unsigned long)page; + return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) static __always_inline int PageTail(struct page *page) { - return READ_ONCE(page->compound_head) & 1; + return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(struct page *page) { - return test_bit(PG_head, &page->flags) || PageTail(page); + return test_bit(PG_head, &page->flags) || + READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l @@ -600,7 +657,15 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); } -__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +static __always_inline int PageHead(struct page *page) +{ + PF_POISONED_CHECK(page); + return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); +} + +__SETPAGEFLAG(Head, head, PF_ANY) +__CLEARPAGEFLAG(Head, head, PF_ANY) +CLEARPAGEFLAG(Head, head, PF_ANY) static __always_inline void set_compound_head(struct page *page, struct page *head) { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5..4977f5a520c2 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. 
* @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,7 +166,13 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt @@ -175,19 +181,21 @@ /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. 
*/ -#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +bool hugetlb_free_vmemmap_enabled __read_mostly = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); static int __init early_hugetlb_free_vmemmap_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ - if ((!is_power_of_2(sizeof(struct page)))) { + if (!is_power_of_2(sizeof(struct page))) { pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); return 0; } @@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) ClearHPageVmemmapOptimized(head); @@ -282,9 +289,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* - * The head page and the first tail page are not to be freed to buddy - * allocator, the other pages will map to the first tail page, so they - * can be freed. + * The head page is not to be freed to buddy allocator, the other tail + * pages will map to the head page, so they can be freed. * * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true * on some architectures (e.g. aarch64). See Documentation/arm64/ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 396a49462894..39bdbf0b28dd 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -245,6 +245,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, entry); } +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. 
+ */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { @@ -258,6 +278,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } -- Gitee From 9b4aa808afd36ebad9567413eddf5a0db8b8441f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:47 +0800 Subject: [PATCH 114/132] mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key mainline inclusion from mainline-v5.18-rc1 commit a6b40850c442bf996e729e1d441d3dbc37cea171 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a6b40850c442bf996e729e1d441d3dbc37cea171 -------------------------------- The page_fixed_fake_head() is used throughout memory management and the conditional check requires checking a global variable, although the overhead of this check may be small, it increases when the memory cache comes under pressure. Also, the global variable will not be modified after system boot, so it is very appropriate to use static key machanism. Link: https://lkml.kernel.org/r/20211101031651.75851-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/hugetlb.h | 6 ------ include/linux/page-flags.h | 16 ++++++++++++++-- mm/hugetlb_vmemmap.c | 12 ++++++------ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 634630ebc8a7..fd1dc8d29436 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1103,12 +1103,6 @@ static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, } #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; -#else -#define hugetlb_free_vmemmap_enabled false -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7b31a81be7e1..81061122af68 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -195,7 +195,14 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); + +static __always_inline bool hugetlb_free_vmemmap_enabled(void) +{ + return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + &hugetlb_free_vmemmap_enabled_key); +} /* * If the feature of freeing some vmemmap pages associated with each HugeTLB @@ -215,7 +222,7 @@ extern bool hugetlb_free_vmemmap_enabled; */ static __always_inline const 
struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return page; /* @@ -243,6 +250,11 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool hugetlb_free_vmemmap_enabled(void) +{ + return false; +} #endif static __always_inline int page_is_fake_head(struct page *page) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4977f5a520c2..791626983c2e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,9 +188,9 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled __read_mostly = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); static int __init early_hugetlb_free_vmemmap_param(char *buf) { @@ -204,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; + static_branch_enable(&hugetlb_free_vmemmap_enabled_key); else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); else return -EINVAL; @@ -284,7 +284,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; -- Gitee From b31b84bf5c6d282b2e3bdcf8dc6df4f395efcb19 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:48 +0800 Subject: [PATCH 115/132] mm: sparsemem: use page table lock to protect kernel pmd operations mainline inclusion from mainline-v5.18-rc1 commit d8d55f5616cf3b900a23a72dd24e7b07211e7859 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8d55f5616cf3b900a23a72dd24e7b07211e7859 -------------------------------- The init_mm.page_table_lock is used to protect kernel page tables, we can use it to serialize splitting vmemmap PMD mappings instead of mmap write lock, which can increase the concurrency of vmemmap_remap_free(). Actually, It increase the concurrency between allocations of HugeTLB pages. But it is not the only benefit. There are a lot of users of mmap read lock of init_mm. The mmap write lock is holding through vmemmap_remap_free(), removing mmap write lock usage to make it does not affect other users of mmap read lock. It is not making anything worse and always a win to move. Now the kernel page table walker does not hold the page_table_lock when walking pmd entries. There may be consistency issue of a pmd entry, because pmd entry might change from a huge pmd entry to a PTE page table. There is only one user of kernel page table walker, namely ptdump. The ptdump already considers the consistency, which use a local variable to cache the value of pmd entry. But we also need to update ->action to ACTION_CONTINUE to make sure the walker does not walk every pte entry again when concurrent thread has split the huge pmd. 
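The core of the new serialization can be sketched as follows (simplified from the hunk below, for readability only):

	/* Re-check the PMD under init_mm.page_table_lock and back out if
	 * another thread already split it while we were allocating pgtable. */
	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/* Make the new PTEs visible before the PMD. */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		/* Lost the race: the PMD was already split, drop our table. */
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);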
Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: mm/sparse-vmemmap.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/ptdump.c | 16 +++++++++++---- mm/sparse-vmemmap.c | 47 ++++++++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/mm/ptdump.c b/mm/ptdump.c index 93f2f63dc52d..43661863096b 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -39,8 +39,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val)); - if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -59,8 +61,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val)); - if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -79,8 +83,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val)); - if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -98,8 +104,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 39bdbf0b28dd..eef9053f948f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,8 +53,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in __pte_alloc(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in __pte_alloc(). 
*/ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); return 0; } +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* -- Gitee From 7d9a273f5bc8369c07ac2ba082984cb1d1e9b182 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:49 +0800 Subject: [PATCH 116/132] selftests: vm: add a hugetlb test case mainline inclusion from mainline-v5.18-rc1 commit b147c89cd429321a59147368378c8aba17c8480f category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b147c89cd429321a59147368378c8aba17c8480f -------------------------------- Since the head vmemmap page frame associated with each HugeTLB page is reused, we should hide the PG_head flag of tail struct page from the user. Add a tese case to check whether it is work properly. The test steps are as follows. 1) alloc 2MB hugeTLB 2) get each page frame 3) apply those APIs in each page frame 4) Those APIs work completely the same as before. Reading the flags of a page by /proc/kpageflags is done in stable_page_flags(), which has invoked PageHead(), PageTail(), PageCompound() and compound_head(). If those APIs work properly, the head page must have 15 and 17 bits set. And tail pages must have 16 and 17 bits set but 15 bit unset. Those flags are checked in check_page_flags(). 
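In terms of the raw /proc/kpageflags values, the masks the test checks work out to (1UL << 15) | (1UL << 17) = 0x28000 for the head page and (1UL << 16) | (1UL << 17) = 0x30000 for every tail page; the test only requires that these bits are set (and that bit 15 is clear on tails), other flag bits may be set as well.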
Link: https://lkml.kernel.org/r/20211101031651.75851-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Conflicts: tools/testing/selftests/vm/Makefile tools/testing/selftests/vm/run_vmtests.sh Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/hugepage-vmemmap.c | 144 ++++++++++++++++++ tools/testing/selftests/vm/run_vmtests | 11 ++ 4 files changed, 157 insertions(+) create mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index b3a183c36cb5..905dc4efa879 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap hugepage-shm +hugepage-vmemmap khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index b150cc837177..549761bc7193 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -27,6 +27,7 @@ TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c new file mode 100644 index 000000000000..557bdbd4f87e --- /dev/null +++ b/tools/testing/selftests/vm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include +#include +#include +#include +#include + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
+ */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. */ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index d578ad831813..949c71fe5951 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -108,6 +108,17 @@ else echo "[PASS]" fi +echo "------------------------" +echo "running hugepage-vmemmap" +echo "------------------------" +./hugepage-vmemmap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." 
-- Gitee From 22f969cffb0be543692d6942f41cf3735fb66790 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:50 +0800 Subject: [PATCH 117/132] mm: sparsemem: move vmemmap related to HugeTLB to CONFIG_HUGETLB_PAGE_FREE_VMEMMAP mainline inclusion from mainline-v5.18-rc1 commit e54084173487804f5e2f23facf107fd9336e637e category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e54084173487804f5e2f23facf107fd9336e637e -------------------------------- The vmemmap_remap_free/alloc are relevant to HugeTLB, so move those functiongs to the scope of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP. Link: https://lkml.kernel.org/r/20211101031651.75851-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Cc: Bodeddula Balasubramaniam Cc: Chen Huang Cc: David Hildenbrand Cc: Fam Zheng Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qi Zheng Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/mm.h | 2 ++ mm/sparse-vmemmap.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 25891b581bf4..efa3972a23bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,10 +3104,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, gfp_t gfp_mask); +#endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index eef9053f948f..e5ed2680ec57 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include #include +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -419,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map -- Gitee From 0a5d68bb279164b7ec023203dc9f0d6a59fc17f1 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:06:51 +0800 Subject: [PATCH 118/132] Revert "arm64: mm: hugetlb: add support for free vmemmap pages of HugeTLB" hulk inclusion category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO -------------------------------- There is a formal solution to support hugetlb vmemmap feature on arm64. This reverts commit 5838d235a395e221968e1f024bfaf8650278a522. 
Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/Kconfig b/fs/Kconfig index 6e723c90a506..37a8895a3254 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -239,7 +239,7 @@ config HUGETLB_PAGE config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 || ARM64 + depends on X86_64 depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON -- Gitee From e457e43c459e8ecc1679f5f356f11c84ada79bbf Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:52 +0800 Subject: [PATCH 119/132] mm: hugetlb_vmemmap: introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP mainline inclusion from mainline-v5.19-rc1 commit 2e4ec02bbcc05b8905d65c763ebde6bc85508e90 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2e4ec02bbcc05b8905d65c763ebde6bc85508e90 -------------------------------- The feature of minimizing overhead of struct page associated with each HugeTLB page is implemented on x86_64, however, the infrastructure of this feature is already there, we could easily enable it for other architectures. Introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP for other architectures to be easily enabled. Just select this config if they want to enable this feature. Link: https://lkml.kernel.org/r/20220331065640.5777-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Suggested-by: Andrew Morton Reviewed-by: Barry Song Tested-by: Barry Song Reviewed-by: Anshuman Khandual Cc: Bodeddula Balasubramaniam Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Fam Zheng Cc: James Morse Cc: Mark Rutland Cc: Mike Kravetz Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/x86/Kconfig | 1 + fs/Kconfig | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d17396ef4323..661e05e1f762 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,6 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/fs/Kconfig b/fs/Kconfig index 37a8895a3254..b60a7614cb16 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -237,9 +237,17 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. 
+# +config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + bool + config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON -- Gitee From de126e063c1e89c698fe2304d0e590f69157ac66 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:53 +0800 Subject: [PATCH 120/132] arm64: mm: hugetlb: enable HUGETLB_PAGE_FREE_VMEMMAP for arm64 mainline inclusion from mainline-v5.19-rc1 commit 1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 -------------------------------- The feature of minimizing overhead of struct page associated with each HugeTLB page aims to free its vmemmap pages (used as struct page) to save memory, where is ~14GB/16GB per 1TB HugeTLB pages (2MB/1GB type). In short, when a HugeTLB page is allocated or freed, the vmemmap array representing the range associated with the page will need to be remapped. When a page is allocated, vmemmap pages are freed after remapping. When a page is freed, previously discarded vmemmap pages must be allocated before remapping. More implementations and details can be found here [1]. The infrastructure of freeing vmemmap pages associated with each HugeTLB page is already there, we can easily enable HUGETLB_PAGE_FREE_VMEMMAP for arm64, the only thing to be fixed is flush_dcache_page() . flush_dcache_page() need to be adapted to operate on the head page's flags since the tail vmemmap pages are mapped with read-only after the feature is enabled (clear operation is not permitted). There was some discussions about this in the thread [2], but there was no conclusion in the end. And I copied the concern proposed by Anshuman to here and explain why those concern is superfluous. It is safe to enable it for x86_64 as well as arm64. 1st concern: ''' But what happens when a hot remove section's vmemmap area (which is being teared down) is nearby another vmemmap area which is either created or being destroyed for HugeTLB alloc/free purpose. As you mentioned HugeTLB pages inside the hot remove section might be safe. But what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ? Massive HugeTLB alloc /use/free test cycle using memory just adjacent to a memory hotplug area, which is always added and removed periodically, should be able to expose this problem. ''' Answer: At the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is no race between memory hot remove and free_huge_page_vmemmap(). Therefore, HugeTLB pages inside the hot remove section is safe. Let's talk your question "what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ?", the question is not established. The minimal granularity size of hotplug memory 128MB (on arm64, 4k base page), any HugeTLB smaller than 128MB is within a section, then, there is no share PTE page tables between HugeTLB in this section and ones in other sections and a HugeTLB page could not cross two sections. In this case, the section cannot be freed. Any HugeTLB bigger than 128MB (section size) whose vmemmap pages is an integer multiple of 2MB (PMD-mapped). 
As long as: 1) HugeTLBs are naturally aligned, power-of-two sizes 2) The HugeTLB size >= the section size 3) The HugeTLB size >= the vmemmap leaf mapping size Then a HugeTLB will not share any leaf page table entries with *anything else*, but will share intermediate entries. In this case, at the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is also no race between memory hot remove and free_huge_page_vmemmap(). 2nd concern: ''' differently, not sure if ptdump would require any synchronization. Dumping an wrong value is probably okay but crashing because a page table entry is being freed after ptdump acquired the pointer is bad. On arm64, ptdump() is protected against hotremove via [get|put]_online_mems(). ''' Answer: The ptdump should be fine since vmemmap_remap_free() only exchanges PTEs or splits the PMD entry (which means allocating a PTE page table). Both operations do not free any page tables (PTE), so ptdump cannot run into a UAF on any page tables. The worst case is just dumping an wrong value. [1] https://lore.kernel.org/all/20210510030027.56044-1-songmuchun@bytedance.com/ [2] https://lore.kernel.org/all/20210518091826.36937-1-songmuchun@bytedance.com/ [songmuchun@bytedance.com: restructure the code comment inside flush_dcache_page()] Link: https://lkml.kernel.org/r/20220414072646.21910-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220331065640.5777-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Barry Song Tested-by: Barry Song Cc: Will Deacon Cc: David Hildenbrand Cc: Bodeddula Balasubramaniam Cc: Oscar Salvador Cc: Mike Kravetz Cc: David Rientjes Cc: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Xiongchun Duan Cc: Fam Zheng Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/arm64/Kconfig | 1 + arch/arm64/mm/flush.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e253fdba1249..d9da5c4f91e0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,6 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 145fe60c5e68..2bb6defad92f 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_page(struct page *page) { + /* + * Only the head page's flags of HugeTLB can be cleared since the tail + * vmemmap pages associated with each HugeTLB page are mapped with + * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * details can refer to vmemmap_remap_pte()). Although + * __sync_icache_dcache() only set PG_dcache_clean flag on the head + * page struct, there is more than one page struct with PG_dcache_clean + * associated with the HugeTLB page since the head vmemmap page frame + * is reused (more details can refer to the comments above + * page_fixed_fake_head()). 
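As a concrete illustration of the size reasoning above (made-up but representative numbers, assuming 4K base pages and a 64-byte struct page): a naturally aligned 1GB HugeTLB owns 262144 struct pages, i.e. a 16MB, 16MB-aligned slice of vmemmap, which covers exactly eight 2MB PMD leaf mappings. It therefore shares no leaf page table entries with anything outside the HugeTLB, only intermediate entries, so memory hot remove cannot race with free_huge_page_vmemmap() on shared leaf mappings.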
+ */ + if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + page = compound_head(page); + if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } -- Gitee From b8dfc94bd51821c5467f622ffec51c5bfd37c342 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:54 +0800 Subject: [PATCH 121/132] mm: hugetlb_vmemmap: cleanup hugetlb_vmemmap related functions mainline inclusion from mainline-v5.19-rc1 commit 5981611d0a006472d367d7a8e6ead8afaecf17c7 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5981611d0a006472d367d7a8e6ead8afaecf17c7 -------------------------------- Patch series "cleanup hugetlb_vmemmap". The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize" is more clear. In this series, cheanup related codes to make it more clear and expressive. This is suggested by David. This patch (of 3): The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". And some function names are prefixed with "huge_page" instead of "hugetlb", it is easily to be confused with THP. In this patch, cheanup related functions to make code more clear and expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220404074652.68024-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 8 ++++---- mm/hugetlb_vmemmap.c | 42 ++++++++++++++++++++--------------------- mm/hugetlb_vmemmap.h | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fd1dc8d29436..218fc150d7f8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -593,7 +593,7 @@ struct hstate { unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - unsigned int nr_free_vmemmap_pages; + unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c815386ecc..c5168c7f282a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1448,7 +1448,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (alloc_huge_page_vmemmap(h, page)) { + if (hugetlb_vmemmap_alloc(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1519,7 +1519,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (free_vmemmap_pages_per_hpage(h)) + if (hugetlb_optimize_vmemmap_pages(h)) flush_work(&free_hpage_work); } @@ -1642,7 +1642,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - free_huge_page_vmemmap(h, page); + hugetlb_vmemmap_free(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ 
-2066,7 +2066,7 @@ int dissolve_free_huge_page(struct page *page) * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = alloc_huge_page_vmemmap(h, head); + rc = hugetlb_vmemmap_alloc(h, head); if (!rc) { /* * Move PageHWPoison flag from head page to the raw diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 791626983c2e..91b79b9d9e25 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -192,7 +192,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, hugetlb_free_vmemmap_enabled_key); EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); -static int __init early_hugetlb_free_vmemmap_param(char *buf) +static int __init hugetlb_vmemmap_early_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ if (!is_power_of_2(sizeof(struct page))) { @@ -212,29 +212,26 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return 0; } -early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); - -static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) -{ - return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; -} +early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* * Previously discarded vmemmap pages will be allocated and remapping * after this function returns zero. */ -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { int ret; unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* * The pages which the vmemmap virtual address range [@vmemmap_addr, * @vmemmap_end) are mapped to are freed to the buddy allocator, and @@ -250,17 +247,18 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) return ret; } -void free_huge_page_vmemmap(struct hstate *h, struct page *head) +void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; - if (!free_vmemmap_pages_per_hpage(h)) + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + if (!vmemmap_pages) return; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) @@ -297,8 +295,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) * hugetlbpage.rst for more details. 
*/ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, - h->name); + pr_info("can optimize %d vmemmap pages for %s\n", + h->optimize_vmemmap_pages, h->name); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index cb2bef8f9e73..9760537849b5 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -11,25 +11,25 @@ #include #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); -void free_huge_page_vmemmap(struct hstate *h, struct page *head); +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); +void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); /* - * How many vmemmap pages associated with a HugeTLB page that can be freed - * to the buddy allocator. + * How many vmemmap pages associated with a HugeTLB page that can be + * optimized and freed to the buddy allocator. */ -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { - return h->nr_free_vmemmap_pages; + return h->optimize_vmemmap_pages; } #else -static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { return 0; } -static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { } @@ -37,7 +37,7 @@ static inline void hugetlb_vmemmap_init(struct hstate *h) { } -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -- Gitee From 142ca734a4a1aab74f8c37818b634f996ef49e08 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:55 +0800 Subject: [PATCH 122/132] mm: hugetlb_vmemmap: cleanup hugetlb_free_vmemmap_enabled* mainline inclusion from mainline-v5.19-rc1 commit f10f1442c309ccef7a80ba3dc4abde0978e86fb4 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f10f1442c309ccef7a80ba3dc4abde0978e86fb4 -------------------------------- The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup the static key and hugetlb_free_vmemmap_enabled() to make code more expressive. 
Link: https://lkml.kernel.org/r/20220404074652.68024-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Conflicts: mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- arch/arm64/mm/flush.c | 2 +- include/linux/page-flags.h | 12 ++++++------ mm/hugetlb_vmemmap.c | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 2bb6defad92f..892e53e9c788 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -80,7 +80,7 @@ void flush_dcache_page(struct page *page) * is reused (more details can refer to the comments above * page_fixed_fake_head()). */ - if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) page = compound_head(page); if (test_bit(PG_dcache_clean, &page->flags)) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 81061122af68..d1cf17a71a12 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -196,16 +196,16 @@ enum pageflags { #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); -static __always_inline bool hugetlb_free_vmemmap_enabled(void) +static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - &hugetlb_free_vmemmap_enabled_key); + &hugetlb_optimize_vmemmap_key); } /* - * If the feature of freeing some vmemmap pages associated with each HugeTLB + * If the feature of optimizing vmemmap pages associated with each HugeTLB * page is enabled, the head vmemmap page frame is reused and all of the tail * vmemmap addresses map to the head vmemmap page frame (furture details can * refer to the figure at the head of the mm/hugetlb_vmemmap.c). 
In other @@ -222,7 +222,7 @@ static __always_inline bool hugetlb_free_vmemmap_enabled(void) */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return page; /* @@ -251,7 +251,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) return page; } -static inline bool hugetlb_free_vmemmap_enabled(void) +static inline bool hugetlb_optimize_vmemmap_enabled(void) { return false; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 91b79b9d9e25..f25294973398 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -189,8 +189,8 @@ #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { @@ -204,9 +204,9 @@ static int __init hugetlb_vmemmap_early_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - static_branch_enable(&hugetlb_free_vmemmap_enabled_key); + static_branch_enable(&hugetlb_optimize_vmemmap_key); else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_free_vmemmap_enabled_key); + static_branch_disable(&hugetlb_optimize_vmemmap_key); else return -EINVAL; @@ -282,7 +282,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; -- Gitee From f1f5face0c06f0526c21122eb601f4e5482d612b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:56 +0800 Subject: [PATCH 123/132] mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP* mainline inclusion from mainline-v5.19-rc1 commit 47010c040dec8af6347ec6259104fc13f7e7e30a category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47010c040dec8af6347ec6259104fc13f7e7e30a -------------------------------- The word "free" is not expressive enough to describe the feature of optimizing vmemmap pages associated with each HugeTLB, so rename this keyword to "optimize". In this patch, clean up the configs to make the code more expressive.
Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: David Hildenbrand Signed-off-by: Andrew Morton Conflicts: arch/arm64/configs/openeuler_defconfig arch/x86/configs/openeuler_defconfig Documentation/admin-guide/kernel-parameters.txt include/linux/hugetlb.h mm/Makefile Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/kernel-parameters.txt | 5 ++++- Documentation/admin-guide/mm/hugetlbpage.rst | 2 +- arch/arm64/Kconfig | 2 +- arch/arm64/configs/openeuler_defconfig | 4 ++-- arch/arm64/mm/flush.c | 2 +- arch/x86/Kconfig | 2 +- arch/x86/configs/openeuler_defconfig | 4 ++-- arch/x86/mm/init_64.c | 2 +- fs/Kconfig | 16 ++++++++-------- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 2 +- include/linux/page-flags.h | 6 +++--- mm/Makefile | 2 +- mm/hugetlb_vmemmap.c | 4 ++-- mm/hugetlb_vmemmap.h | 4 ++-- mm/sparse-vmemmap.c | 4 ++-- 16 files changed, 33 insertions(+), 30 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e0957b73f63d..86a7ea4a9964 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1593,7 +1593,7 @@ Format: size[KMG] hugetlb_free_vmemmap= - [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + [KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). @@ -1602,6 +1602,9 @@ on: enable the feature off: disable the feature + Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, + the default is on. + hugevmalloc [KNL,PPC,ARM64,X86] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC Format: { on | off } Default set by CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED. diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index d70828c07658..0f8acc4a6cf0 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -164,7 +164,7 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. hugetlb_free_vmemmap - When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing + When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing unused vmemmap pages associated with each HugeTLB page. 
When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d9da5c4f91e0..c4f6c80ea976 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,7 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d246fd508ef6..593ac67497e3 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6281,8 +6281,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 892e53e9c788..c7678e7df53a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -72,7 +72,7 @@ void flush_dcache_page(struct page *page) /* * Only the head page's flags of HugeTLB can be cleared since the tail * vmemmap pages associated with each HugeTLB page are mapped with - * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more * details can refer to vmemmap_remap_pte()). 
Although * __sync_icache_dcache() only set PG_dcache_clean flag on the head * page struct, there is more than one page struct with PG_dcache_clean diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 661e05e1f762..f39cfb5a6535 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,7 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 3eac70518e6f..f013c7b95881 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7370,8 +7370,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_DYNAMIC_HUGETLB=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1eed5eaee41f..b1d5c05aeca8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1225,7 +1225,7 @@ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { -#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i; for_each_online_node(i) diff --git a/fs/Kconfig b/fs/Kconfig index b60a7614cb16..aa097ca64ef6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -242,22 +242,22 @@ config HUGETLB_PAGE # to enable the feature of minimizing overhead of struct page associated with # each HugeTLB page. # -config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP bool -config HUGETLB_PAGE_FREE_VMEMMAP +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. 
config DYNAMIC_HUGETLB diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 218fc150d7f8..0dfe08439095 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -592,7 +592,7 @@ struct hstate { unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB diff --git a/include/linux/mm.h b/include/linux/mm.h index efa3972a23bf..1ae73cc4b806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,7 +3104,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d1cf17a71a12..26b36cac9307 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,13 +194,13 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { - return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, &hugetlb_optimize_vmemmap_key); } diff --git a/mm/Makefile b/mm/Makefile index d2a6a786f915..e83233177c7a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o -obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_DYNAMIC_HUGETLB) += dynamic_hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f25294973398..2655434a946b 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,7 +188,7 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); @@ -276,7 +276,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) /* * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. 
*/ BUILD_BUG_ON(__NR_USED_SUBPAGE >= diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 9760537849b5..109b0a53b6fe 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,7 +10,7 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); @@ -41,5 +41,5 @@ static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index e5ed2680ec57..5b40a7473dc8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,7 +34,7 @@ #include #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -420,7 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map -- Gitee From 301b00fff3e70b91a9bc47af2cabce76f48cdb75 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Thu, 28 Jul 2022 18:06:57 +0800 Subject: [PATCH 124/132] sysctl: add a new register_sysctl_init() interface mainline inclusion from mainline-v5.17-rc1 commit 3ddd9a808cee7284931312f2f3e854c9617f44b2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ddd9a808cee7284931312f2f3e854c9617f44b2 -------------------------------- Patch series "sysctl: first set of kernel/sysctl cleanups", v2. Finally had time to respin the series of the work we had started last year on cleaning up the kernel/sysct.c kitchen sink. People keeps stuffing their sysctls in that file and this creates a maintenance burden. So this effort is aimed at placing sysctls where they actually belong. I'm going to split patches up into series as there is quite a bit of work. This first set adds register_sysctl_init() for uses of registerting a sysctl on the init path, adds const where missing to a few places, generalizes common values so to be more easy to share, and starts the move of a few kernel/sysctl.c out where they belong. The majority of rework on v2 in this first patch set is 0-day fixes. Eric Biederman's feedback is later addressed in subsequent patch sets. I'll only post the first two patch sets for now. We can address the rest once the first two patch sets get completely reviewed / Acked. This patch (of 9): The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Today though folks heavily rely on tables on kernel/sysctl.c so they can easily just extend this table with their needed sysctls. In order to help users move their sysctls out we need to provide a helper which can be used during code initialization. 
We special-case the initialization use of register_sysctl() since it *is* safe to fail, given all that sysctls do is provide a dynamic interface to query or modify at runtime an existing variable. So the use case of register_sysctl() on init should *not* stop if the sysctls don't end up getting registered. It would be counter productive to stop boot if a simple sysctl registration failed. Provide a helper for init then, and document the recommended init levels to use for callers of this routine. We will later use this in subsequent patches to start slimming down kernel/sysctl.c tables and moving sysctl registration to the code which actually needs these sysctls. [mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Iurii Zaikin Cc: "Eric W. Biederman" Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: Andy Shevchenko Cc: Sebastian Reichel Cc: Tetsuo Handa Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Qing Wang Cc: Benjamin LaHaise Cc: Al Viro Cc: Jan Kara Cc: Amir Goldstein Cc: Stephen Kitt Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- fs/proc/proc_sysctl.c | 33 +++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 3 +++ 2 files changed, 36 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ffed75f833b7..df435cd91a5b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1380,6 +1381,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl); +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init(). 
+ */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 51298a4f4623..161eba9fd912 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -195,6 +195,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) void do_sysctl_args(void); extern int pwrsw_enabled; -- Gitee From 9e38aa2f71a1de04192e4c55f788f9c36f27d0a0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:58 +0800 Subject: [PATCH 125/132] mm: hugetlb_vmemmap: disable hugetlb_optimize_vmemmap when struct page crosses page boundaries mainline inclusion from mainline-v5.19-rc1 commit 0effdf461c5789be02d40c1868c70cc02ea24627 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0effdf461c5789be02d40c1868c70cc02ea24627 -------------------------------- Patch series "add hugetlb_optimize_vmemmap sysctl", v11. This series aims to add hugetlb_optimize_vmemmap sysctl to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. This patch (of 4): If the size of "struct page" is not a power of two but the feature of minimizing overhead of struct page associated with each HugeTLB is enabled, then the vmemmap pages of HugeTLB will be corrupted after remapping (panic is about to happen in theory). But this only exists when !CONFIG_MEMCG && !CONFIG_SLUB on x86_64. However, it is not a conventional configuration nowadays. So it is not a real-world issue, just the result of a code review. But we cannot prevent anyone from configuring that combination. So hugetlb_optimize_vmemmap should be disabled in this case to fix this issue. Link: https://lkml.kernel.org/r/20220512041142.39501-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220512041142.39501-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/hugetlb_vmemmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 2655434a946b..4b8e59eeb971 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,12 +194,6 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - /* We cannot optimize if a "struct page" crosses page boundaries.
*/ - if (!is_power_of_2(sizeof(struct page))) { - pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); - return 0; - } - if (!buf) return -EINVAL; @@ -285,6 +279,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) if (!hugetlb_optimize_vmemmap_enabled()) return; + if (!is_power_of_2(sizeof(struct page))) { + pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* * The head page is not to be freed to buddy allocator, the other tail -- Gitee From 6233471f1e8239cde7f7e49a4d9545b17cab51ac Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:06:59 +0800 Subject: [PATCH 126/132] mm: hugetlb_vmemmap: use kstrtobool for hugetlb_vmemmap param parsing mainline inclusion from mainline-v5.19-rc1 commit 9c54c522bb76cbef480722bd44059e2ba8304bd2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9c54c522bb76cbef480722bd44059e2ba8304bd2 -------------------------------- Use kstrtobool rather than open coding "on" and "off" parsing in mm/hugetlb_vmemmap.c, which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220512041142.39501-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Iurii Zaikin Cc: Jonathan Corbet Cc: Kees Cook Cc: Luis Chamberlain Cc: Masahiro Yamada Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/kernel-parameters.txt | 6 +++--- mm/hugetlb_vmemmap.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 86a7ea4a9964..247acf7fc837 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1597,10 +1597,10 @@ enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). - Format: { on | off (default) } + Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) } - on: enable the feature - off: disable the feature + [oO][Nn]/Y/y/1: enable the feature + [oO][Ff]/N/n/0: disable the feature Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4b8e59eeb971..112e74504905 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,15 +194,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static int __init hugetlb_vmemmap_early_param(char *buf) { - if (!buf) + bool enable; + + if (kstrtobool(buf, &enable)) return -EINVAL; - if (!strcmp(buf, "on")) + if (enable) static_branch_enable(&hugetlb_optimize_vmemmap_key); - else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_optimize_vmemmap_key); else - return -EINVAL; + static_branch_disable(&hugetlb_optimize_vmemmap_key); return 0; } -- Gitee From 6b2239c22b56bddcdefd09fd58b6672f2b1f1566 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:07:00 +0800 Subject: [PATCH 127/132] mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap sysctl mainline inclusion from mainline-v5.19-rc1 commit 78f39084b41d287aedb2ea55f2c1895cfa11d61a category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=78f39084b41d287aedb2ea55f2c1895cfa11d61a -------------------------------- We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and reboot the server to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. However, rebooting usually takes a long time. So add a sysctl to enable or disable the feature at runtime without rebooting. Why do we need this? There are 3 use cases. 1) The feature of minimizing overhead of struct page associated with each HugeTLB is disabled by default without passing "hugetlb_free_vmemmap=on" to the boot cmdline. When we (ByteDance) deliver the servers to the users who want to enable this feature, they have to configure the grub (change boot cmdline) and reboot the servers, whereas rebooting usually takes a long time (we have thousands of servers). It's a very bad experience for the users. So we need an approach to enable this feature after rebooting. This is a use case in our practical environment. 2) Some use cases are that HugeTLB pages are allocated 'on the fly' instead of being pulled from the HugeTLB pool, those workloads would be affected with this feature enabled. Those workloads can be identified by the characteristic that they never explicitly allocate huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. We can confirm it is a real use case from the commit 099730d67417. For those workloads, the page fault time could be ~2x slower than before. We suspect those users want to disable this feature if the system has enabled this before and they don't think the memory savings benefit is enough to make up for the performance drop. 3) If the workload which wants vmemmap pages to be optimized and the workload which wants to set 'nr_overcommit_hugepages' and does not want the extra overhead at fault time when the overcommitted pages are allocated from the buddy allocator are deployed on the same server. The user could enable this feature and set 'nr_hugepages' and 'nr_overcommit_hugepages', then disable the feature. In this case, the overcommitted HugeTLB pages will not encounter the extra overhead at fault time.
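As a rough usage illustration (not part of the patch): once the handler added below registers the knob under "vm", the feature can be toggled at runtime from userspace instead of via the boot cmdline. The sketch assumes the file is exposed as /proc/sys/vm/hugetlb_optimize_vmemmap and that the caller has CAP_SYS_ADMIN; error handling is deliberately minimal.

/* Illustrative userspace helper: "demo_toggle on" / "demo_toggle off". */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define KNOB "/proc/sys/vm/hugetlb_optimize_vmemmap"

int main(int argc, char **argv)
{
	const char *val = (argc > 1 && strcmp(argv[1], "off") == 0) ? "0" : "1";
	int fd = open(KNOB, O_WRONLY);

	if (fd < 0) {
		perror(KNOB);	/* absent if the kernel disabled the knob at boot */
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	printf("hugetlb_optimize_vmemmap set to %s\n", val);
	return 0;
}

Note that, per the documentation added later in this patch, writing 0 only affects HugeTLB pages allocated afterwards; pages whose vmemmap was already optimized stay that way until they are returned to the buddy allocator.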
Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Jonathan Corbet Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Oscar Salvador Cc: David Hildenbrand Cc: Masahiro Yamada Cc: Xiongchun Duan Signed-off-by: Andrew Morton Conflicts: include/linux/memory_hotplug.h mm/hugetlb_vmemmap.c mm/memory_hotplug.c Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- Documentation/admin-guide/sysctl/vm.rst | 39 +++++++++++ mm/hugetlb_vmemmap.c | 92 ++++++++++++++++++++++--- 2 files changed, 122 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index a5fbef4740c2..5de629b932ae 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -560,6 +560,45 @@ Change the minimum size of the hugepage pool. See Documentation/admin-guide/mm/hugetlbpage.rst +hugetlb_optimize_vmemmap +======================== + +This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) +is configured or the size of 'struct page' (a structure defined in +include/linux/mm_types.h) is not power of two (an unusual system config could +result in this). + +Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages +associated with each HugeTLB page. + +Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages +per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be +optimized. When those optimized HugeTLB pages are freed from the HugeTLB pool +to the buddy allocator, the vmemmap pages representing that range needs to be +remapped again and the vmemmap pages discarded earlier need to be rellocated +again. If your use case is that HugeTLB pages are allocated 'on the fly' (e.g. +never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set +'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on +the fly') instead of being pulled from the HugeTLB pool, you should weigh the +benefits of memory savings against the more overhead (~2x slower than before) +of allocation or freeing HugeTLB pages between the HugeTLB pool and the buddy +allocator. Another behavior to note is that if the system is under heavy memory +pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB +pool to the buddy allocator since the allocation of vmemmap pages could be +failed, you have to retry later if your system encounter this situation. + +Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will not be optimized meaning the extra overhead at allocation +time from buddy allocator disappears, whereas already optimized HugeTLB pages +will not be affected. If you want to make sure there are no optimized HugeTLB +pages, you can set "nr_hugepages" to 0 first and then disable this. Note that +writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus +pages. So, those surplus pages are still optimized until they are no longer +in use. You would need to wait for those surplus pages to be released before +there are no optimized pages in the system. 
+ + nr_hugepages_mempolicy ====================== diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 112e74504905..4340bf6c5551 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,21 +188,40 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) +enum vmemmap_optimize_mode { + VMEMMAP_OPTIMIZE_OFF, + VMEMMAP_OPTIMIZE_ON, +}; + DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +static enum vmemmap_optimize_mode vmemmap_optimize_mode = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + +static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +{ + if (vmemmap_optimize_mode == to) + return; + + if (to == VMEMMAP_OPTIMIZE_OFF) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else + static_branch_inc(&hugetlb_optimize_vmemmap_key); + WRITE_ONCE(vmemmap_optimize_mode, to); +} + static int __init hugetlb_vmemmap_early_param(char *buf) { bool enable; + enum vmemmap_optimize_mode mode; if (kstrtobool(buf, &enable)) return -EINVAL; - if (enable) - static_branch_enable(&hugetlb_optimize_vmemmap_key); - else - static_branch_disable(&hugetlb_optimize_vmemmap_key); + mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; + vmemmap_optimize_mode_switch(mode); return 0; } @@ -235,8 +254,10 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) + if (!ret) { ClearHPageVmemmapOptimized(head); + static_branch_dec(&hugetlb_optimize_vmemmap_key); + } return ret; } @@ -250,6 +271,11 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) if (!vmemmap_pages) return; + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return; + + static_branch_inc(&hugetlb_optimize_vmemmap_key); + vmemmap_addr += RESERVE_VMEMMAP_SIZE; vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); vmemmap_reuse = vmemmap_addr - PAGE_SIZE; @@ -259,7 +285,9 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) * to the page which @vmemmap_reuse is mapped to, then free the pages * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. 
*/ - if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else SetHPageVmemmapOptimized(head); } @@ -276,9 +304,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_optimize_vmemmap_enabled()) - return; - if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -300,3 +325,52 @@ void __init hugetlb_vmemmap_init(struct hstate *h) pr_info("can optimize %d vmemmap pages for %s\n", h->optimize_vmemmap_pages, h->name); } + +#ifdef CONFIG_PROC_SYSCTL +static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + enum vmemmap_optimize_mode mode; + static DEFINE_MUTEX(sysctl_mutex); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sysctl_mutex); + mode = vmemmap_optimize_mode; + table->data = &mode; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (write && !ret) + vmemmap_optimize_mode_switch(mode); + mutex_unlock(&sysctl_mutex); + + return ret; +} + +static struct ctl_table hugetlb_vmemmap_sysctls[] = { + { + .procname = "hugetlb_optimize_vmemmap", + .maxlen = sizeof(enum vmemmap_optimize_mode), + .mode = 0644, + .proc_handler = hugetlb_optimize_vmemmap_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int hugetlb_vmemmap_sysctls_init(void) +{ + /* + * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" + * crosses page boundaries, the vmemmap pages cannot be optimized. + */ + if (is_power_of_2(sizeof(struct page))) + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + + return 0; +} +late_initcall(hugetlb_vmemmap_sysctls_init); +#endif /* CONFIG_PROC_SYSCTL */ -- Gitee From 1d2bbc77596574b078cd2b8dec8725d8c38617ea Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Jul 2022 18:07:01 +0800 Subject: [PATCH 128/132] mm: hugetlb_vmemmap: fix CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON mainline inclusion from mainline-v5.19-rc1 commit 0111def915b280c64c05f73f01b59ca404255aa3 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0111def915b280c64c05f73f01b59ca404255aa3 -------------------------------- The following: commit 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") forgot to update CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON used in vmemmap_optimize_mode to CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. The result is we cannot enable hugetlb_optimize_vmemmap at boot time when we configure CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. Fix it. 
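The reason the stale name compiled cleanly is that IS_ENABLED() of an undefined config symbol silently evaluates to 0 rather than causing a build error, so vmemmap_optimize_mode quietly defaulted to "off". A one-line illustration, using a deliberately nonexistent, hypothetical symbol:

#include <linux/kconfig.h>

/* CONFIG_NO_SUCH_OPTION is intentionally undefined: IS_ENABLED() yields 0, not an error. */
static const int demo_default_on = IS_ENABLED(CONFIG_NO_SUCH_OPTION);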
Link: https://lkml.kernel.org/r/20220527081948.68832-1-songmuchun@bytedance.com Fixes: 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") Signed-off-by: Muchun Song Reported-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4340bf6c5551..e9f63cb9e3d4 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -198,7 +198,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static enum vmemmap_optimize_mode vmemmap_optimize_mode = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) { -- Gitee From f1cfb6175ca81adf5ad5448ea4ad8cd205acf96b Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 28 Jul 2022 18:07:02 +0800 Subject: [PATCH 129/132] mm: hugetlb_vmemmap: disable hugetlb_vmemmap when dynamic hugetlb is enabled hulk inclusion category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA -------------------------------- Disable hugetlb_vmemmap when dynamic hugetlb is enabled. By the way, fix a similar spelling error. Signed-off-by: Liu Shixin Reviewed-by: Kefeng Wang Signed-off-by: Zheng Zengkai --- mm/dynamic_hugetlb.c | 4 ++-- mm/huge_memory.c | 2 +- mm/hugetlb_vmemmap.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index f8ebc8ab7d60..c1b968a7e668 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1150,6 +1150,6 @@ static int __init dynamic_hugetlb_setup(char *s) { if (!strcmp(s, "on")) enable_dhugetlb = true; - return 1; + return 0; } -__setup("dynamic_hugetlb=", dynamic_hugetlb_setup); +early_param("dynamic_hugetlb", dynamic_hugetlb_setup); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bfe079e294cb..79c855b5adad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -401,7 +401,7 @@ static int __init hugepage_init(void) */ if (enable_dhugetlb) { transparent_hugepage_flags = 0; - pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n"); + pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n"); return -EINVAL; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index e9f63cb9e3d4..7ec8560d267d 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -176,6 +176,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt +#include #include "hugetlb_vmemmap.h" /* @@ -304,6 +305,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); + if (enable_dhugetlb) { + pr_warn_once("cannot optimize vmemmap pages due to conflict with dynamic hugetlb\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -366,8 +373,10 @@ static __init int hugetlb_vmemmap_sysctls_init(void) /* * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" * crosses page boundaries, the vmemmap pages cannot be optimized. 
+ * If "dynamic hugetlb" is enabled, the vmemmap pages cannot be + * optimized. */ - if (is_power_of_2(sizeof(struct page))) + if (is_power_of_2(sizeof(struct page)) && !enable_dhugetlb) register_sysctl_init("vm", hugetlb_vmemmap_sysctls); return 0; -- Gitee From 3513c1a7443124efaa0bdf2464369288ea139efd Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:28 +0800 Subject: [PATCH 130/132] watchdog: wdat_wdt: Using the existing function to check parameter timeout mainline inclusion from mainline-v5.19-rc1 commit 6d72c7ac9fbe26a77800676507da980436b40b2f category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596BF Intel-SIG: commit 6d72c7ac9fbe watchdog: wdat_wdt: Using the existing function to check parameter timeout ------------------------------------- If max_hw_heartbeat_ms is provided, the configured maximum timeout is not limited by it. The limit check in this driver therefore doesn't make much sense. Similar, the watchdog core ensures that minimum timeout limits are met if min_hw_heartbeat_ms is set. Using watchdog_timeout_invalid() makes more sense because it takes this into account. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-2-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Yunying Sun --- drivers/watchdog/wdat_wdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 3065dd670a18..d1f4dc191fbb 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -344,6 +344,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) wdat->period = tbl->timer_period; wdat->wdd.min_hw_heartbeat_ms = wdat->period * tbl->min_count; wdat->wdd.max_hw_heartbeat_ms = wdat->period * tbl->max_count; + wdat->wdd.min_timeout = 1; wdat->stopped_in_sleep = tbl->flags & ACPI_WDAT_STOPPED; wdat->wdd.info = &wdat_wdt_info; wdat->wdd.ops = &wdat_wdt_ops; @@ -450,8 +451,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) * watchdog properly after it has opened the device. In some cases * the BIOS default is too short and causes immediate reboot. */ - if (timeout * 1000 < wdat->wdd.min_hw_heartbeat_ms || - timeout * 1000 > wdat->wdd.max_hw_heartbeat_ms) { + if (watchdog_timeout_invalid(&wdat->wdd, timeout)) { dev_warn(dev, "Invalid timeout %d given, using %d\n", timeout, WDAT_DEFAULT_TIMEOUT); timeout = WDAT_DEFAULT_TIMEOUT; -- Gitee From 3c71105b0bc6268f9cc72187dc04ac7519d2d2e1 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:29 +0800 Subject: [PATCH 131/132] watchdog: wdat_wdt: Stop watchdog when rebooting the system mainline inclusion from mainline-v5.19-rc1 commit 27fdf84510a1374748904db43f6755f912736d92 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596BF Intel-SIG: commit 27fdf84510a1 watchdog: wdat_wdt: Stop watchdog when rebooting the system ------------------------------------- Executing reboot command several times on the machine "Dell PowerEdge R740", UEFI security detection stopped machine with the following prompt: UEFI0082: The system was reset due to a timeout from the watchdog timer. Check the System Event Log (SEL) or crash dumps from Operating Sysstem to identify the source that triggered the watchdog timer reset. Update the firmware or driver for the identified device. iDRAC has warning event: "The watchdog timer reset the system". 
This patch fixes this issue by adding the reboot notifier. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-3-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Yunying Sun --- drivers/watchdog/wdat_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index d1f4dc191fbb..1f0231699c17 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -462,6 +462,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) return ret; watchdog_set_nowayout(&wdat->wdd, nowayout); + watchdog_stop_on_reboot(&wdat->wdd); return devm_watchdog_register_device(dev, &wdat->wdd); } -- Gitee From 190fb72a92b47a89bb2066a414a0cecfab9d5348 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:30 +0800 Subject: [PATCH 132/132] watchdog: wdat_wdt: Stop watchdog when uninstalling module mainline inclusion from mainline-v5.19-rc1 commit 330415ebea81b65842e4cc6d2fd985c1b369e650 category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/I596BF Intel-SIG: commit 330415ebea81 watchdog: wdat_wdt: Stop watchdog when uninstalling module ------------------------------------- Tests show that the watchdog still reboots the machine after the module is removed. Use watchdog_stop_on_unregister to stop the watchdog on removal. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-4-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Yunying Sun --- drivers/watchdog/wdat_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 1f0231699c17..1bd4c8d89ed4 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -463,6 +463,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) watchdog_set_nowayout(&wdat->wdd, nowayout); watchdog_stop_on_reboot(&wdat->wdd); + watchdog_stop_on_unregister(&wdat->wdd); return devm_watchdog_register_device(dev, &wdat->wdd); } -- Gitee
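Taken together, the three wdat_wdt fixes above lean on watchdog-core helpers instead of driver-local logic: watchdog_timeout_invalid() validates the requested timeout against the limits the core already knows about, while watchdog_stop_on_reboot() and watchdog_stop_on_unregister() make the core quiesce the hardware on reboot and on module removal. The sketch below is a hedged, generic illustration of where those helpers sit in a driver probe path; the "demo_wdt" names, ops and timeout values are invented for the example and are not taken from wdat_wdt.

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/watchdog.h>

#define DEMO_DEFAULT_TIMEOUT	30	/* seconds, arbitrary for the example */

static int demo_wdt_start(struct watchdog_device *wdd)
{
	return 0;	/* hardware poke elided */
}

static int demo_wdt_stop(struct watchdog_device *wdd)
{
	return 0;	/* hardware poke elided */
}

static const struct watchdog_info demo_wdt_info = {
	.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE,
	.identity = "demo watchdog",
};

static const struct watchdog_ops demo_wdt_ops = {
	.owner = THIS_MODULE,
	.start = demo_wdt_start,
	.stop  = demo_wdt_stop,
};

static int demo_wdt_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	struct watchdog_device *wdd;
	unsigned int timeout = DEMO_DEFAULT_TIMEOUT;

	wdd = devm_kzalloc(dev, sizeof(*wdd), GFP_KERNEL);
	if (!wdd)
		return -ENOMEM;

	wdd->info = &demo_wdt_info;
	wdd->ops = &demo_wdt_ops;
	wdd->min_timeout = 1;
	wdd->max_hw_heartbeat_ms = 60 * 1000;

	/* As in patch 130: let the watchdog core judge whether the timeout is valid. */
	if (watchdog_timeout_invalid(wdd, timeout))
		timeout = DEMO_DEFAULT_TIMEOUT;
	wdd->timeout = timeout;

	/* As in patches 131 and 132: stop the hardware on reboot and on unregister. */
	watchdog_stop_on_reboot(wdd);
	watchdog_stop_on_unregister(wdd);

	return devm_watchdog_register_device(dev, wdd);
}

static struct platform_driver demo_wdt_driver = {
	.probe = demo_wdt_probe,
	.driver = {
		.name = "demo_wdt",
	},
};
module_platform_driver(demo_wdt_driver);

MODULE_LICENSE("GPL");

The design point of the series is that limits such as max_hw_heartbeat_ms are owned by the watchdog core, so the driver no longer duplicates (or contradicts) the core's timeout and shutdown policy.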