diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b04cf8fbab4989c9cedbfa0585436e684fb4c78..247acf7fc837f0a3c752782070c4421eac7336ac 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1593,14 +1593,17 @@ Format: size[KMG] hugetlb_free_vmemmap= - [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + [KNL] Requires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more - memory (6 * PAGE_SIZE for each 2MB hugetlb page). - Format: { on | off (default) } + memory (7 * PAGE_SIZE for each 2MB hugetlb page). + Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) } - on: enable the feature - off: disable the feature + [oO][Nn]/Y/y/1: enable the feature + [oO][Ff]/N/n/0: disable the feature + + Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, + the default is on. hugevmalloc [KNL,PPC,ARM64,X86] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC Format: { on | off } diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index d70828c0765802d3cd411414c30f4f7fd659db44..0f8acc4a6cf0cc1c562520a5aac651ecb7a0a8ef 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -164,7 +164,7 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. hugetlb_free_vmemmap - When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing + When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing unused vmemmap pages associated with each HugeTLB page. 
When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index a5fbef4740c2b24e5f18b42a84aa6206e965790f..5de629b932ae3422bf1c3d3295c8af93d377a120 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -560,6 +560,45 @@ Change the minimum size of the hugepage pool. See Documentation/admin-guide/mm/hugetlbpage.rst +hugetlb_optimize_vmemmap +======================== + +This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) +is configured or the size of 'struct page' (a structure defined in +include/linux/mm_types.h) is not a power of two (an unusual system config could +result in this). + +Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages +associated with each HugeTLB page. + +Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages +per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be +optimized. When those optimized HugeTLB pages are freed from the HugeTLB pool +to the buddy allocator, the vmemmap pages representing that range need to be +remapped again and the vmemmap pages discarded earlier need to be reallocated +again. If your use case is that HugeTLB pages are allocated 'on the fly' (e.g. +never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set +'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on +the fly') instead of being pulled from the HugeTLB pool, you should weigh the +benefits of memory savings against the additional overhead (~2x slower than before) +of allocating or freeing HugeTLB pages between the HugeTLB pool and the buddy +allocator. 
Another behavior to note is that if the system is under heavy memory +pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB +pool to the buddy allocator since the allocation of vmemmap pages could +fail; you have to retry later if your system encounters this situation. + +Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will not be optimized meaning the extra overhead at allocation +time from buddy allocator disappears, whereas already optimized HugeTLB pages +will not be affected. If you want to make sure there are no optimized HugeTLB +pages, you can set "nr_hugepages" to 0 first and then disable this. Note that +writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus +pages. So, those surplus pages are still optimized until they are no longer +in use. You would need to wait for those surplus pages to be released before +there are no optimized pages in the system. + + nr_hugepages_mempolicy ====================== diff --git a/Documentation/filesystems/ext4/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 54386a010a8d7003a36d8792330715316f7f3129..871d2da7a0a91e73f0f791f5627a57855741f20a 100644 --- a/Documentation/filesystems/ext4/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst @@ -76,7 +76,7 @@ The beginning of an extended attribute block is in - Checksum of the extended attribute block. * - 0x14 - \_\_u32 - - h\_reserved[2] + - h\_reserved[3] - Zero. 
The checksum is calculated against the FS UUID, the 64-bit block number diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index ae656bfc31c3d38adc31bfa92b3630bcb6b02bff..301ade4d0b94391757f848484977af56175a8813 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -199,6 +199,7 @@ tracesys_exit: st r0, [sp, PT_r0] ; sys call return value in pt_regs ;POST Sys Call Ptrace Hook + mov r0, sp ; pt_regs needed bl @syscall_trace_exit b ret_from_exception ; NOT ret_from_system_call at is saves r0 which ; we'd done before calling post hook above diff --git a/arch/arm/mach-vexpress/spc.c b/arch/arm/mach-vexpress/spc.c index 1da11bdb1dfbd6f1ce906b83ba2d84ecd396f316..1c6500c4e6a1768c0bde6fa36b892dab39ac2498 100644 --- a/arch/arm/mach-vexpress/spc.c +++ b/arch/arm/mach-vexpress/spc.c @@ -580,7 +580,7 @@ static int __init ve_spc_clk_init(void) } cluster = topology_physical_package_id(cpu_dev->id); - if (init_opp_table[cluster]) + if (cluster < 0 || init_opp_table[cluster]) continue; if (ve_init_opp_table(cpu_dev)) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e253fdba1249cc196ecb2a1b0413fced0f86762a..c4f6c80ea97694b6241996711d0449deb0b42388 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,6 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi index 49082529764f0670e1be3ef1d4e3517cc5d57dbc..0fac1f3f7f47807431cdf39b8586c0e5fad77962 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-var-som.dtsi @@ -89,12 +89,12 @@ touchscreen@0 { pendown-gpio = <&gpio1 3 
GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <125>; - touchscreen-size-x = /bits/ 16 <4008>; + touchscreen-size-x = <4008>; ti,y-min = /bits/ 16 <282>; - touchscreen-size-y = /bits/ 16 <3864>; + touchscreen-size-y = <3864>; ti,x-plate-ohms = /bits/ 16 <180>; - touchscreen-max-pressure = /bits/ 16 <255>; - touchscreen-average-samples = /bits/ 16 <10>; + touchscreen-max-pressure = <255>; + touchscreen-average-samples = <10>; ti,debounce-tol = /bits/ 16 <3>; ti,debounce-rep = /bits/ 16 <1>; ti,settle-delay-usec = /bits/ 16 <150>; diff --git a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi index 7f356edf9f916254584ef49f3af0446aae458399..f6287f174355cfd16bcc9a3434ae23ec5f624c21 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi @@ -70,12 +70,12 @@ touchscreen@0 { pendown-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>; ti,x-min = /bits/ 16 <125>; - touchscreen-size-x = /bits/ 16 <4008>; + touchscreen-size-x = <4008>; ti,y-min = /bits/ 16 <282>; - touchscreen-size-y = /bits/ 16 <3864>; + touchscreen-size-y = <3864>; ti,x-plate-ohms = /bits/ 16 <180>; - touchscreen-max-pressure = /bits/ 16 <255>; - touchscreen-average-samples = /bits/ 16 <10>; + touchscreen-max-pressure = <255>; + touchscreen-average-samples = <10>; ti,debounce-tol = /bits/ 16 <3>; ti,debounce-rep = /bits/ 16 <1>; ti,settle-delay-usec = /bits/ 16 <150>; diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d246fd508ef65104a038ce72758f562989965a44..593ac67497e36d0291dec4e863eea81cf405492b 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6281,8 +6281,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# 
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index daed33697d986dd5ef79e67b367e7f6e24fd65d5..ab2443900f4eaee19a9b216ab09ecdf09f58d1ac 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -516,13 +516,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) - #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) -#define pmd_leaf(pmd) pmd_sect(pmd) +#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) +#define pmd_bad(pmd) (!pmd_table(pmd)) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 static inline bool pud_sect(pud_t pud) { return false; } @@ -606,9 +605,9 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) +#define pud_bad(pud) (!pud_table(pud)) #define pud_present(pud) pte_present(pud_pte(pud)) -#define pud_leaf(pud) pud_sect(pud) +#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #define pud_valid(pud) pte_valid(pud_pte(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 145fe60c5e68467adffbe3983bbff7a31c81e6ef..c7678e7df53aa6d739575b0ade8add9794a34851 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_page(struct page *page) { + /* + * Only the head page's flags of HugeTLB can be cleared since the tail + * vmemmap pages associated with each HugeTLB page are mapped with + * read-only when 
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more + * details can refer to vmemmap_remap_pte()). Although + * __sync_icache_dcache() only set PG_dcache_clean flag on the head + * page struct, there is more than one page struct with PG_dcache_clean + * associated with the HugeTLB page since the head vmemmap page frame + * is reused (more details can refer to the comments above + * page_fixed_fake_head()). + */ + if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) + page = compound_head(page); + if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e90edbee279ffcb1b5760c972ac35..c640053ab03f203149c3c272d43f777d5b2cb37f 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -421,13 +421,19 @@ static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, tbl[idx % TCES_PER_PAGE] = tce; } -static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); + + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, @@ -486,6 +492,8 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return ret; } @@ -545,6 +553,8 @@ static long kvmppc_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill(tbl, io_entry, subpages); + return 
ret; } @@ -591,10 +601,9 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill(stit->tbl, entry, 1); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry); goto unlock_exit; } } @@ -670,13 +679,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ if (get_user(tce, tces + i)) { ret = H_TOO_HARD; - goto invalidate_exit; + goto unlock_exit; } tce = be64_to_cpu(tce); if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -685,19 +694,15 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, entry, npages); - unlock_exit: srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -736,20 +741,16 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE(1); - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 
e5ba96c41f3fc6ca05893e03415b4cfca73e5102..57af53a6a2d8470dcb2bc64624426c3d76f1af47 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -247,13 +247,19 @@ static void iommu_tce_kill_rm(struct iommu_table *tbl, tbl->it_ops->tce_kill(tbl, entry, pages, true); } -static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, - unsigned long entry) +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt, + struct iommu_table *tbl, unsigned long entry) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; + unsigned long i; + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); + + for (i = 0; i < subpages; ++i) { + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir); + } } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -316,6 +322,8 @@ static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -379,6 +387,8 @@ static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, break; } + iommu_tce_kill_rm(tbl, io_entry, subpages); + return ret; } @@ -424,10 +434,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - iommu_tce_kill_rm(stit->tbl, entry, 1); - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry); return ret; } } @@ -569,7 +577,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ua = 0; if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto invalidate_exit; + goto unlock_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -578,19 +586,15 
@@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, - entry); - goto invalidate_exit; + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, + entry + i); + goto unlock_exit; } } kvmppc_rm_tce_put(stt, entry + i, tce); } -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, entry, npages); - unlock_exit: if (!prereg) arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); @@ -632,20 +636,16 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - goto invalidate_exit; + return ret; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i); } } for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); -invalidate_exit: - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) - iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages); - return ret; } diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 2a57e93a79dcf703b920f1b556d46d60308507e4..7245355bee28bd4bd1fa56195f948956c0533e57 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -133,11 +133,11 @@ int p9_dd22_bl_ev[] = { /* Table of alternatives, sorted by column 0 */ static const unsigned int power9_event_alternatives[][MAX_ALT] = { - { PM_INST_DISP, PM_INST_DISP_ALT }, - { PM_RUN_CYC_ALT, PM_RUN_CYC }, - { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, - { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, { PM_BR_2PATH, PM_BR_2PATH_ALT }, + { PM_INST_DISP, PM_INST_DISP_ALT }, + { PM_RUN_CYC_ALT, PM_RUN_CYC }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[]) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 
d17396ef4323d97d584c720eaa0333319065ae92..f39cfb5a6535b30d792192ce0200af3ae55dd408 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,6 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 3eac70518e6f43e638575840a9a30fd1ecba0cf1..f013c7b9588124341fea5f73286e9df569971e4f 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7370,8 +7370,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_DYNAMIC_HUGETLB=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 0e327a01f50fbbff551727d38ee51fe351011254..46a067bd7e0bada795422c5f22ffb96d1e0f31a1 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -29,15 +29,13 @@ typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { - compat_dev_t st_dev; - u16 __pad1; + u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; - compat_dev_t st_rdev; - u16 __pad2; + u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ed49a4abd20ec7c1b025da10d7c692fc64b02238..ac7c45889947be25bca231eff0f37a1e9f750fe9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1271,6 +1271,7 @@ static void 
kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; p->mce_count = 0; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); @@ -1278,22 +1279,36 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; } - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. + */ + if (ret == -EHWPOISON) + return; + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); } -static void queue_task_work(struct mce *m, char *msg, int kill_current_task) +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; @@ -1303,11 +1318,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task) current->mce_kflags = m->kflags; current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); current->mce_whole_page = whole_page(m); - - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = 
kill_me_maybe; + current->mce_kill_me.func = func; } /* Ten is likely overkill. Don't expect more than two faults before task_work() */ @@ -1477,8 +1488,10 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, msg, kill_it); - + if (kill_it) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* * Handle an MCE which has happened in kernel space but from @@ -1495,7 +1508,7 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_it); + queue_task_work(&m, msg, kill_me_never); } instrumentation_end(); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 77b9b2a3b5c84dbf8418d7816e84412297fbc12b..403ba3eb4b8443071a30b1ce32f2fbe04412e722 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -225,6 +225,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Don't try to copy the tail if machine check happened * * Input: + * eax trap number written by ex_handler_copy() * rdi destination * rsi source * rdx count @@ -233,24 +234,19 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * eax uncopied bytes or 0 if successful. */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) - movl %edx,%ecx - cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ + cmp $X86_TRAP_MC,%eax je 3f + + movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax ASM_CLAC ret - /* - * Return zero to pretend that this copy succeeded. This - * is counter-intuitive, but needed to prevent the code - * in lib/iov_iter.c from retrying and running back into - * the poison cache line again. The machine check handler - * will ensure that a SIGBUS is sent to the task. 
- */ -3: xorl %eax,%eax +3: + movl %edx,%eax ASM_CLAC - ret + RET _ASM_EXTABLE_CPY(1b, 2b) SYM_CODE_END(.Lcopy_user_handle_tail) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1eed5eaee41fb6c2c2d2c57badb4e42f5f2c5dd4..b1d5c05aeca8dfa82886e0c6bd29b68fde0c73f6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1225,7 +1225,7 @@ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { -#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i; for_each_online_node(i) diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index 45cc0ae0af6f966ed778253b9624d46a783c2152..c7b9f12896f20a6870588cd50e9849e11c992dcf 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -29,7 +29,7 @@ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lsave_cp_regs_cp##x: \ - xchal_cp##x##_store a2 a4 a5 a6 a7; \ + xchal_cp##x##_store a2 a3 a4 a5 a6; \ jx a0; \ .endif @@ -46,7 +46,7 @@ .if XTENSA_HAVE_COPROCESSOR(x); \ .align 4; \ .Lload_cp_regs_cp##x: \ - xchal_cp##x##_load a2 a4 a5 a6 a7; \ + xchal_cp##x##_load a2 a3 a4 a5 a6; \ jx a0; \ .endif diff --git a/arch/xtensa/kernel/jump_label.c b/arch/xtensa/kernel/jump_label.c index 0dde21e0d3de4c2836bbce5c7fee361811863ec8..ad1841cecdfb769a32c7ed61fa62d027cde4bbcf 100644 --- a/arch/xtensa/kernel/jump_label.c +++ b/arch/xtensa/kernel/jump_label.c @@ -40,7 +40,7 @@ static int patch_text_stop_machine(void *data) { struct patch *patch = data; - if (atomic_inc_return(&patch->cpu_count) == 1) { + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { local_patch_text(patch->addr, patch->data, patch->sz); atomic_inc(&patch->cpu_count); } else { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0a4fcbda8ab45b9f235995b52b1ef5271f6a0112..548d758365c632852a2c9f2ba630b5fba4aa2bf6 100644 --- a/block/blk-sysfs.c +++ 
b/block/blk-sysfs.c @@ -821,6 +821,38 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void disk_init_partition(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_scan_partitions(disk); + + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -910,9 +942,23 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); + + /* + * Set the flag last, so that the block device can't be opened + * before its registration is done. + */ + disk->flags |= GENHD_FL_UP; ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); + /* + * Init partitions after releasing 'sysfs_dir_lock', otherwise lockdep + * will be confused because it will treat 'bd_mutex' from different + * devices as the same lock. 
+ */ + if (!ret) + disk_init_partition(disk); + return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); diff --git a/block/genhd.c b/block/genhd.c index 8b37fcfa10d18995c83f14810f479c1cf77f31f0..021c9c2d7231ae1fad4f656be0c48bac7b14d675 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -687,25 +687,10 @@ static int exact_lock(dev_t devt, void *data) return 0; } -static void disk_scan_partitions(struct gendisk *disk) -{ - struct block_device *bdev; - - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; - - set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); -} - static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct hd_struct *part; int err; ddev->parent = parent; @@ -743,18 +728,6 @@ static void register_disk(struct device *parent, struct gendisk *disk, if (disk->flags & GENHD_FL_HIDDEN) return; - disk_scan_partitions(disk); - - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); - if (disk->queue->backing_dev_info->dev) { err = sysfs_create_link(&ddev->kobj, &disk->queue->backing_dev_info->dev->kobj, @@ -799,8 +772,6 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, WARN_ON(!disk->minors && !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); - disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); if (retval) { WARN_ON(1); diff --git a/block/ioctl.c b/block/ioctl.c index 8171858dc8a9d8bcd2d4740a1e6da082309e90ae..7c578f84a4fd9577d3b55e10b61190d9b61f5ca9 100644 --- 
a/block/ioctl.c +++ b/block/ioctl.c @@ -683,7 +683,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if ((size >> 9) > ~(compat_ulong_t)0) return -EFBIG; return compat_put_ulong(argp, size >> 9); diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c index b066809ba9a110c30f3ac14d497f374e9b588ecb..c56f4043b0cc0f08951fb34ca3d216b3ecd46590 100644 --- a/drivers/ata/pata_marvell.c +++ b/drivers/ata/pata_marvell.c @@ -83,6 +83,8 @@ static int marvell_cable_detect(struct ata_port *ap) switch(ap->port_no) { case 0: + if (!ap->ioaddr.bmdma_addr) + return ATA_CBL_PATA_UNK; if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1) return ATA_CBL_PATA40; return ATA_CBL_PATA80; diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 90afba0b36fe97514b661d8489be85b20b95966e..47552db6b8dc3699b68f249f5f835e71bcd32258 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1390,7 +1390,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, { struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan); struct at_xdmac *atxdmac = to_at_xdmac(atchan->chan.device); - struct at_xdmac_desc *desc, *_desc; + struct at_xdmac_desc *desc, *_desc, *iter; struct list_head *descs_list; enum dma_status ret; int residue, retry; @@ -1505,11 +1505,13 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, * microblock. 
*/ descs_list = &desc->descs_list; - list_for_each_entry_safe(desc, _desc, descs_list, desc_node) { - dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg); - residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth; - if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda) + list_for_each_entry_safe(iter, _desc, descs_list, desc_node) { + dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg); + residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth; + if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) { + desc = iter; break; + } } residue += cur_ubc << dwidth; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 7b41cdff1a2ce56c0d4a68286c5954e4257edfdb..51af0dfc3c63e01f842d1e741f15ef2834553c50 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1098,6 +1098,9 @@ static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attr u64 xfer_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; @@ -1132,6 +1135,9 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu u64 batch_size; int rc; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + if (wq->state != IDXD_WQ_DISABLED) return -EPERM; diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 306f93e4b26a8875a194cc20a112c369b0bc5366..792c91cd16080b661b763f921eb5f3df08d6bdbf 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1789,7 +1789,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) u32 reg, val, shift, num_map, i; int ret = 0; - if (IS_ERR(np) || IS_ERR(gpr_np)) + if (IS_ERR(np) || !gpr_np) goto out; event_remap = of_find_property(np, propname, NULL); @@ -1837,7 +1837,7 @@ static int sdma_event_remap(struct sdma_engine *sdma) } out: - if (!IS_ERR(gpr_np)) + if (gpr_np) of_node_put(gpr_np); return ret; diff --git a/drivers/dma/mediatek/mtk-uart-apdma.c b/drivers/dma/mediatek/mtk-uart-apdma.c index 
375e7e647df6b5093b156c2cd88e7f7f0e6798d3..a1517ef1f4a0185700343797ef05d8ef6810ed0a 100644 --- a/drivers/dma/mediatek/mtk-uart-apdma.c +++ b/drivers/dma/mediatek/mtk-uart-apdma.c @@ -274,7 +274,7 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) unsigned int status; int ret; - ret = pm_runtime_get_sync(mtkd->ddev.dev); + ret = pm_runtime_resume_and_get(mtkd->ddev.dev); if (ret < 0) { pm_runtime_put_noidle(chan->device->dev); return ret; @@ -288,18 +288,21 @@ static int mtk_uart_apdma_alloc_chan_resources(struct dma_chan *chan) ret = readx_poll_timeout(readl, c->base + VFF_EN, status, !status, 10, 100); if (ret) - return ret; + goto err_pm; ret = request_irq(c->irq, mtk_uart_apdma_irq_handler, IRQF_TRIGGER_NONE, KBUILD_MODNAME, chan); if (ret < 0) { dev_err(chan->device->dev, "Can't request dma IRQ\n"); - return -EINVAL; + ret = -EINVAL; + goto err_pm; } if (mtkd->support_33bits) mtk_uart_apdma_write(c, VFF_4G_SUPPORT, VFF_4G_SUPPORT_CLR_B); +err_pm: + pm_runtime_put_noidle(mtkd->ddev.dev); return ret; } diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index f4eb071327be08163e9463d9abbbb3ac1885f9c1..bf4297075c229a5fbb572f4e64befc2c02da253f 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -161,7 +161,9 @@ const char * const edac_mem_types[] = { [MEM_DDR4] = "Unbuffered-DDR4", [MEM_RDDR4] = "Registered-DDR4", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", + [MEM_DDR5] = "Unbuffered-DDR5", [MEM_NVDIMM] = "Non-volatile-RAM", + [MEM_HBM2] = "High-bandwidth-memory-Gen2", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 3a7362f968c9f60d4682911d043fa4b3e7ed6633..d63ddc9c994dca1bdd678fd6be02e78fb6e03192 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.3" +#define I10NM_REVISION "v0.0.5" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -24,20 
+24,165 @@ pci_read_config_dword((d)->uracu, 0xd0, &(reg)) #define I10NM_GET_IMC_BAR(d, i, reg) \ pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) +#define I10NM_GET_SAD(d, offset, i, reg)\ + pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) +#define I10NM_GET_HBM_IMC_BAR(d, reg) \ + pci_read_config_dword((d)->uracu, 0xd4, &(reg)) +#define I10NM_GET_CAPID3_CFG(d, reg) \ + pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + 0x2080c + (i) * 0x4000 + (j) * 4) + readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ + (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ - readl((m)->mbase + 0x20970 + (i) * 0x4000) + readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ + (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + 0x20ef8 + (i) * 0x4000) + readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_AMAP(m, i) \ + readl((m)->mbase + ((m)->hbm_mc ? 
0x814 : 0x20814) + \ + (i) * (m)->chan_mmio_sz) +#define I10NM_GET_REG32(m, i, offset) \ + readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_GET_REG64(m, i, offset) \ + readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) +#define I10NM_SET_REG32(m, i, offset, v) \ + writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23) #define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12) #define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \ GET_BITFIELD(reg, 0, 10) + 1) << 12) +#define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg) \ + ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) + +#define I10NM_HBM_IMC_MMIO_SIZE 0x9000 +#define I10NM_IS_HBM_PRESENT(reg) GET_BITFIELD(reg, 27, 30) +#define I10NM_IS_HBM_IMC(reg) GET_BITFIELD(reg, 29, 29) + +#define RETRY_RD_ERR_LOG_UC BIT(1) +#define RETRY_RD_ERR_LOG_NOOVER BIT(14) +#define RETRY_RD_ERR_LOG_EN BIT(15) +#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) +#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) + +#define I10NM_MAX_SAD 16 +#define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) +#define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) static struct list_head *i10nm_edac_list; +static struct res_config *res_cfg; +static int retry_rd_err_log; + +static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; +static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; +static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; + +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable) +{ + u32 s, d; + + if (!imc->mbase) + return; + + s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]); + d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]); + + if (enable) { + /* Save default 
configurations */ + imc->chan[chan].retry_rd_err_log_s = s; + imc->chan[chan].retry_rd_err_log_d = d; + + s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + s |= RETRY_RD_ERR_LOG_EN; + d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + d |= RETRY_RD_ERR_LOG_EN; + } else { + /* Restore default configurations */ + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) + s |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER) + s |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN)) + s &= ~RETRY_RD_ERR_LOG_EN; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC) + d |= RETRY_RD_ERR_LOG_UC; + if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER) + d |= RETRY_RD_ERR_LOG_NOOVER; + if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) + d &= ~RETRY_RD_ERR_LOG_EN; + } + + I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s); + I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d); +} + +static void enable_retry_rd_err_log(bool enable) +{ + struct skx_dev *d; + int i, j; + + edac_dbg(2, "\n"); + + list_for_each_entry(d, i10nm_edac_list, list) + for (i = 0; i < I10NM_NUM_IMC; i++) + for (j = 0; j < I10NM_NUM_CHANNELS; j++) + __enable_retry_rd_err_log(&d->imc[i], j, enable); +} + +static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, + int len, bool scrub_err) +{ + struct skx_imc *imc = &res->dev->imc[res->imc]; + u32 log0, log1, log2, log3, log4; + u32 corr0, corr1, corr2, corr3; + u64 log2a, log5; + u32 *offsets; + int n; + + if (!imc->mbase) + return; + + offsets = scrub_err ? 
res_cfg->offsets_scrub : res_cfg->offsets_demand; + + log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); + log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); + log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]); + log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); + log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); + + if (res_cfg->type == SPR) { + log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]", + log0, log1, log2a, log3, log4, log5); + } else { + log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); + n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", + log0, log1, log2, log3, log4, log5); + } + + corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); + corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); + corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); + corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); + + if (len - n > 0) + snprintf(msg + n, len - n, + " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", + corr0 & 0xffff, corr0 >> 16, + corr1 & 0xffff, corr1 >> 16, + corr2 & 0xffff, corr2 >> 16, + corr3 & 0xffff, corr3 >> 16); + + /* Clear status bits */ + if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { + log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; + I10NM_SET_REG32(imc, res->channel, offsets[0], log0); + } +} + static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, unsigned int dev, unsigned int fun) { @@ -61,7 +206,32 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, return pdev; } -static int i10nm_get_all_munits(void) +static bool i10nm_check_2lm(struct res_config *cfg) +{ + struct skx_dev *d; + u32 reg; + int i; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1], + PCI_SLOT(cfg->sad_all_devfn), + PCI_FUNC(cfg->sad_all_devfn)); + if (!d->sad_all) + continue; + + for (i = 0; i < I10NM_MAX_SAD; i++) { + 
I10NM_GET_SAD(d, cfg->sad_all_offset, i, reg); + if (I10NM_SAD_ENABLE(reg) && I10NM_SAD_NM_CACHEABLE(reg)) { + edac_dbg(2, "2-level memory configuration.\n"); + return true; + } + } + } + + return false; +} + +static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; void __iomem *mbase; @@ -89,7 +259,7 @@ static int i10nm_get_all_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < I10NM_NUM_IMC; i++) { + for (i = 0; i < I10NM_NUM_DDR_IMC; i++) { mdev = pci_get_dev_wrapper(d->seg, d->bus[0], 12 + i, 0); if (i == 0 && !mdev) { @@ -125,16 +295,132 @@ static int i10nm_get_all_munits(void) return 0; } +static bool i10nm_check_hbm_imc(struct skx_dev *d) +{ + u32 reg; + + if (I10NM_GET_CAPID3_CFG(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get capid3_cfg\n"); + return false; + } + + return I10NM_IS_HBM_PRESENT(reg) != 0; +} + +static int i10nm_get_hbm_munits(void) +{ + struct pci_dev *mdev; + void __iomem *mbase; + u32 reg, off, mcmtr; + struct skx_dev *d; + int i, lmc; + u64 base; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3); + if (!d->pcu_cr3) + return -ENODEV; + + if (!i10nm_check_hbm_imc(d)) { + i10nm_printk(KERN_DEBUG, "No hbm memory\n"); + return -ENODEV; + } + + if (I10NM_GET_SCK_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get socket bar\n"); + return -ENODEV; + } + base = I10NM_GET_SCK_MMIO_BASE(reg); + + if (I10NM_GET_HBM_IMC_BAR(d, reg)) { + i10nm_printk(KERN_ERR, "Failed to get hbm mc bar\n"); + return -ENODEV; + } + base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg); + + lmc = I10NM_NUM_DDR_IMC; + + for (i = 0; i < I10NM_NUM_HBM_IMC; i++) { + mdev = pci_get_dev_wrapper(d->seg, d->bus[0], + 12 + i / 4, 1 + i % 4); + if (i == 0 && !mdev) { + i10nm_printk(KERN_ERR, "No hbm mc found\n"); + return -ENODEV; + } + if (!mdev) + continue; + + d->imc[lmc].mdev = mdev; + off = i * I10NM_HBM_IMC_MMIO_SIZE; + + edac_dbg(2, "hbm mc%d mmio base 0x%llx 
size 0x%x\n", + lmc, base + off, I10NM_HBM_IMC_MMIO_SIZE); + + mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); + if (!mbase) { + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", + base + off); + return -ENOMEM; + } + + d->imc[lmc].mbase = mbase; + d->imc[lmc].hbm_mc = true; + + mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); + if (!I10NM_IS_HBM_IMC(mcmtr)) { + iounmap(d->imc[lmc].mbase); + d->imc[lmc].mbase = NULL; + d->imc[lmc].hbm_mc = false; + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); + return -ENODEV; + } + + lmc++; + } + } + + return 0; +} + static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xcc, + .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, }; static struct res_config i10nm_cfg1 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x4000, + .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_offset = 0x108, + .offsets_scrub = offsets_scrub_icx, + .offsets_demand = offsets_demand_icx, +}; + +static struct res_config spr_cfg = { + .type = SPR, + .decs_did = 0x3252, + .busno_cfg_offset = 0xd0, + .ddr_chan_mmio_sz = 0x8000, + .hbm_chan_mmio_sz = 0x4000, + .support_ddr5 = true, + .sad_all_devfn = PCI_DEVFN(10, 0), + .sad_all_offset = 0x300, + .offsets_scrub = offsets_scrub_spr, + .offsets_demand = offsets_demand_spr, }; static const struct x86_cpu_id i10nm_cpuids[] = { @@ -143,6 +429,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1), + 
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); @@ -157,29 +444,31 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int i10nm_get_dimm_config(struct mem_ctl_info *mci) +static int i10nm_get_dimm_config(struct mem_ctl_info *mci, + struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; struct skx_imc *imc = pvt->imc; + u32 mtr, amap, mcddrtcfg; struct dimm_info *dimm; - u32 mtr, mcddrtcfg; int i, j, ndimms; - for (i = 0; i < I10NM_NUM_CHANNELS; i++) { + for (i = 0; i < imc->num_channels; i++) { if (!imc->mbase) continue; ndimms = 0; mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); - for (j = 0; j < I10NM_NUM_DIMMS; j++) { + amap = I10NM_GET_AMAP(imc, i); + for (j = 0; j < imc->num_dimms; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); edac_dbg(1, "dimmmtr 0x%x mcddrtcfg 0x%x (mc%d ch%d dimm%d)\n", mtr, mcddrtcfg, imc->mc, i, j); if (IS_DIMM_PRESENT(mtr)) - ndimms += skx_get_dimm_info(mtr, 0, 0, dimm, - imc, i, j); + ndimms += skx_get_dimm_info(mtr, 0, amap, dimm, + imc, i, j, cfg); else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -271,6 +560,7 @@ static int __init i10nm_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + res_cfg = cfg; rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm); if (rc) @@ -284,8 +574,11 @@ static int __init i10nm_init(void) return -ENODEV; } - rc = i10nm_get_all_munits(); - if (rc < 0) + skx_set_mem_cfg(i10nm_check_2lm(cfg)); + + rc = i10nm_get_ddr_munits(); + + if (i10nm_get_hbm_munits() && rc) goto fail; list_for_each_entry(d, i10nm_edac_list, list) { @@ -306,10 +599,19 @@ static int __init i10nm_init(void) d->imc[i].lmc = i; d->imc[i].src_id = src_id; d->imc[i].node_id = node_id; + if (d->imc[i].hbm_mc) { + d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz; + d->imc[i].num_channels = 
I10NM_NUM_HBM_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_HBM_DIMMS; + } else { + d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; + d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS; + d->imc[i].num_dimms = I10NM_NUM_DDR_DIMMS; + } rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, "Intel_10nm Socket", EDAC_MOD_STR, - i10nm_get_dimm_config); + i10nm_get_dimm_config, cfg); if (rc < 0) goto fail; } @@ -323,6 +625,12 @@ static int __init i10nm_init(void) mce_register_decode_chain(&i10nm_mce_dec); setup_i10nm_debug(); + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, show_retry_rd_err_log); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(true); + } + i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION); return 0; @@ -334,6 +642,13 @@ static int __init i10nm_init(void) static void __exit i10nm_exit(void) { edac_dbg(2, "\n"); + + if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + skx_set_decode(NULL, NULL); + if (retry_rd_err_log == 2) + enable_retry_rd_err_log(false); + } + teardown_i10nm_debug(); mce_unregister_decode_chain(&i10nm_mce_dec); skx_adxl_put(); @@ -343,5 +658,8 @@ static void __exit i10nm_exit(void) module_init(i10nm_init); module_exit(i10nm_exit); +module_param(retry_rd_err_log, int, 0444); +MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)"); + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors"); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index f887e31666510514709166b585329553f752e08d..1abc020d49ab64a6654ab7b9b80f5140e17532d1 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -174,7 +174,7 @@ static bool skx_check_ecc(u32 mcmtr) return !!GET_BITFIELD(mcmtr, 2, 2); } -static int skx_get_dimm_config(struct mem_ctl_info 
*mci) +static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { struct skx_pvt *pvt = mci->pvt_info; u32 mtr, mcmtr, amap, mcddrtcfg; @@ -195,7 +195,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); if (IS_DIMM_PRESENT(mtr)) { - ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j); + ndimms += skx_get_dimm_info(mtr, mcmtr, amap, dimm, imc, i, j, cfg); } else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) { ndimms += skx_get_nvdimm_info(dimm, imc, i, j, EDAC_MOD_STR); @@ -230,7 +230,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci) #define SKX_ILV_TARGET(tgt) ((tgt) & 7) static void skx_show_retry_rd_err_log(struct decoded_addr *res, - char *msg, int len) + char *msg, int len, + bool scrub_err) { u32 log0, log1, log2, log3, log4; u32 corr0, corr1, corr2, corr3; @@ -705,7 +706,7 @@ static int __init skx_init(void) d->imc[i].node_id = node_id; rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, - skx_get_dimm_config); + skx_get_dimm_config, cfg); if (rc < 0) goto fail; } diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 2b4ce8e5ac2fa6e48721cea55e878784df57fdf4..19c17c5198c5f9d8bf9f97feae2d4c3b5669115a 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -23,10 +23,13 @@ #include "skx_common.h" static const char * const component_names[] = { - [INDEX_SOCKET] = "ProcessorSocketId", - [INDEX_MEMCTRL] = "MemoryControllerId", - [INDEX_CHANNEL] = "ChannelId", - [INDEX_DIMM] = "DimmSlotId", + [INDEX_SOCKET] = "ProcessorSocketId", + [INDEX_MEMCTRL] = "MemoryControllerId", + [INDEX_CHANNEL] = "ChannelId", + [INDEX_DIMM] = "DimmSlotId", + [INDEX_NM_MEMCTRL] = "NmMemoryControllerId", + [INDEX_NM_CHANNEL] = "NmChannelId", + [INDEX_NM_DIMM] = "NmDimmSlotId", }; static int component_indices[ARRAY_SIZE(component_names)]; @@ -34,12 +37,14 @@ static int adxl_component_count; static const char * 
const *adxl_component_names; static u64 *adxl_values; static char *adxl_msg; +static unsigned long adxl_nm_bitmap; static char skx_msg[MSG_SIZE]; static skx_decode_f skx_decode; static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); +static bool skx_mem_cfg_2lm; int __init skx_adxl_get(void) { @@ -56,14 +61,25 @@ int __init skx_adxl_get(void) for (j = 0; names[j]; j++) { if (!strcmp(component_names[i], names[j])) { component_indices[i] = j; + + if (i >= INDEX_NM_FIRST) + adxl_nm_bitmap |= 1 << i; + break; } } - if (!names[j]) + if (!names[j] && i < INDEX_NM_FIRST) goto err; } + if (skx_mem_cfg_2lm) { + if (!adxl_nm_bitmap) + skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n"); + else + edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap); + } + adxl_component_names = names; while (*names++) adxl_component_count++; @@ -99,7 +115,7 @@ void __exit skx_adxl_put(void) kfree(adxl_msg); } -static bool skx_adxl_decode(struct decoded_addr *res) +static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem) { struct skx_dev *d; int i, len = 0; @@ -116,11 +132,20 @@ static bool skx_adxl_decode(struct decoded_addr *res) } res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; - res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; - res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; - res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + if (error_in_1st_level_mem) { + res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? + (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1; + res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ? + (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1; + res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ? 
+ (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1; + } else { + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]]; + res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]]; + } - if (res->imc > NUM_IMC - 1) { + if (res->imc > NUM_IMC - 1 || res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -151,6 +176,11 @@ static bool skx_adxl_decode(struct decoded_addr *res) return true; } +void skx_set_mem_cfg(bool mem_cfg_2lm) +{ + skx_mem_cfg_2lm = mem_cfg_2lm; +} + void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { skx_decode = decode; @@ -304,14 +334,27 @@ static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno) + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg) { - int banks = 16, ranks, rows, cols, npages; + int banks, ranks, rows, cols, npages; + enum mem_type mtype; u64 size; ranks = numrank(mtr); rows = numrow(mtr); - cols = numcol(mtr); + cols = imc->hbm_mc ? 
6 : numcol(mtr); + + if (imc->hbm_mc) { + banks = 32; + mtype = MEM_HBM2; + } else if (cfg->support_ddr5 && (amap & 0x8)) { + banks = 32; + mtype = MEM_DDR5; + } else { + banks = 16; + mtype = MEM_DDR4; + } /* * Compute size in 8-byte (2^3) words, then shift to MiB (2^20) @@ -332,10 +375,15 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, dimm->nr_pages = npages; dimm->grain = 32; dimm->dtype = get_width(mtr); - dimm->mtype = MEM_DDR4; + dimm->mtype = mtype; dimm->edac_mode = EDAC_SECDED; /* likely better than this */ - snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", - imc->src_id, imc->lmc, chan, dimmno); + + if (imc->hbm_mc) + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u", + imc->src_id, imc->lmc, chan); + else + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u", + imc->src_id, imc->lmc, chan, dimmno); return 1; } @@ -390,7 +438,8 @@ int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config) + get_dimm_config_f get_dimm_config, + struct res_config *cfg) { struct mem_ctl_info *mci; struct edac_mc_layer layers[2]; @@ -425,13 +474,15 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, } mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM; + if (cfg->support_ddr5) + mci->mtype_cap |= MEM_FLAG_DDR5; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = mod_str; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; - rc = get_dimm_config(mci); + rc = get_dimm_config(mci, cfg); if (rc < 0) goto fail; @@ -481,6 +532,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); + bool scrub_err = 
false; bool recoverable; int len; u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52); @@ -532,6 +584,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, break; case 4: optype = "memory scrubbing error"; + scrub_err = true; break; default: optype = "reserved"; @@ -554,7 +607,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } if (skx_show_retry_rd_err_log) - skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len); + skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err); edac_dbg(0, "%s\n", skx_msg); @@ -565,6 +618,21 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, skx_msg); } +static bool skx_error_in_1st_level_mem(const struct mce *m) +{ + u32 errcode; + + if (!skx_mem_cfg_2lm) + return false; + + errcode = GET_BITFIELD(m->status, 0, 15); + + if ((errcode & 0xef80) != 0x280) + return false; + + return true; +} + int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { @@ -584,7 +652,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, res.addr = mce->addr; if (adxl_component_count) { - if (!skx_adxl_decode(&res)) + if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))) return NOTIFY_DONE; } else if (!skx_decode || !skx_decode(&res)) { return NOTIFY_DONE; @@ -645,6 +713,8 @@ void skx_remove(void) } if (d->util_all) pci_dev_put(d->util_all); + if (d->pcu_cr3) + pci_dev_put(d->pcu_cr3); if (d->sad_all) pci_dev_put(d->sad_all); if (d->uracu) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 78f8c1de0b71c80ddcdb7d64fc4bd63a064d0f3f..03ac067a80b9f779fe2ef726672f222ba4403be7 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -9,6 +9,8 @@ #ifndef _SKX_COMM_EDAC_H #define _SKX_COMM_EDAC_H +#include + #define MSG_SIZE 1024 /* @@ -30,9 +32,17 @@ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_IMC 4 -#define 
I10NM_NUM_CHANNELS 2 -#define I10NM_NUM_DIMMS 2 +#define I10NM_NUM_DDR_IMC 4 +#define I10NM_NUM_DDR_CHANNELS 2 +#define I10NM_NUM_DDR_DIMMS 2 + +#define I10NM_NUM_HBM_IMC 16 +#define I10NM_NUM_HBM_CHANNELS 2 +#define I10NM_NUM_HBM_DIMMS 1 + +#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) +#define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) +#define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) @@ -54,17 +64,24 @@ struct skx_dev { struct pci_dev *sad_all; struct pci_dev *util_all; struct pci_dev *uracu; /* for i10nm CPU */ + struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ void __iomem *mbase; /* for i10nm CPU */ + int chan_mmio_sz; /* for i10nm CPU */ + int num_channels; /* channels per memory controller */ + int num_dimms; /* dimms per channel */ + bool hbm_mc; u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id, node_id; struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; + u32 retry_rd_err_log_s; + u32 retry_rd_err_log_d; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; @@ -82,7 +99,8 @@ struct skx_pvt { enum type { SKX, - I10NM + I10NM, + SPR }; enum { @@ -90,9 +108,17 @@ enum { INDEX_MEMCTRL, INDEX_CHANNEL, INDEX_DIMM, + INDEX_NM_FIRST, + INDEX_NM_MEMCTRL = INDEX_NM_FIRST, + INDEX_NM_CHANNEL, + INDEX_NM_DIMM, INDEX_MAX }; +#define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) +#define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) +#define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) + struct decoded_addr { struct skx_dev *dev; u64 addr; @@ -118,15 +144,28 @@ struct res_config { unsigned int decs_did; /* Default bus number configuration register offset */ int busno_cfg_offset; + /* Per DDR channel memory-mapped I/O size */ + int ddr_chan_mmio_sz; + /* Per HBM channel memory-mapped I/O size */ + 
int hbm_chan_mmio_sz; + bool support_ddr5; + /* SAD device number and function number */ + unsigned int sad_all_devfn; + int sad_all_offset; + /* Offsets of retry_rd_err_log registers */ + u32 *offsets_scrub; + u32 *offsets_demand; }; -typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci); +typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, + struct res_config *cfg); typedef bool (*skx_decode_f)(struct decoded_addr *res); -typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len); +typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err); int __init skx_adxl_get(void); void __exit skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); +void skx_set_mem_cfg(bool mem_cfg_2lm); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); @@ -136,14 +175,16 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list); int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm); int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, - struct skx_imc *imc, int chan, int dimmno); + struct skx_imc *imc, int chan, int dimmno, + struct res_config *cfg); int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, const char *mod_str); int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config); + get_dimm_config_f get_dimm_config, + struct res_config *cfg); int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data); diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 92906b56b1a2b9f64045fe2acacbed2b7a39fa7a..fea44dc0484b531feeba2bc0b1116e3175a1894b 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -163,6 +163,11 @@ #define ECC_STAT_CECNT_SHIFT 8 #define 
ECC_STAT_BITNUM_MASK 0x7F +/* ECC error count register definitions */ +#define ECC_ERRCNT_UECNT_MASK 0xFFFF0000 +#define ECC_ERRCNT_UECNT_SHIFT 16 +#define ECC_ERRCNT_CECNT_MASK 0xFFFF + /* DDR QOS Interrupt register definitions */ #define DDR_QOS_IRQ_STAT_OFST 0x20200 #define DDR_QOSUE_MASK 0x4 @@ -418,15 +423,16 @@ static int zynqmp_get_error_info(struct synps_edac_priv *priv) base = priv->baseaddr; p = &priv->stat; + regval = readl(base + ECC_ERRCNT_OFST); + p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK; + p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT; + if (!p->ce_cnt) + goto ue_err; + regval = readl(base + ECC_STAT_OFST); if (!regval) return 1; - p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT; - p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT; - if (!p->ce_cnt) - goto ue_err; - p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK); regval = readl(base + ECC_CEADDR0_OFST); diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index d18078748200933391555f6dccf834f06f22570c..59d8affad343a859063ac7b4a03ff29be419299e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1612,8 +1612,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gpiochip_set_irq_hooks(gc); - acpi_gpiochip_request_interrupts(gc); - /* * Using barrier() here to prevent compiler from reordering * gc->irq.initialized before initialization of above @@ -1623,6 +1621,8 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, gc->irq.initialized = true; + acpi_gpiochip_request_interrupts(gc); + return 0; } diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c index 83423092de2ff7cbdcb5ccb3b5fed7052e588477..da0799333970285d4d59e7cbc8a95ddd22ccbb11 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c @@ -179,7 +179,10 @@ static void mdp5_plane_reset(struct drm_plane *plane) drm_framebuffer_put(plane->state->fb); 
kfree(to_mdp5_plane_state(plane->state)); + plane->state = NULL; mdp5_state = kzalloc(sizeof(*mdp5_state), GFP_KERNEL); + if (!mdp5_state) + return; /* assign default blend parameters */ mdp5_state->alpha = 255; diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c index bbdd086be7f597f52c88dc8b16417fa97fd818c9..4b92c6341490583441b07c1a2397ce1b02da497f 100644 --- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c +++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c @@ -229,7 +229,7 @@ static void rpi_touchscreen_i2c_write(struct rpi_touchscreen *ts, ret = i2c_smbus_write_byte_data(ts->i2c, reg, val); if (ret) - dev_err(&ts->dsi->dev, "I2C write failed: %d\n", ret); + dev_err(&ts->i2c->dev, "I2C write failed: %d\n", ret); } static int rpi_touchscreen_write(struct rpi_touchscreen *ts, u16 reg, u32 val) @@ -265,7 +265,7 @@ static int rpi_touchscreen_noop(struct drm_panel *panel) return 0; } -static int rpi_touchscreen_enable(struct drm_panel *panel) +static int rpi_touchscreen_prepare(struct drm_panel *panel) { struct rpi_touchscreen *ts = panel_to_ts(panel); int i; @@ -295,6 +295,13 @@ static int rpi_touchscreen_enable(struct drm_panel *panel) rpi_touchscreen_write(ts, DSI_STARTDSI, 0x01); msleep(100); + return 0; +} + +static int rpi_touchscreen_enable(struct drm_panel *panel) +{ + struct rpi_touchscreen *ts = panel_to_ts(panel); + /* Turn on the backlight. 
*/ rpi_touchscreen_i2c_write(ts, REG_PWM, 255); @@ -349,7 +356,7 @@ static int rpi_touchscreen_get_modes(struct drm_panel *panel, static const struct drm_panel_funcs rpi_touchscreen_funcs = { .disable = rpi_touchscreen_disable, .unprepare = rpi_touchscreen_noop, - .prepare = rpi_touchscreen_noop, + .prepare = rpi_touchscreen_prepare, .enable = rpi_touchscreen_enable, .get_modes = rpi_touchscreen_get_modes, }; diff --git a/drivers/gpu/drm/vc4/vc4_dsi.c b/drivers/gpu/drm/vc4/vc4_dsi.c index eaf276978ee7fb79a145868071ddd91f86fbdd12..ad84b56f4091daf75a841c41cb7bbd7c6c686883 100644 --- a/drivers/gpu/drm/vc4/vc4_dsi.c +++ b/drivers/gpu/drm/vc4/vc4_dsi.c @@ -835,7 +835,7 @@ static void vc4_dsi_encoder_enable(struct drm_encoder *encoder) unsigned long phy_clock; int ret; - ret = pm_runtime_get_sync(dev); + ret = pm_runtime_resume_and_get(dev); if (ret) { DRM_ERROR("Failed to runtime PM enable on DSI%d\n", dsi->port); return; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 0cf8ae8aeac8391f9c13011507dd35c526b20713..2fb4126ae8d8a3a3fd8b5e107aa1e459a6f039e6 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -480,8 +480,8 @@ int aq_nic_start(struct aq_nic_s *self) if (err < 0) goto err_exit; - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { + for (i = 0U; self->aq_vecs > i; ++i) { + aq_vec = self->aq_vec[i]; err = aq_vec_start(aq_vec); if (err < 0) goto err_exit; @@ -511,8 +511,8 @@ int aq_nic_start(struct aq_nic_s *self) mod_timer(&self->polling_timer, jiffies + AQ_CFG_POLLING_TIMER_INTERVAL); } else { - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { + for (i = 0U; self->aq_vecs > i; ++i) { + aq_vec = self->aq_vec[i]; err = aq_pci_func_alloc_irq(self, i, self->ndev->name, aq_vec_isr, aq_vec, aq_vec_get_affinity_mask(aq_vec)); diff --git 
a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 1826253f97dc40b8b9727730c397112317207bcf..bdfd462c74db93ce5d1089ed9117e2810a916775 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -450,22 +450,22 @@ static int atl_resume_common(struct device *dev, bool deep) static int aq_pm_freeze(struct device *dev) { - return aq_suspend_common(dev, false); + return aq_suspend_common(dev, true); } static int aq_pm_suspend_poweroff(struct device *dev) { - return aq_suspend_common(dev, true); + return aq_suspend_common(dev, false); } static int aq_pm_thaw(struct device *dev) { - return atl_resume_common(dev, false); + return atl_resume_common(dev, true); } static int aq_pm_resume_restore(struct device *dev) { - return atl_resume_common(dev, true); + return atl_resume_common(dev, false); } static const struct dev_pm_ops aq_pm_ops = { diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index f4774cf051c9780cfc434450673bb82d175e2736..6ab1f3212d2463a53ac35fc41f1aa3706d376066 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -43,8 +43,8 @@ static int aq_vec_poll(struct napi_struct *napi, int budget) if (!self) { err = -EINVAL; } else { - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; u64_stats_update_begin(&ring[AQ_VEC_RX_ID].stats.rx.syncp); ring[AQ_VEC_RX_ID].stats.rx.polls++; u64_stats_update_end(&ring[AQ_VEC_RX_ID].stats.rx.syncp); @@ -182,8 +182,8 @@ int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops, self->aq_hw_ops = aq_hw_ops; self->aq_hw = aq_hw; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = 
self->ring[i]; err = aq_ring_init(&ring[AQ_VEC_TX_ID], ATL_RING_TX); if (err < 0) goto err_exit; @@ -224,8 +224,8 @@ int aq_vec_start(struct aq_vec_s *self) unsigned int i = 0U; int err = 0; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; err = self->aq_hw_ops->hw_ring_tx_start(self->aq_hw, &ring[AQ_VEC_TX_ID]); if (err < 0) @@ -248,8 +248,8 @@ void aq_vec_stop(struct aq_vec_s *self) struct aq_ring_s *ring = NULL; unsigned int i = 0U; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; self->aq_hw_ops->hw_ring_tx_stop(self->aq_hw, &ring[AQ_VEC_TX_ID]); @@ -268,8 +268,8 @@ void aq_vec_deinit(struct aq_vec_s *self) if (!self) goto err_exit; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]); aq_ring_rx_deinit(&ring[AQ_VEC_RX_ID]); } @@ -297,8 +297,8 @@ void aq_vec_ring_free(struct aq_vec_s *self) if (!self) goto err_exit; - for (i = 0U, ring = self->ring[0]; - self->tx_rings > i; ++i, ring = self->ring[i]) { + for (i = 0U; self->tx_rings > i; ++i) { + ring = self->ring[i]; aq_ring_free(&ring[AQ_VEC_TX_ID]); if (i < self->rx_rings) aq_ring_free(&ring[AQ_VEC_RX_ID]); diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 78c6d133f54fad5d759b57adfe30ce256feb2cc7..3244f69555f717bc2e8bb5a1ae0390b03a007455 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1531,6 +1531,7 @@ static void macb_tx_restart(struct macb_queue *queue) unsigned int head = queue->tx_head; unsigned int tail = queue->tx_tail; struct macb *bp = queue->bp; + unsigned int head_idx, tbqp; if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) queue_writel(queue, ISR, 
MACB_BIT(TXUBR)); @@ -1538,6 +1539,13 @@ static void macb_tx_restart(struct macb_queue *queue) if (head == tail) return; + tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp); + tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp)); + head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head)); + + if (tbqp == head_idx) + return; + macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); } diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index 763d2c7b5fb1a78225ad757e1aee7578ab310c36..5750f9a56393a038c3e50eb0502218f7696b33d0 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -489,11 +489,15 @@ static int dpaa_get_ts_info(struct net_device *net_dev, info->phc_index = -1; fman_node = of_get_parent(mac_node); - if (fman_node) + if (fman_node) { ptp_node = of_parse_phandle(fman_node, "ptimer-handle", 0); + of_node_put(fman_node); + } - if (ptp_node) + if (ptp_node) { ptp_dev = of_find_device_by_node(ptp_node); + of_node_put(ptp_node); + } if (ptp_dev) ptp = platform_get_drvdata(ptp_dev); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 15b1503d5b6ca9cd4aa7f8a7f28a2922aa90cca1..1f51252b465a6f3f746c21c5b8bc5c4ea6de1de4 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1006,8 +1006,8 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) { u32 reg = link << (E1000_LTRV_REQ_SHIFT + E1000_LTRV_NOSNOOP_SHIFT) | link << E1000_LTRV_REQ_SHIFT | E1000_LTRV_SEND; - u16 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ - u16 lat_enc_d = 0; /* latency decoded */ + u32 max_ltr_enc_d = 0; /* maximum LTR decoded by platform */ + u32 lat_enc_d = 0; /* latency decoded */ u16 lat_enc = 0; /* latency encoded */ if (link) { diff --git a/drivers/net/ethernet/intel/igc/igc_i225.c 
b/drivers/net/ethernet/intel/igc/igc_i225.c index 553d6bc78e6bd7242dc24609b7cf01ca0697a402..624236a4202e50a96d2b4b47ae64209aa889aef6 100644 --- a/drivers/net/ethernet/intel/igc/igc_i225.c +++ b/drivers/net/ethernet/intel/igc/igc_i225.c @@ -156,8 +156,15 @@ void igc_release_swfw_sync_i225(struct igc_hw *hw, u16 mask) { u32 swfw_sync; - while (igc_get_hw_semaphore_i225(hw)) - ; /* Empty */ + /* Releasing the resource requires first getting the HW semaphore. + * If we fail to get the semaphore, there is nothing we can do, + * except log an error and quit. We are not allowed to hang here + * indefinitely, as it may cause denial of service or system crash. + */ + if (igc_get_hw_semaphore_i225(hw)) { + hw_dbg("Failed to release SW_FW_SYNC.\n"); + return; + } swfw_sync = rd32(IGC_SW_FW_SYNC); swfw_sync &= ~mask; diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index e380b7a3ea63b19468d976ac56a4ff29116b5bbb..8de4de2e56362d0cf9ddb782931c6cdb60fc5616 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -583,7 +583,7 @@ static s32 igc_read_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 *data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; @@ -640,7 +640,7 @@ static s32 igc_write_phy_reg_mdic(struct igc_hw *hw, u32 offset, u16 data) * the lower time out */ for (i = 0; i < IGC_GEN_POLL_TIMEOUT; i++) { - usleep_range(500, 1000); + udelay(50); mdic = rd32(IGC_MDIC); if (mdic & IGC_MDIC_READY) break; diff --git a/drivers/net/ethernet/micrel/Kconfig b/drivers/net/ethernet/micrel/Kconfig index 9ceb7e1fb1696f8ff0f22928090590f17f33a8a1..42bc014136fe30644391e55e565feb1fbc05179d 100644 --- a/drivers/net/ethernet/micrel/Kconfig +++ b/drivers/net/ethernet/micrel/Kconfig @@ -37,7 +37,6 @@ config KS8851 config KS8851_MLL tristate "Micrel KS8851 MLL" depends on HAS_IOMEM - 
depends on PTP_1588_CLOCK_OPTIONAL select MII select CRC32 select EEPROM_93CX6 diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 07b1b8374cd26b8d9f2a52aa1a2d80d39b0b0c7f..53efcc9c40e28d33e4ba0332c8c88d5f1cfcbedc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -68,9 +68,9 @@ static int init_systime(void __iomem *ioaddr, u32 sec, u32 nsec) writel(value, ioaddr + PTP_TCR); /* wait for present system time initialize to complete */ - return readl_poll_timeout(ioaddr + PTP_TCR, value, + return readl_poll_timeout_atomic(ioaddr + PTP_TCR, value, !(value & PTP_TCR_TSINIT), - 10000, 100000); + 10, 100000); } static int config_addend(void __iomem *ioaddr, u32 addend) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c362a6ac94c42547467c0152e0ff2f76184f299c..73953b0141ba3786e8ee47a77b6c9452108d210e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -710,11 +710,11 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) - return -ENOBUFS; + return -ENOMEM; if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { kfree(rd); - return -ENOBUFS; + return -ENOMEM; } rd->remote_ip = *ip; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 6d5d5c39c635960b7ab715b18d3a939a0615a7a2..9929e90866f04f4b60af9a7b6801e932b4638434 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -557,7 +557,7 @@ enum brcmf_sdio_frmtype { BRCMF_SDIO_FT_SUB, }; -#define SDIOD_DRVSTR_KEY(chip, pmu) (((chip) << 16) | (pmu)) +#define SDIOD_DRVSTR_KEY(chip, pmu) (((unsigned int)(chip) << 16) | (pmu)) /* SDIO Pad drive strength to select value mappings */ struct sdiod_drive_str { diff --git 
a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c index ecaf85b483ac3f5b5915861fb6ec6a24757252dc..e57e49a722dc0334279575a213d703ad3cc3ef9c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c @@ -80,7 +80,7 @@ mt76x2e_probe(struct pci_dev *pdev, const struct pci_device_id *id) mt76_rmw_field(dev, 0x15a10, 0x1f << 16, 0x9); /* RG_SSUSB_G1_CDR_BIC_LTR = 0xf */ - mt76_rmw_field(dev, 0x15a0c, 0xf << 28, 0xf); + mt76_rmw_field(dev, 0x15a0c, 0xfU << 28, 0xf); /* RG_SSUSB_CDR_BR_PE1D = 0x3 */ mt76_rmw_field(dev, 0x15c58, 0x3 << 6, 0x3); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dcc047f01a0761fdbae4ae887fedd83bdb87157e..274635c0c02a107dcc2b53c640287c91553a2c37 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1250,6 +1250,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_EUI64_LEN; memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); return NVME_NIDT_EUI64_LEN; case NVME_NIDT_NGUID: @@ -1258,6 +1260,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_NGUID_LEN; memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); return NVME_NIDT_NGUID_LEN; case NVME_NIDT_UUID: @@ -1266,6 +1270,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, warn_str, cur->nidl); return -1; } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_UUID_LEN; uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; case NVME_NIDT_CSI: @@ -1361,12 +1367,18 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, if ((*id)->ncap == 0) /* namespace not allocated or attached */ goto out_free_id; - if 
(ctrl->vs >= NVME_VS(1, 1, 0) && - !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); - if (ctrl->vs >= NVME_VS(1, 2, 0) && - !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) { + dev_info(ctrl->device, + "Ignoring bogus Namespace Identifiers\n"); + } else { + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + } return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 94cee2c566d3229fcd177302e739a4ca91bd81e3..11d3cc2890f9aaeae88588ccdacee5783846685b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -151,6 +151,11 @@ enum nvme_quirks { * encoding the generation sequence number. */ NVME_QUIRK_SKIP_CID_GEN = (1 << 17), + + /* + * Reports garbage in the namespace identifiers (eui64, nguid, uuid). 
+ */ + NVME_QUIRK_BOGUS_NID = (1 << 18), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f435ab0809fbccb8e9fac959a651f1d6fce741c2..e0a3d03198a2aebad0c7c887d70ada8475e71871 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3212,7 +3212,10 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS | - NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + NVME_QUIRK_DISABLE_WRITE_ZEROES | + NVME_QUIRK_BOGUS_NID, }, + { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index fe075d9f95e274403b92010526979aada28a65a9..c87faafbdba246ff76e44ecfbbf93a78ac027f05 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -398,6 +398,9 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; + if (event == leader) + return 0; + for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; @@ -487,12 +490,7 @@ __hw_perf_event_init(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } - if (event->group_leader != event) { - if (validate_group(event) != 0) - return -EINVAL; - } - - return 0; + return validate_group(event); } static int armpmu_event_init(struct perf_event *event) diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c index d5cec6e35bb83618e74168bfbffd31de2f5bf2a8..0e456c39a603dd79edb787b1247239569b03e6d6 100644 --- a/drivers/platform/x86/samsung-laptop.c +++ b/drivers/platform/x86/samsung-laptop.c @@ -1121,8 +1121,6 @@ static void 
kbd_led_set(struct led_classdev *led_cdev, if (value > samsung->kbd_led.max_brightness) value = samsung->kbd_led.max_brightness; - else if (value < 0) - value = 0; samsung->kbd_led_wk = value; queue_work(samsung->led_workqueue, &samsung->kbd_led_work); diff --git a/drivers/reset/tegra/reset-bpmp.c b/drivers/reset/tegra/reset-bpmp.c index 24d3395964cc4ba2d3934a32299fef3f667cd45f..4c5bba52b10593890c9a95ccea929148db9cdc5f 100644 --- a/drivers/reset/tegra/reset-bpmp.c +++ b/drivers/reset/tegra/reset-bpmp.c @@ -20,6 +20,7 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, struct tegra_bpmp *bpmp = to_tegra_bpmp(rstc); struct mrq_reset_request request; struct tegra_bpmp_message msg; + int err; memset(&request, 0, sizeof(request)); request.cmd = command; @@ -30,7 +31,13 @@ static int tegra_bpmp_reset_common(struct reset_controller_dev *rstc, msg.tx.data = &request; msg.tx.size = sizeof(request); - return tegra_bpmp_transfer(bpmp, &msg); + err = tegra_bpmp_transfer(bpmp, &msg); + if (err) + return err; + if (msg.rx.ret) + return -EINVAL; + + return 0; } static int tegra_bpmp_reset_module(struct reset_controller_dev *rstc, diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index ba9a22e55e32fdbe9f74988b2b581739578cf9ca..8003b3519b95ba04a499aeb954969d3c79952d80 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -828,6 +828,37 @@ static int qedi_task_xmit(struct iscsi_task *task) return qedi_iscsi_send_ioreq(task); } +static void qedi_offload_work(struct work_struct *work) +{ + struct qedi_endpoint *qedi_ep = + container_of(work, struct qedi_endpoint, offload_work); + struct qedi_ctx *qedi; + int wait_delay = 5 * HZ; + int ret; + + qedi = qedi_ep->qedi; + + ret = qedi_iscsi_offload_conn(qedi_ep); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", + qedi_ep->iscsi_cid, qedi_ep, ret); + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + return; + } + + ret = 
wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, + (qedi_ep->state == + EP_STATE_OFLDCONN_COMPL), + wait_delay); + if (ret <= 0 || qedi_ep->state != EP_STATE_OFLDCONN_COMPL) { + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + QEDI_ERR(&qedi->dbg_ctx, + "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", + qedi_ep->iscsi_cid, qedi_ep); + } +} + static struct iscsi_endpoint * qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, int non_blocking) @@ -876,6 +907,7 @@ qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, } qedi_ep = ep->dd_data; memset(qedi_ep, 0, sizeof(struct qedi_endpoint)); + INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); qedi_ep->state = EP_STATE_IDLE; qedi_ep->iscsi_cid = (u32)-1; qedi_ep->qedi = qedi; @@ -1026,12 +1058,11 @@ static void qedi_ep_disconnect(struct iscsi_endpoint *ep) qedi_ep = ep->dd_data; qedi = qedi_ep->qedi; + flush_work(&qedi_ep->offload_work); + if (qedi_ep->state == EP_STATE_OFLDCONN_START) goto ep_exit_recover; - if (qedi_ep->state != EP_STATE_OFLDCONN_NONE) - flush_work(&qedi_ep->offload_work); - if (qedi_ep->conn) { qedi_conn = qedi_ep->conn; conn = qedi_conn->cls_conn->dd_data; @@ -1196,37 +1227,6 @@ static int qedi_data_avail(struct qedi_ctx *qedi, u16 vlanid) return rc; } -static void qedi_offload_work(struct work_struct *work) -{ - struct qedi_endpoint *qedi_ep = - container_of(work, struct qedi_endpoint, offload_work); - struct qedi_ctx *qedi; - int wait_delay = 5 * HZ; - int ret; - - qedi = qedi_ep->qedi; - - ret = qedi_iscsi_offload_conn(qedi_ep); - if (ret) { - QEDI_ERR(&qedi->dbg_ctx, - "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", - qedi_ep->iscsi_cid, qedi_ep, ret); - qedi_ep->state = EP_STATE_OFLDCONN_FAILED; - return; - } - - ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, - (qedi_ep->state == - EP_STATE_OFLDCONN_COMPL), - wait_delay); - if ((ret <= 0) || (qedi_ep->state != EP_STATE_OFLDCONN_COMPL)) { - qedi_ep->state = EP_STATE_OFLDCONN_FAILED; - 
QEDI_ERR(&qedi->dbg_ctx, - "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", - qedi_ep->iscsi_cid, qedi_ep); - } -} - static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) { struct qedi_ctx *qedi; @@ -1342,7 +1342,6 @@ static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) qedi_ep->dst_addr, qedi_ep->dst_port); } - INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); queue_work(qedi->offload_thread, &qedi_ep->offload_work); ret = 0; diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 1e63fd4821f9643b3b35c0e403a863c1693e60e7..8aa89d93db118ffc45452108348b455b97d62d0a 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -277,6 +277,9 @@ static int atmel_qspi_find_mode(const struct spi_mem_op *op) static bool atmel_qspi_supports_op(struct spi_mem *mem, const struct spi_mem_op *op) { + if (!spi_mem_default_supports_op(mem, op)) + return false; + if (atmel_qspi_find_mode(op) < 0) return false; diff --git a/drivers/spi/spi-mtk-nor.c b/drivers/spi/spi-mtk-nor.c index 288f6c2bbd573073347f5f290a498ae29125ce31..106e3cacba4c3cd8321dc2afc8faaf999ea6e7dd 100644 --- a/drivers/spi/spi-mtk-nor.c +++ b/drivers/spi/spi-mtk-nor.c @@ -895,7 +895,17 @@ static int __maybe_unused mtk_nor_suspend(struct device *dev) static int __maybe_unused mtk_nor_resume(struct device *dev) { - return pm_runtime_force_resume(dev); + struct spi_controller *ctlr = dev_get_drvdata(dev); + struct mtk_nor *sp = spi_controller_get_devdata(ctlr); + int ret; + + ret = pm_runtime_force_resume(dev); + if (ret) + return ret; + + mtk_nor_init(sp); + + return 0; } static const struct dev_pm_ops mtk_nor_pm_ops = { diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index e1fe03ceb7f13cf577e608c5e9664c1bcda3368a..e6d4a3ee6cda5fada4c91939238f8d6a502b74a5 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -114,6 +114,9 @@ static void 
*ion_buffer_kmap_get(struct ion_buffer *buffer) void *vaddr; if (buffer->kmap_cnt) { + if (buffer->kmap_cnt == INT_MAX) + return ERR_PTR(-EOVERFLOW); + buffer->kmap_cnt++; return buffer->vaddr; } diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index f102519ccefb43be0fb7806e7d0c716c51d1fdb1..b4260a830e78a884ca72b04d8b2e089020ed3f2b 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2510,6 +2510,11 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font, if (charcount != 256 && charcount != 512) return -EINVAL; + /* font bigger than screen resolution ? */ + if (w > FBCON_SWAP(info->var.rotate, info->var.xres, info->var.yres) || + h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres)) + return -EINVAL; + /* Make sure drawing engine can handle the font */ if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || !(info->pixmap.blit_y & (1 << (font->height - 1)))) @@ -2771,6 +2776,34 @@ void fbcon_update_vcs(struct fb_info *info, bool all) } EXPORT_SYMBOL(fbcon_update_vcs); +/* let fbcon check if it supports a new screen resolution */ +int fbcon_modechange_possible(struct fb_info *info, struct fb_var_screeninfo *var) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + unsigned int i; + + WARN_CONSOLE_UNLOCKED(); + + if (!ops) + return 0; + + /* prevent setting a screen size which is smaller than font size */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (!vc || vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[i]] != info) + continue; + + if (vc->vc_font.width > FBCON_SWAP(var->rotate, var->xres, var->yres) || + vc->vc_font.height > FBCON_SWAP(var->rotate, var->yres, var->xres)) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fbcon_modechange_possible); + int fbcon_mode_deleted(struct fb_info *info, struct fb_videomode *mode) { diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 
00939ca2065a947c7c741191f99120d4f74a916f..3b3ccb2355220cc7bf4d176154dbe2a8f4207fc4 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1019,6 +1019,16 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) if (ret) return ret; + /* verify that virtual resolution >= physical resolution */ + if (var->xres_virtual < var->xres || + var->yres_virtual < var->yres) { + pr_warn("WARNING: fbcon: Driver '%s' missed to adjust virtual screen size (%ux%u vs. %ux%u)\n", + info->fix.id, + var->xres_virtual, var->yres_virtual, + var->xres, var->yres); + return -EINVAL; + } + if ((var->activate & FB_ACTIVATE_MASK) != FB_ACTIVATE_NOW) return 0; @@ -1109,7 +1119,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EFAULT; console_lock(); lock_fb_info(info); - ret = fb_set_var(info, &var); + ret = fbcon_modechange_possible(info, &var); + if (!ret) + ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); unlock_fb_info(info); diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 3065dd670a18289df900c07c93f513c4d9c8f598..1bd4c8d89ed41d93baa224361cbf8a02fb47e7bc 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -344,6 +344,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) wdat->period = tbl->timer_period; wdat->wdd.min_hw_heartbeat_ms = wdat->period * tbl->min_count; wdat->wdd.max_hw_heartbeat_ms = wdat->period * tbl->max_count; + wdat->wdd.min_timeout = 1; wdat->stopped_in_sleep = tbl->flags & ACPI_WDAT_STOPPED; wdat->wdd.info = &wdat_wdt_info; wdat->wdd.ops = &wdat_wdt_ops; @@ -450,8 +451,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) * watchdog properly after it has opened the device. In some cases * the BIOS default is too short and causes immediate reboot. 
*/ - if (timeout * 1000 < wdat->wdd.min_hw_heartbeat_ms || - timeout * 1000 > wdat->wdd.max_hw_heartbeat_ms) { + if (watchdog_timeout_invalid(&wdat->wdd, timeout)) { dev_warn(dev, "Invalid timeout %d given, using %d\n", timeout, WDAT_DEFAULT_TIMEOUT); timeout = WDAT_DEFAULT_TIMEOUT; @@ -462,6 +462,8 @@ static int wdat_wdt_probe(struct platform_device *pdev) return ret; watchdog_set_nowayout(&wdat->wdd, nowayout); + watchdog_stop_on_reboot(&wdat->wdd); + watchdog_stop_on_unregister(&wdat->wdd); return devm_watchdog_register_device(dev, &wdat->wdd); } diff --git a/fs/Kconfig b/fs/Kconfig index 6e723c90a506c2e72189b0c7b38cd6226f3c666e..aa097ca64ef6ab9d61413dea9faea5cbc3d9d89a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -237,19 +237,27 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS -config HUGETLB_PAGE_FREE_VMEMMAP +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. +# +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + bool + +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 || ARM64 + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. 
config DYNAMIC_HUGETLB diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index aa5a4d759ca236a10df969bed3f0080cb48e582e..370188b2a55d2cf74df1870e1519434f6bd96113 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -898,7 +898,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 277f89d5de038ddb29d08167fa4159f86f714da1..df43cda8fc2fa8ce57f3b054817ca8de3241b66d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2908,7 +2908,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); -extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4323186bae78b3c96399a2b23fec0e20e451c963..9d06695c04ab99bd69259e9de1dbca9a78afa90c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4510,9 +4510,9 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, return ret > 0 ? 
ret2 : ret; } -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) @@ -4583,6 +4583,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4707,17 +4711,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto exit; if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(inode, offset, len); + ret = ext4_punch_hole(file, offset, len); goto exit; } if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(inode, offset, len); + ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(inode, offset, len); + ret = ext4_insert_range(file, offset, len); goto exit; } @@ -4753,6 +4757,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out; + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; @@ -5255,8 +5263,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. 
*/ -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; handle_t *handle; @@ -5307,6 +5316,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. @@ -5401,8 +5414,9 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * by len bytes. * Returns 0 on success, error otherwise. */ -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; handle_t *handle; struct ext4_ext_path *path; @@ -5458,6 +5472,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e85c238edd8545fbac91d1c49d583bf84a75a240..f4e0c7cc48200ef6873de14f62f84bb10c046e2d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3972,12 +3972,14 @@ int ext4_break_layouts(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset; + loff_t first_block_offset, last_block_offset, max_length; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; @@ -4011,6 +4013,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) offset; } + /* + * For punch hole the length + offset needs to be within one block + * before last range. Adjust the length if it goes beyond that limit. + */ + max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; + if (offset + length > max_length) + length = max_length - offset; + if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* @@ -4026,6 +4036,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 26c9383608950d5d19c1e8193f0d3af7a27eab3f..706a159817eebe1d065732663dcff5fe2d3c7d65 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3912,9 +3912,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + @@ -4975,9 +4977,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Get the # of file system overhead blocks from the * superblock if present. */ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. 
+ */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index dc55b029afaa4c3494b6e09770b53f7bbf97bde5..c5bde789a16dbf3df01f8d3bae8d1eacaea006b7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -906,15 +906,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -928,6 +928,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6f2943465bff1debc5c1fc9fcc3360218a1afbcb..8a87d1b433875e437a36492f3cf31cc940af1c0e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -252,7 +252,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -272,7 +272,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); 
info.align_offset = 0; @@ -291,7 +291,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags); @@ -309,6 +309,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -328,7 +329,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3ec494f5d7ee73c35ccf3cfbb8fd048d4432d8fc..d5246e277f179fd8b309ba515b1d135f1006411d 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -771,10 +771,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. 
*/ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -791,25 +787,24 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, + status = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made iomap_write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; length -= copied; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index f0d6b54be412e5acbdc282faa39ddfa32984e658..765b50aeadd28b9718ec7d60bbdb2bbc2ede6e37 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,16 +83,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { - /* - * IN_ALL_EVENTS represents all of the mask bits - * that we expose to userspace. There is at - * least one bit (FS_EVENT_ON_CHILD) which is - * used only internally to the kernel. 
- */ - u32 mask = mark->mask & IN_ALL_EVENTS; - seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", + seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mask, mark->ignored_mask); + inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 2007e371191600690306c0dbfa572cde6fcf2854..8f00151eb731f9dfd11acf99f5a7cf610ccf7b0a 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -22,6 +22,18 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct inotify_event_info, fse); } +/* + * INOTIFY_USER_MASK represents all of the mask bits that we expose to + * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ +#define INOTIFY_USER_MASK (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK) + +static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark) +{ + return fsn_mark->mask & INOTIFY_USER_MASK; +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5f6c6bf65909cd7daf91f7b0d07d1286e84e8520..3986f18774571aadd57f459b200de6b2c0c60b25 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -46,22 +46,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include +static long it_zero = 0; +static long it_int_max = INT_MAX; + struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 =
SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_queued_events", @@ -88,7 +93,7 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) mask |= FS_EVENT_ON_CHILD; /* mask off the flags used to open the fd */ - mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + mask |= (arg & INOTIFY_USER_MASK); return mask; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ffed75f833b7b362a1e5266f94d76c989b467b98..df435cd91a5bd50e492dc09962f8b2d3950db713 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1380,6 +1381,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl); +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just lose the + * ability to query or modify the sysctls dynamically at run time.
Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init(). + */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; diff --git a/fs/stat.c b/fs/stat.c index 1196af4d1ea03dfec6d003ed3233be4c21080805..04550c0ba5407307cbaaeffac0bef4fdc9f4cbfc 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -306,9 +306,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -317,7 +314,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -325,7 +324,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if 
(sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -335,7 +334,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -616,11 +615,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -630,7 +631,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4fe121b6963888de66408f972c86c..1dc22cf29c6549c0aeb14833c32836d8432108c1 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } +/** + * ubifs_encrypt - Encrypt data. 
+ * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of data to be compressed + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypts possibly-compressed data in the data node. + * The encrypted data length is stored in @out_len. + */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f5777f59a1012510626685aa72225100c73b62c7..acd7e83a35e4021afd109632d1abb53833068c7d 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is an xattr inode * * This function finds an unused inode number, allocates new inode and * initializes it. Returns new inode in case of success and an error code in * case of failure.
*/ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; - err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } } switch (mode & S_IFMT) { @@ -308,7 +311,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -369,7 +372,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -461,7 +464,7 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, return err; } - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -1002,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1089,7 +1092,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if 
(IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1171,7 +1174,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 72586512e51f1a05029efde00a61a78508d22be0..ee9888087983799ca84d639a5155f4892141c78f 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1472,23 +1472,25 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. */ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in } if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out; - out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1549,7 +1551,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, 
const struct inode *inode, struct ubifs_ino_node *ino; struct ubifs_trun_node *trun; struct ubifs_data_node *dn; - int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int err, dlen, len, lnum, offs, bit, sz, dn_size, sync = IS_SYNC(inode); struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1564,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); - sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; + + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE; - sz += ubifs_auth_node_sz(c); + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c); ino = kmalloc(sz, GFP_NOFS); if (!ino) @@ -1596,15 +1601,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; } if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 9a4a3191ed07ca0b1ca2f53da66dd37d4e260271..15cfee7c1125e835e69eb13ae1bc3162b290f852 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1996,7 +1996,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode 
*dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 258b97939ba2952f85237cee2ded92e9dc606f6b..81efe638acd55dd3d66691a4f19a227fe6f4e968 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; diff --git a/include/linux/edac.h b/include/linux/edac.h index 15e8f3d8a895e01130929fbe37aaa9bab8c1aa3a..4901344950080ce5e034fbaf942068e63433851c 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -179,7 +179,9 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_RDDR4: Registered DDR4 RAM * This is a variant of the DDR4 memories. * @MEM_LRDDR4: Load-Reduced DDR4 memory. + * @MEM_DDR5: Unbuffered DDR5 RAM * @MEM_NVDIMM: Non-volatile RAM + * @MEM_HBM2: High bandwidth Memory Gen 2. 
*/ enum mem_type { MEM_EMPTY = 0, @@ -203,7 +205,9 @@ enum mem_type { MEM_DDR4, MEM_RDDR4, MEM_LRDDR4, + MEM_DDR5, MEM_NVDIMM, + MEM_HBM2, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -226,7 +230,9 @@ enum mem_type { #define MEM_FLAG_DDR4 BIT(MEM_DDR4) #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) +#define MEM_FLAG_DDR5 BIT(MEM_DDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) +#define MEM_FLAG_HBM2 BIT(MEM_HBM2) /** * enum edac-type - Error Detection and Correction capabilities and mode diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2e5debc0373c54c1a41e8c864357bd45cffb51c6..99209f50915f4d4aa64f092bdf2772a5c8868d3c 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -127,7 +127,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr) #endif } -static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN @@ -352,8 +352,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. 
*/ -static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], - const u8 addr2[6+2]) +static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h index ff5596dd30f85575dc7f6dfcbd2212022e0ea25e..2382dec6d6ab8e0276e8e87489300fa33741dbf5 100644 --- a/include/linux/fbcon.h +++ b/include/linux/fbcon.h @@ -15,6 +15,8 @@ void fbcon_new_modelist(struct fb_info *info); void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps); void fbcon_fb_blanked(struct fb_info *info, int blank); +int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var); void fbcon_update_vcs(struct fb_info *info, bool all); void fbcon_remap_all(struct fb_info *info); int fbcon_set_con2fb_map_ioctl(void __user *argp); @@ -33,6 +35,8 @@ static inline void fbcon_new_modelist(struct fb_info *info) {} static inline void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps) {} static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {} +static inline int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var) { return 0; } static inline void fbcon_update_vcs(struct fb_info *info, bool all) {} static inline void fbcon_remap_all(struct fb_info *info) {} static inline int fbcon_set_con2fb_map_ioctl(void __user *argp) { return 0; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 634630ebc8a7e7d42462b9dab3aa5a61c0ce2185..0dfe084390954a30044184870cd5b85b6e1a8fb1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -592,8 +592,8 @@ struct hstate { unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - unsigned int 
nr_free_vmemmap_pages; +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ @@ -1103,12 +1103,6 @@ static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, } #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; -#else -#define hugetlb_free_vmemmap_enabled false -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7e1dce5670fc7971477a626651f75244ce0cc02d..d01c23025af02d1bc4d1f9af24e7c272eaa05531 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -388,6 +388,21 @@ struct static_key_false { [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ } +#define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) +#define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) +#define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) + +#define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) +#define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) +#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) + +#define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) +#define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) +#define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) + extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ @@ -488,6 +503,10 @@ extern bool ____wrong_branch_error(void); #endif /* CONFIG_JUMP_LABEL */ +#define static_branch_maybe(config, x) \ + (IS_ENABLED(config) ? 
static_branch_likely(x) \ + : static_branch_unlikely(x)) + /* * Advanced usage; refcount, branch is enabled when: count != 0 */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 25891b581bf4e2d0352d5754bef0071053f7b4c3..1ae73cc4b80605bd1055dd7cf1f42fa436c728ff 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,10 +3104,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, gfp_t gfp_mask); +#endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 65e1cbe1d1ce66eaf05c6de381ca496bb3a18085..26b36cac9307302f24ebda043da4b666242b1be5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,25 +194,94 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -struct page; /* forward declaration */ +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, + hugetlb_optimize_vmemmap_key); -static inline struct page *compound_head(struct page *page) +static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) +{ + return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, + &hugetlb_optimize_vmemmap_key); +} + +/* + * If the feature of optimizing vmemmap pages associated with each HugeTLB + * page is enabled, the head vmemmap page frame is reused and all of the tail + * vmemmap addresses map to the head vmemmap page frame (furture details can + * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other + * words, there are more than one page struct with PG_head associated with each + * HugeTLB page. 
We __know__ that there is only one head page struct, the tail + * page structs with PG_head are fake head page structs. We need an approach + * to distinguish between those two different types of page structs so that + * compound_head() can return the real head page struct when the parameter is + * the tail page struct but with PG_head. + * + * The page_fixed_fake_head() returns the real head page struct if the @page is + * fake page head, otherwise, returns @page which can either be a true page + * head or tail. + */ +static __always_inline const struct page *page_fixed_fake_head(const struct page *page) +{ + if (!hugetlb_optimize_vmemmap_enabled()) + return page; + + /* + * Only addresses aligned with PAGE_SIZE of struct page may be fake head + * struct page. The alignment check aims to avoid access the fields ( + * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * cold cacheline in some cases. + */ + if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && + test_bit(PG_head, &page->flags)) { + /* + * We can safely access the field of the @page[1] with PG_head + * because the @page is a compound page composed with at least + * two contiguous pages. 
+ */ + unsigned long head = READ_ONCE(page[1].compound_head); + + if (likely(head & 1)) + return (const struct page *)(head - 1); + } + return page; +} +#else +static inline const struct page *page_fixed_fake_head(const struct page *page) +{ + return page; +} + +static inline bool hugetlb_optimize_vmemmap_enabled(void) +{ + return false; +} +#endif + +static __always_inline int page_is_fake_head(struct page *page) +{ + return page_fixed_fake_head(page) != page; +} + +static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) - return (struct page *) (head - 1); - return page; + return head - 1; + return (unsigned long)page_fixed_fake_head(page); } +#define compound_head(page) ((typeof(page))_compound_head(page)) + static __always_inline int PageTail(struct page *page) { - return READ_ONCE(page->compound_head) & 1; + return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(struct page *page) { - return test_bit(PG_head, &page->flags) || PageTail(page); + return test_bit(PG_head, &page->flags) || + READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l @@ -600,7 +669,15 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); } -__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +static __always_inline int PageHead(struct page *page) +{ + PF_POISONED_CHECK(page); + return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); +} + +__SETPAGEFLAG(Head, head, PF_ANY) +__CLEARPAGEFLAG(Head, head, PF_ANY) +CLEARPAGEFLAG(Head, head, PF_ANY) static __always_inline void set_compound_head(struct page *page, struct page *head) { diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index dc1f4dcd9a82531dd39df5833f25dad5a3919625..e3e5e149b00e6b0c99418096c38613771ecdab28 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h 
@@ -106,6 +106,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 68efccc15a879860ea158ab633fe4ed0e37c0916..72cfb047fd2c012f4353200af140ad908908c7d6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2251,6 +2251,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_assert_len(struct sk_buff *skb) +{ +#ifdef CONFIG_DEBUG_NET + if (WARN_ONCE(!skb->len, "%s\n", __func__)) + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); +#endif /* CONFIG_DEBUG_NET */ +} + /* * Add data to an sk_buff */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 0d429a102d417955192d610b37648966642f0b30..708fbeb21dd397c8a3511d1368f389cc5124d280 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -332,6 +332,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 51298a4f4623573a6628718193331fb4f31d5469..161eba9fd9122c52698b6a479d7c721a4f19f280 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -195,6 +195,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct 
ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) void do_sysctl_args(void); extern int pwrsw_enabled; diff --git a/include/net/esp.h b/include/net/esp.h index 90cd02ff77ef67f7f65e2c53127c4510c23bd4a9..9c5637d41d95168052686caf7b3ff51b517e6b9b 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,8 +4,6 @@ #include -#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index b2a28201f4fdea03abae09651237b644eb12b57d..31d0cf3c73771053799fe8a03a91925fb99ac78f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -79,8 +79,8 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + KABI_REPLACE(unsigned int ip6_rt_gc_expire, atomic_t ip6_rt_gc_expire) + unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index 4bd9dd6c3b72cc287ca4ec819e4e5b5ee8f0fabc..68dc8a8e7990a97c675b547f593056630e46b59d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6174,7 +6174,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 228801e2078869e5d0fbe4618deca7b1dc88d9a8..aa23ffdaf819fb08fd8f1b69701215a697fd3250 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + 
return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ef91ae75ca56f1c991047306edf57c499200b2d5..4032cd47500013ccad44618e5a5cefb9e8657052 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -856,11 +856,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50d457979db61fa44f73f64c2a082f30fbeab8ee..09f002f1fe5de2378976b5994e175d24e40a1232 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3771,11 +3771,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d0309de2f84fea5bfce5cbe4a8321be92c547057..3c6229f16e81d179658305ae551916ab6a724955 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1219,7 +1219,12 @@ static void stacktrace_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *event) { - trace_dump_stack(STACK_SKIP); + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + 
trace_dump_stack(STACK_SKIP); } static void diff --git a/kernel/ucount.c b/kernel/ucount.c index dff1d9b739d2645015e06417f4f425e786d04e2d..1f5825b674d8eacd07c03df02a5c44021f65b83a 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -52,14 +52,17 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; -#define UCOUNT_ENTRY(name) \ - { \ - .procname = name, \ - .maxlen = sizeof(int), \ - .mode = 0644, \ - .proc_handler = proc_dointvec_minmax, \ - .extra1 = SYSCTL_ZERO, \ - .extra2 = SYSCTL_INT_MAX, \ +static long ue_zero = 0; +static long ue_int_max = INT_MAX; + +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(long), \ + .mode = 0644, \ + .proc_handler = proc_doulongvec_minmax, \ + .extra1 = &ue_zero, \ + .extra2 = &ue_int_max, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), diff --git a/mm/Makefile b/mm/Makefile index d2a6a786f9153bdc5e9ddeab66ba5198e41503cc..e83233177c7af2e8105c223a985cd293e18dbc04 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o -obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_DYNAMIC_HUGETLB) += dynamic_hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index eb9b528b73de1eae9504e66d252e6bde3fbd5500..c1b968a7e668bc302b25ac48d38132ebe6cae971 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1133,17 +1133,6 @@ void __init dynamic_hugetlb_init(void) if (!enable_dhugetlb) return; - /* - * The dynamic_hugetlb feature need to split and merge pages frequently. - * hugetlb_vmemmap will affects the perforemance of page split and merge. - * If want to use dynamic hugetlb, please disable hugetlb_vmemmap. 
- */ - if (hugetlb_free_vmemmap_enabled) { - enable_dhugetlb = false; - pr_info("Please set hugetlb_free_vmemmap=off if want to enable dynamic hugetlb\n"); - return; - } - count = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT); size = sizeof(struct dhugetlb_pagelist) + count * sizeof(struct dhugetlb_pool *); dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL); @@ -1161,6 +1150,6 @@ static int __init dynamic_hugetlb_setup(char *s) { if (!strcmp(s, "on")) enable_dhugetlb = true; - return 1; + return 0; } -__setup("dynamic_hugetlb=", dynamic_hugetlb_setup); +early_param("dynamic_hugetlb", dynamic_hugetlb_setup); diff --git a/mm/filemap.c b/mm/filemap.c index edb94663c5df0da41492f6a0f7061d3b8bdd4bfa..a00fc493f5cf1fc1c52126325bb708658494db83 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3518,10 +3518,6 @@ ssize_t generic_perform_write(struct file *file, * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -3548,24 +3544,22 @@ ssize_t generic_perform_write(struct file *file, page, fsdata); if (unlikely(status < 0)) break; - copied = status; cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. 
*/ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } + copied = status; + iov_iter_advance(i, copied); pos += copied; written += copied; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bfe079e294cb7cd24764f9782c4fa49d41958050..79c855b5adada38b20008bd739f272259b8a25f3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -401,7 +401,7 @@ static int __init hugepage_init(void) */ if (enable_dhugetlb) { transparent_hugepage_flags = 0; - pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n"); + pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n"); return -EINVAL; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c815386ecc59a9a962acfaa9b0547bf4cb98b3..c5168c7f282af0e6890cde514a4aa9e6a0c46d17 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1448,7 +1448,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (alloc_huge_page_vmemmap(h, page)) { + if (hugetlb_vmemmap_alloc(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1519,7 +1519,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (free_vmemmap_pages_per_hpage(h)) + if (hugetlb_optimize_vmemmap_pages(h)) flush_work(&free_hpage_work); } @@ -1642,7 +1642,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - free_huge_page_vmemmap(h, page); + hugetlb_vmemmap_free(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -2066,7 +2066,7 @@ int dissolve_free_huge_page(struct page *page) * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. 
*/ - rc = alloc_huge_page_vmemmap(h, head); + rc = hugetlb_vmemmap_alloc(h, head); if (!rc) { /* * Move PageHWPoison flag from head page to the raw diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5bb3376d387b219f3ea0d81cb648d..7ec8560d267d7f205ea16f112c504b30da727233 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. 
* @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,67 +166,86 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. 
So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt +#include #include "hugetlb_vmemmap.h" /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. */ -#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +enum vmemmap_optimize_mode { + VMEMMAP_OPTIMIZE_OFF, + VMEMMAP_OPTIMIZE_ON, +}; -static int __init early_hugetlb_free_vmemmap_param(char *buf) -{ - /* We cannot optimize if a "struct page" crosses page boundaries. 
*/ - if ((!is_power_of_2(sizeof(struct page)))) { - pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); - return 0; - } +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, + hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); - if (!buf) - return -EINVAL; +static enum vmemmap_optimize_mode vmemmap_optimize_mode = + IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); - if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; - else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; - else - return -EINVAL; +static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +{ + if (vmemmap_optimize_mode == to) + return; - return 0; + if (to == VMEMMAP_OPTIMIZE_OFF) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else + static_branch_inc(&hugetlb_optimize_vmemmap_key); + WRITE_ONCE(vmemmap_optimize_mode, to); } -early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); -static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) +static int __init hugetlb_vmemmap_early_param(char *buf) { - return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; + bool enable; + enum vmemmap_optimize_mode mode; + + if (kstrtobool(buf, &enable)) + return -EINVAL; + + mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; + vmemmap_optimize_mode_switch(mode); + + return 0; } +early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* * Previously discarded vmemmap pages will be allocated and remapping * after this function returns zero. 
*/ -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { int ret; unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* * The pages which the vmemmap virtual address range [@vmemmap_addr, * @vmemmap_end) are mapped to are freed to the buddy allocator, and @@ -236,31 +255,40 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - - if (!ret) + if (!ret) { ClearHPageVmemmapOptimized(head); + static_branch_dec(&hugetlb_optimize_vmemmap_key); + } return ret; } -void free_huge_page_vmemmap(struct hstate *h, struct page *head) +void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + if (!vmemmap_pages) + return; - if (!free_vmemmap_pages_per_hpage(h)) + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) return; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + static_branch_inc(&hugetlb_optimize_vmemmap_key); + + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - 
PAGE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to, then free the pages * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. */ - if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else SetHPageVmemmapOptimized(head); } @@ -271,28 +299,87 @@ void __init hugetlb_vmemmap_init(struct hstate *h) /* * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. */ BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled) + if (enable_dhugetlb) { + pr_warn_once("cannot optimize vmemmap pages due to conflict with dynamic hugetlb\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + + if (!is_power_of_2(sizeof(struct page))) { + pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); return; + } vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* - * The head page and the first tail page are not to be freed to buddy - * allocator, the other pages will map to the first tail page, so they - * can be freed. + * The head page is not to be freed to buddy allocator, the other tail + * pages will map to the head page, so they can be freed. * * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true * on some architectures (e.g. aarch64). See Documentation/arm64/ * hugetlbpage.rst for more details. 
*/ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + + pr_info("can optimize %d vmemmap pages for %s\n", + h->optimize_vmemmap_pages, h->name); +} + +#ifdef CONFIG_PROC_SYSCTL +static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + enum vmemmap_optimize_mode mode; + static DEFINE_MUTEX(sysctl_mutex); - pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, - h->name); + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sysctl_mutex); + mode = vmemmap_optimize_mode; + table->data = &mode; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (write && !ret) + vmemmap_optimize_mode_switch(mode); + mutex_unlock(&sysctl_mutex); + + return ret; +} + +static struct ctl_table hugetlb_vmemmap_sysctls[] = { + { + .procname = "hugetlb_optimize_vmemmap", + .maxlen = sizeof(enum vmemmap_optimize_mode), + .mode = 0644, + .proc_handler = hugetlb_optimize_vmemmap_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int hugetlb_vmemmap_sysctls_init(void) +{ + /* + * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" + * crosses page boundaries, the vmemmap pages cannot be optimized. + * If "dynamic hugetlb" is enabled, the vmemmap pages cannot be + * optimized. 
+ */ + if (is_power_of_2(sizeof(struct page)) && !enable_dhugetlb) + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + + return 0; } +late_initcall(hugetlb_vmemmap_sysctls_init); +#endif /* CONFIG_PROC_SYSCTL */ diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index cb2bef8f9e736a531fc6da1d9a4e9492346f6eef..109b0a53b6fe9b6e79694ab7830afa0cd2e03343 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -10,26 +10,26 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); -void free_huge_page_vmemmap(struct hstate *h, struct page *head); +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); +void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); /* - * How many vmemmap pages associated with a HugeTLB page that can be freed - * to the buddy allocator. + * How many vmemmap pages associated with a HugeTLB page that can be + * optimized and freed to the buddy allocator. 
*/ -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { - return h->nr_free_vmemmap_pages; + return h->optimize_vmemmap_pages; } #else -static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { return 0; } -static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { } @@ -37,9 +37,9 @@ static inline void hugetlb_vmemmap_init(struct hstate *h) { } -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bfa6d1478a7588133d4eeaad749062859ce273e8..97a00a8e6f79e277bc383e96d1281417d7c3a6da 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -554,6 +555,150 @@ void collect_procs(struct page *page, struct list_head *tokill, } EXPORT_SYMBOL_GPL(collect_procs); +struct hwp_walk { + struct to_kill tk; + unsigned long pfn; + int flags; +}; + +static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) +{ + tk->addr = addr; + tk->size_shift = shift; +} + +static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, + unsigned long poisoned_pfn, struct to_kill *tk) +{ + unsigned long pfn = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + } else { + swp_entry_t swp = pte_to_swp_entry(pte); + + if (is_hwpoison_entry(swp)) + pfn = hwpoison_entry_to_pfn(swp); + } + + if (!pfn || pfn != poisoned_pfn) + return 0; + + set_to_kill(tk, addr, 
shift); + return 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + pmd_t pmd = *pmdp; + unsigned long pfn; + unsigned long hwpoison_vaddr; + + if (!pmd_present(pmd)) + return 0; + pfn = pmd_pfn(pmd); + if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { + hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); + set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); + return 1; + } + return 0; +} +#else +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + return 0; +} +#endif + +static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + int ret = 0; + pte_t *ptep; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmdp, walk->vma); + if (ptl) { + ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmdp)) + goto out; + + ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + hwp->pfn, &hwp->tk); + if (ret == 1) + break; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(walk->vma); + + return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); +} +#else +#define hwpoison_hugetlb_range NULL +#endif + +static struct mm_walk_ops hwp_walk_ops = { + .pmd_entry = hwpoison_pte_range, + .hugetlb_entry = hwpoison_hugetlb_range, +}; + +/* + * Sends SIGBUS to the current process with error info. 
+ * + * This function is intended to handle "Action Required" MCEs on already + * hardware poisoned pages. They could happen, for example, when + * memory_failure() failed to unmap the error page at the first call, or + * when multiple local machine checks happened on different CPUs. + * + * MCE handler currently has no easy access to the error virtual address, + * so this function walks page table to find it. The returned virtual address + * is proper in most cases, but it could be wrong when the application + * process has multiple entries mapping the error page. + */ +static int kill_accessing_process(struct task_struct *p, unsigned long pfn, + int flags) +{ + int ret; + struct hwp_walk priv = { + .pfn = pfn, + }; + priv.tk.tsk = p; + + mmap_read_lock(p->mm); + ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, + (void *)&priv); + if (ret == 1 && priv.tk.addr) + kill_proc(&priv.tk, pfn, flags); + else + ret = 0; + mmap_read_unlock(p->mm); + return ret > 0 ? -EHWPOISON : -EFAULT; +} + static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -658,6 +803,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, */ static int me_kernel(struct page *p, unsigned long pfn) { + unlock_page(p); return MF_IGNORED; } @@ -667,6 +813,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + unlock_page(p); return MF_FAILED; } @@ -675,6 +822,7 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { + int ret; struct address_space *mapping; delete_from_lru_cache(p); @@ -683,8 +831,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * For anonymous pages we're done the only reference left * should be the one m_f() holds. 
*/ - if (PageAnon(p)) - return MF_RECOVERED; + if (PageAnon(p)) { + ret = MF_RECOVERED; + goto out; + } /* * Now truncate the page in the page cache. This is really @@ -698,7 +848,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return MF_FAILED; + ret = MF_FAILED; + goto out; } /* @@ -706,7 +857,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. */ - return truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, pfn, mapping); +out: + unlock_page(p); + return ret; } /* @@ -782,24 +936,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { + int ret; + ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!delete_from_lru_cache(p)) - return MF_DELAYED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + unlock_page(p); + return ret; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { + int ret; + delete_from_swap_cache(p); - if (!delete_from_lru_cache(p)) - return MF_RECOVERED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + unlock_page(p); + return ret; } /* @@ -820,6 +976,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, pfn, mapping); + unlock_page(hpage); } else { res = MF_FAILED; unlock_page(hpage); @@ -834,7 +991,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) page_ref_inc(p); res = MF_RECOVERED; } - lock_page(hpage); } return res; @@ -866,6 +1022,8 @@ static struct page_state { unsigned long mask; unsigned long res; enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. 
*/ int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, @@ -929,6 +1087,7 @@ static int page_action(struct page_state *ps, struct page *p, int result; int count; + /* page p should be unlocked after returning from ps->action(). */ result = ps->action(p, pfn); count = page_count(p) - 1; @@ -1190,7 +1349,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return -EHWPOISON; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, page_to_pfn(head), flags); + return res; } num_poisoned_pages_inc(); @@ -1246,7 +1408,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - res = identify_page_state(pfn, p, page_flags); + return identify_page_state(pfn, p, page_flags); out: unlock_page(head); return res; @@ -1402,6 +1564,8 @@ int memory_failure(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, pfn, flags); goto unlock_mutex; } @@ -1536,6 +1700,8 @@ int memory_failure(unsigned long pfn, int flags) identify_page_state: res = identify_page_state(pfn, p, page_flags); + mutex_unlock(&mf_mutex); + return res; unlock_page: unlock_page(p); unlock_mutex: diff --git a/mm/mmap.c b/mm/mmap.c index 5ad32537604a5bf17943856ef0e72d4190d4d8a8..5489d70db84e35018de8b7c0f8cf22d1c3bea459 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2404,14 +2404,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. 
* diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 07f42a7a60657adf069483c6981c642d67ba78f1..9165ca619c8cfdb282ac33db4b5d5e0246474d2c 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1043,6 +1043,18 @@ int mmu_interval_notifier_insert_locked( } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + /** * mmu_interval_notifier_remove - Remove a interval notifier * @interval_sub: Interval subscription to unregister @@ -1090,7 +1102,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) wait_event(subscriptions->wq, - READ_ONCE(subscriptions->invalidate_seq) != seq); + mmu_interval_seq_released(subscriptions, seq)); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec73cca1726c533b8cd1af0be572586bbff8ef0c..cf9c69d631f3d8e541b944a72aca06bc52e6896a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7974,7 +7974,7 @@ void __init mem_init_print_info(const char *str) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) diff --git a/mm/ptdump.c b/mm/ptdump.c index 93f2f63dc52dc3d31a02f683cd5d3ec5c375c528..43661863096b1db901d5fecceeccfc11968a27ab 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -39,8 +39,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val)); - if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -59,8 
+61,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val)); - if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -79,8 +83,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val)); - if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -98,8 +104,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 396a49462894a5a4771618c1f8a2ca732a5cf783..5b40a7473dc8a61073275e4c1d433b9b85565e2d 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include #include +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -53,8 +54,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +76,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in __pte_alloc(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in __pte_alloc(). 
*/ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); return 0; } +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +140,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -245,6 +263,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, entry); } +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. 
+ */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { @@ -258,6 +296,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } @@ -300,10 +339,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* @@ -383,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 99712d35e5351e72f5e46caac6ba141a9d16221d..f266a9453c8e5b563b59a9424836d22bc892a985 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -398,6 +398,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + if (!skb->len) + return -EINVAL; + if (!__skb) return 0; diff --git a/net/can/isotp.c b/net/can/isotp.c index 9a4a9c5a9f24c3fe24debcdc5d083db131354a42..c515bbd46c6792c8425fa4574d916b77cb9c2945 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -864,6 +864,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 
1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; + s64 hrtimer_sec = 0; int off; int err; @@ -962,7 +963,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) isotp_create_fframe(cf, so, ae); /* start timeout for FC */ - hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + hrtimer_sec = 1; + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), + HRTIMER_MODE_REL_SOFT); } /* send the first or only CAN frame */ @@ -975,6 +978,11 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %d\n", __func__, err); + + /* no transmission -> no timeout monitoring */ + if (hrtimer_sec) + hrtimer_cancel(&so->txtimer); + goto err_out_drop; } diff --git a/net/core/dev.c b/net/core/dev.c index 12089c484b304b25be98bebb8922c79c6098671d..8e4de36eede8effca1f9f5a2382a2ef7a6a436ca 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4094,6 +4094,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) bool again = false; skb_reset_mac_header(skb); + skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9aae82145bc16d957f1ca6b98e3bbe410de3b56c..20d738137841892cd64a0909b4c1fefc00a6a484 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -448,7 +448,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -458,8 +457,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, 
L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 20c7bef6829e1fc89188545c9f4d7f33c7937a0b..cb28f8928f9ee50bf3910ef0b1c69b8dbd8f89be 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,7 +483,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -492,8 +491,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 9a0263f2523237058ec316d4cf007a9917460681..1f6c752f13b40ead177f1cfa662d29fd0141e5e1 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -733,9 +733,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) - return -ENOMEM; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? 
htons(ETH_P_TEB) : proto; @@ -743,6 +740,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; __be16 flags; + int tun_hlen; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || @@ -760,9 +758,12 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, dsfield = key->tos; flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); - tunnel->tun_hlen = gre_calc_hlen(flags); + tun_hlen = gre_calc_hlen(flags); - gre_build_header(skb, tunnel->tun_hlen, + if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) + return -ENOMEM; + + gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) @@ -772,6 +773,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; + if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) + return -ENOMEM; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a44ad9637e8acbc0f76033a678fd19b8bf745d33..4ef59dc515e594e50cd466f9826f4b8e90248363 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3187,6 +3187,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -3197,13 +3198,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + 
atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -6358,7 +6359,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 864326f150e2fcf76f9022c8ad5bb2cb0ac08a7c..f2c3a61ad134b0b1af2a8e079e0a068c5af6c76e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -147,7 +147,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) dev = dev_get_by_index_rcu(net, ifindex); while (dev && !netif_is_l3_master(dev)) - dev = netdev_master_upper_dev_get(dev); + dev = netdev_master_upper_dev_get_rcu(dev); return dev ? dev->ifindex : 0; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f37916156ca523556c0266ad50acc2e9036ba0a1..cbfb601c4ee9802ae5133fcc90a20c11298f73c9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2276,6 +2276,13 @@ static int netlink_dump(struct sock *sk) * single netdev. The outcome is MSG_TRUNC error. 
*/ skb_reserve(skb, skb_tailroom(skb) - alloc_size); + + /* Make sure malicious BPF programs can not read unitialized memory + * from skb->head -> skb->data + */ + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 98a7e6f64ab0b07894db02030a11cb0535191747..293a798e89f42b551bc1edaa75f91ce85d88a126 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2436,7 +2436,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) { OVS_NLERR(log, "Flow action size exceeds max %u", MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d0c95d7dd292d89f0de3b36816ff927e0ffdacfb..5ee600d108a0a2f930f35e19448dff1e6aa64422 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2817,8 +2817,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) status = TP_STATUS_SEND_REQUEST; err = po->xmit(skb); - if (unlikely(err > 0)) { - err = net_xmit_errno(err); + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ @@ -3019,8 +3020,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->no_fcs = 1; err = po->xmit(skb); - if (err > 0 && (err = net_xmit_errno(err)) != 0) - goto out_unlock; + if (unlikely(err != 0)) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto out_unlock; + } dev_put(dev); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f15d6942da45306e4fa15399473044281dcbfed9..cc7e30733feb0d3f381269dfc4b9bcd5532b42ec 
100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -113,7 +113,9 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; + del_timer_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); + /* Remove the timer again as the worker may have restarted it. */ del_timer_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b61db335c49d15cb357e8f27eb2f2f6fdf17e014..da042bc8b239dd312bec797abe480d2d49148f67 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -814,10 +814,6 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); - /* bump reference count as long as we hold pointer to structure */ - if (ht) - ht->refcnt++; - #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update * so we must keep them in tact. 
When the node is later destroyed @@ -839,6 +835,10 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, return NULL; } + /* bump reference count as long as we hold pointer to structure */ + if (ht) + ht->refcnt++; + return new; } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4f16d406ad8ea9e1716ee26ca826c13c2cf876df..1b98f3241150b605cd07789f54245f8fd6f7f3bc 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2144,8 +2144,10 @@ static int smc_shutdown(struct socket *sock, int how) if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sock_put(sk); + } goto out; } switch (how) { diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 5a06050729a74d4288143b07c50492ef61cc8ab5..b1ab4b3d99fbe753169d95e7b2fe0ff29a22421c 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1900,6 +1900,10 @@ bool ima_appraise_signature(enum kernel_read_file_id id) if (id >= READING_MAX_ID) return false; + if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) + && security_locked_down(LOCKDOWN_KEXEC)) + return false; + func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 11d653190e6eac3a2cd5c1b171af1cb08b0363b7..b5168959fcf636e851b6a013d12c4a43c67d5cf7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8897,6 +8897,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x8562, "Clevo NH[5|7][0-9]RZ[Q]", ALC269_FIXUP_DMIC), SND_PCI_QUIRK(0x1558, 0x8668, "Clevo NP50B[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x866d, "Clevo NP5[05]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x867c, 
"Clevo NP7[01]PNP", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x867d, "Clevo NP7[01]PN[HJK]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8680, "Clevo NJ50LU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x8686, "Clevo NH50[CZ]U", ALC256_FIXUP_MIC_NO_PRESENCE_AND_RESUME), diff --git a/sound/soc/atmel/sam9g20_wm8731.c b/sound/soc/atmel/sam9g20_wm8731.c index 8a55d59a6c2aa97bc7680287479d0a1c9dd2859d..d243de5f23dc1c4fae8ad88566072ab28b88e803 100644 --- a/sound/soc/atmel/sam9g20_wm8731.c +++ b/sound/soc/atmel/sam9g20_wm8731.c @@ -46,35 +46,6 @@ */ #undef ENABLE_MIC_INPUT -static struct clk *mclk; - -static int at91sam9g20ek_set_bias_level(struct snd_soc_card *card, - struct snd_soc_dapm_context *dapm, - enum snd_soc_bias_level level) -{ - static int mclk_on; - int ret = 0; - - switch (level) { - case SND_SOC_BIAS_ON: - case SND_SOC_BIAS_PREPARE: - if (!mclk_on) - ret = clk_enable(mclk); - if (ret == 0) - mclk_on = 1; - break; - - case SND_SOC_BIAS_OFF: - case SND_SOC_BIAS_STANDBY: - if (mclk_on) - clk_disable(mclk); - mclk_on = 0; - break; - } - - return ret; -} - static const struct snd_soc_dapm_widget at91sam9g20ek_dapm_widgets[] = { SND_SOC_DAPM_MIC("Int Mic", NULL), SND_SOC_DAPM_SPK("Ext Spk", NULL), @@ -135,7 +106,6 @@ static struct snd_soc_card snd_soc_at91sam9g20ek = { .owner = THIS_MODULE, .dai_link = &at91sam9g20ek_dai, .num_links = 1, - .set_bias_level = at91sam9g20ek_set_bias_level, .dapm_widgets = at91sam9g20ek_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(at91sam9g20ek_dapm_widgets), @@ -148,7 +118,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct device_node *codec_np, *cpu_np; - struct clk *pllb; struct snd_soc_card *card = &snd_soc_at91sam9g20ek; int ret; @@ -162,31 +131,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return -EINVAL; } - /* - * Codec MCLK is supplied by PCK0 - set it up. 
- */ - mclk = clk_get(NULL, "pck0"); - if (IS_ERR(mclk)) { - dev_err(&pdev->dev, "Failed to get MCLK\n"); - ret = PTR_ERR(mclk); - goto err; - } - - pllb = clk_get(NULL, "pllb"); - if (IS_ERR(pllb)) { - dev_err(&pdev->dev, "Failed to get PLLB\n"); - ret = PTR_ERR(pllb); - goto err_mclk; - } - ret = clk_set_parent(mclk, pllb); - clk_put(pllb); - if (ret != 0) { - dev_err(&pdev->dev, "Failed to set MCLK parent\n"); - goto err_mclk; - } - - clk_set_rate(mclk, MCLK_RATE); - card->dev = &pdev->dev; /* Parse device node info */ @@ -230,9 +174,6 @@ static int at91sam9g20ek_audio_probe(struct platform_device *pdev) return ret; -err_mclk: - clk_put(mclk); - mclk = NULL; err: atmel_ssc_put_audio(0); return ret; @@ -242,8 +183,6 @@ static int at91sam9g20ek_audio_remove(struct platform_device *pdev) { struct snd_soc_card *card = platform_get_drvdata(pdev); - clk_disable(mclk); - mclk = NULL; snd_soc_unregister_card(card); atmel_ssc_put_audio(0); diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c index 9ad7fc0baf072678b40063b96fa8450738b06d70..20a07c92b2fc29d5749c21453f04d0020bc823ff 100644 --- a/sound/soc/codecs/msm8916-wcd-digital.c +++ b/sound/soc/codecs/msm8916-wcd-digital.c @@ -1206,9 +1206,16 @@ static int msm8916_wcd_digital_probe(struct platform_device *pdev) dev_set_drvdata(dev, priv); - return devm_snd_soc_register_component(dev, &msm8916_wcd_digital, + ret = devm_snd_soc_register_component(dev, &msm8916_wcd_digital, msm8916_wcd_digital_dai, ARRAY_SIZE(msm8916_wcd_digital_dai)); + if (ret) + goto err_mclk; + + return 0; + +err_mclk: + clk_disable_unprepare(priv->mclk); err_clk: clk_disable_unprepare(priv->ahbclk); return ret; diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 8540ac230d0eda3115736cc608789c4412d1ea7f..fd704df9b1758395afd5833ccff7abdf3adaedc1 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -1188,29 +1188,7 @@ static int wcd934x_set_sido_input_src(struct 
wcd934x_codec *wcd, int sido_src) if (sido_src == wcd->sido_input_src) return 0; - if (sido_src == SIDO_SOURCE_INTERNAL) { - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_PRE_ENX_MASK, 0x0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, - WCD934X_ANA_RCO_BG_EN_MASK, 0); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN1_MASK, - WCD934X_ANA_BUCK_PRE_EN1_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_PRE_EN2_MASK, - WCD934X_ANA_BUCK_PRE_EN2_ENABLE); - usleep_range(100, 110); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BUCK_CTL, - WCD934X_ANA_BUCK_HI_ACCU_EN_MASK, - WCD934X_ANA_BUCK_HI_ACCU_ENABLE); - usleep_range(100, 110); - } else if (sido_src == SIDO_SOURCE_RCO_BG) { + if (sido_src == SIDO_SOURCE_RCO_BG) { regmap_update_bits(wcd->regmap, WCD934X_ANA_RCO, WCD934X_ANA_RCO_BG_EN_MASK, WCD934X_ANA_RCO_BG_ENABLE); @@ -1296,8 +1274,6 @@ static int wcd934x_disable_ana_bias_and_syclk(struct wcd934x_codec *wcd) regmap_update_bits(wcd->regmap, WCD934X_CLK_SYS_MCLK_PRG, WCD934X_EXT_CLK_BUF_EN_MASK | WCD934X_MCLK_EN_MASK, 0x0); - wcd934x_set_sido_input_src(wcd, SIDO_SOURCE_INTERNAL); - regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, WCD934X_ANA_BIAS_EN_MASK, 0); regmap_update_bits(wcd->regmap, WCD934X_ANA_BIAS, diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 2924d89bf0daf109730e49e5169fc9ccfc8ed264..417732bdf286003dc929e7478e1008bdad0bcda8 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1683,8 +1683,7 @@ static void dapm_seq_run(struct snd_soc_card *card, switch (w->id) { case snd_soc_dapm_pre: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = 
w->event(w, @@ -1696,8 +1695,7 @@ static void dapm_seq_run(struct snd_soc_card *card, case snd_soc_dapm_post: if (!w->event) - list_for_each_entry_safe_continue(w, n, list, - power_list); + continue; if (event == SND_SOC_DAPM_STREAM_START) ret = w->event(w, diff --git a/sound/usb/midi.c b/sound/usb/midi.c index fa91290ad89db0d9121a691aedad917b85d0a012..84676a8fb60dcfc7798597470a6233e23e36611f 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1210,6 +1210,7 @@ static void snd_usbmidi_output_drain(struct snd_rawmidi_substream *substream) } while (drain_urbs && timeout); finish_wait(&ep->drain_wait, &wait); } + port->active = 0; spin_unlock_irq(&ep->buffer_lock); } diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index e54a98f4654902e97edf8876aec31a0cb1d64b68..d8e31ee03b9d056f0f771970d8d09983ff544122 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -8,7 +8,7 @@ */ /* handling of USB vendor/product ID pairs as 32-bit numbers */ -#define USB_ID(vendor, product) (((vendor) << 16) | (product)) +#define USB_ID(vendor, product) (((unsigned int)(vendor) << 16) | (product)) #define USB_ID_VENDOR(id) ((id) >> 16) #define USB_ID_PRODUCT(id) ((u16)(id)) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 17465d454a0e31d912f7d0782d3f0bc19b6bfcdd..f76b1a9d5a6e18bb315b4405068307ec5e3d91a9 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -571,7 +571,6 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, { struct perf_evsel *evsel; const struct perf_cpu_map *cpus = evlist->cpus; - const struct perf_thread_map *threads = evlist->threads; if (!ops || !ops->get || !ops->mmap) return -EINVAL; @@ -583,7 +582,7 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, perf_evlist__for_each_entry(evlist, evsel) { if ((evsel->attr.read_format & PERF_FORMAT_ID) && evsel->sample_id == NULL && - perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0) + perf_evsel__alloc_id(evsel, evsel->fd->max_x, 
evsel->fd->max_y) < 0) return -ENOMEM; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 91cab5cdfbc16ff5b7a660dfc2018c13e4dc522f..b55ee073c2f722c6b607e216d3c2f23b042fd4d9 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -340,6 +340,7 @@ static int report__setup_sample_type(struct report *rep) struct perf_session *session = rep->session; u64 sample_type = evlist__combined_sample_type(session->evlist); bool is_pipe = perf_data__is_pipe(session->data); + struct evsel *evsel; if (session->itrace_synth_opts->callchain || session->itrace_synth_opts->add_callchain || @@ -394,6 +395,19 @@ static int report__setup_sample_type(struct report *rep) } if (sort__mode == SORT_MODE__MEMORY) { + /* + * FIXUP: prior to kernel 5.18, Arm SPE missed to set + * PERF_SAMPLE_DATA_SRC bit in sample type. For backward + * compatibility, set the bit if it's an old perf data file. + */ + evlist__for_each_entry(session->evlist, evsel) { + if (strstr(evsel->name, "arm_spe") && + !(sample_type & PERF_SAMPLE_DATA_SRC)) { + evsel->core.attr.sample_type |= PERF_SAMPLE_DATA_SRC; + sample_type |= PERF_SAMPLE_DATA_SRC; + } + } + if (!is_pipe && !(sample_type & PERF_SAMPLE_DATA_SRC)) { ui__error("Selected --mem-mode but no mem data. " "Did you call perf record without -d?\n"); diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh index fedcb7b35af9f3f2f412ba6a1cadb56b3e35d71d..af5ea50ed5c0ecac41b22d844979f1df59ca106a 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh @@ -172,6 +172,17 @@ flooding_filters_add() local lsb local i + # Prevent unwanted packets from entering the bridge and interfering + # with the test. 
+ tc qdisc add dev br0 clsact + tc filter add dev br0 egress protocol all pref 1 handle 1 \ + matchall skip_hw action drop + tc qdisc add dev $h1 clsact + tc filter add dev $h1 egress protocol all pref 1 handle 1 \ + flower skip_hw dst_mac de:ad:be:ef:13:37 action pass + tc filter add dev $h1 egress protocol all pref 2 handle 2 \ + matchall skip_hw action drop + tc qdisc add dev $rp2 clsact for i in $(eval echo {1..$num_remotes}); do @@ -194,6 +205,12 @@ flooding_filters_del() done tc qdisc del dev $rp2 clsact + + tc filter del dev $h1 egress protocol all pref 2 handle 2 matchall + tc filter del dev $h1 egress protocol all pref 1 handle 1 flower + tc qdisc del dev $h1 clsact + tc filter del dev br0 egress protocol all pref 1 handle 1 matchall + tc qdisc del dev br0 clsact } flooding_check_packets() diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index b3a183c36cb53c7c7addd81f2b3fda533fed8fb8..905dc4efa87901b46e7f214a7e7d9326cf70e8ae 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap hugepage-shm +hugepage-vmemmap khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index b150cc837177a1227dc036f9c2fd1fa552a2d4cc..549761bc7193b106f3544b4068bd519fa0402e01 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -27,6 +27,7 @@ TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c new file mode 100644 index 0000000000000000000000000000000000000000..557bdbd4f87e8684530abee51a633e0470f50829 --- /dev/null +++ 
b/tools/testing/selftests/vm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include +#include +#include +#include +#include + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
+ */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. 
*/ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index d578ad83181370c0c6c95b378b1c5f12b073b812..949c71fe59519c57b234a77b1019f40fdae9c895 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -108,6 +108,17 @@ else echo "[PASS]" fi +echo "------------------------" +echo "running hugepage-vmemmap" +echo "------------------------" +./hugepage-vmemmap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing."