diff --git a/Documentation/ABI/testing/sysfs-class-intel_pmt b/Documentation/ABI/testing/sysfs-class-intel_pmt new file mode 100644 index 0000000000000000000000000000000000000000..ed4c886a21b1ee1640d21f11e5347fa06a5b95b6 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-intel_pmt @@ -0,0 +1,119 @@ +What: /sys/class/intel_pmt/ +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + The intel_pmt/ class directory contains information for + devices that expose hardware telemetry using Intel Platform + Monitoring Technology (PMT). +What: /sys/class/intel_pmt/telem +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + The telem directory contains files describing an instance of + a PMT telemetry device that exposes hardware telemetry. Each + telem directory has an associated telem file. This file + may be opened and mapped or read to access the telemetry space + of the device. The register layout of the telemetry space is + determined from an XML file that matches the PCI device id and + GUID for the device. +What: /sys/class/intel_pmt/telem/telem +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The telemetry data for this telemetry device. This file + may be mapped or read to obtain the data. +What: /sys/class/intel_pmt/telem/guid +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The GUID for this telemetry device. The GUID identifies + the version of the XML file for the parent device that is to + be used to get the register layout. +What: /sys/class/intel_pmt/telem/size +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The size of the telemetry region in bytes that corresponds to + the mapping size for the telem file. +What: /sys/class/intel_pmt/telem/offset +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The offset of the telemetry region in bytes that corresponds to + the mapping for the telem file. +What: /sys/class/intel_pmt/crashlog +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + The crashlog directory contains files for configuring an + instance of a PMT crashlog device that can perform crash data + recording. Each crashlog device has an associated crashlog + file. This file can be opened and mapped or read to access the + resulting crashlog buffer. The register layout for the buffer + can be determined from an XML file matching the GUID of the + parent device. +What: /sys/class/intel_pmt/crashlog/crashlog +Date: October 2020 +KernelVersion: 5.10 +Contact: David Box +Description: + (RO) The crashlog buffer for this crashlog device. This file + may be mapped or read to obtain the data. +What: /sys/class/intel_pmt/crashlog/guid +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The GUID for this crashlog device. The GUID identifies the + version of the XML file for the parent device that should be + used to determine the register layout. +What: /sys/class/intel_pmt/crashlog/size +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The length of the result buffer in bytes that corresponds + to the size of the crashlog buffer. +What: /sys/class/intel_pmt/crashlog/offset +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RO) The offset of the buffer in bytes that corresponds + to the mapping for the crashlog device. 
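As a rough userspace illustration of the telem interface described above (a sketch only; the instance directory name, taken here as "telem0", is an assumption and must be discovered by enumerating /sys/class/intel_pmt/, and error handling is minimal)::

    /* Hedged sketch: map a PMT telemetry region via the sysfs files above. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        /* Hypothetical instance name; find the real telem directory
         * by enumerating /sys/class/intel_pmt/. */
        const char *dir = "/sys/class/intel_pmt/telem0";
        char path[128];
        unsigned long size;
        FILE *f;
        int fd;
        void *telem;

        /* The "size" attribute gives the mapping size for the telem file. */
        snprintf(path, sizeof(path), "%s/size", dir);
        f = fopen(path, "r");
        if (!f || fscanf(f, "%lu", &size) != 1)
            return 1;
        fclose(f);

        snprintf(path, sizeof(path), "%s/telem", dir);
        fd = open(path, O_RDONLY);
        if (fd < 0)
            return 1;

        /* The telem file may be mapped or read; map it read-only here. */
        telem = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
        if (telem == MAP_FAILED)
            return 1;

        printf("first qword: 0x%lx\n", *(volatile unsigned long *)telem);

        munmap(telem, size);
        close(fd);
        return 0;
    }
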
+ +What: /sys/class/intel_pmt/crashlog/enable +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RW) Boolean value controlling if the crashlog functionality + is enabled for the crashlog device. + +What: /sys/class/intel_pmt/crashlog/trigger +Date: October 2020 +KernelVersion: 5.10 +Contact: Alexander Duyck +Description: + (RW) Boolean value controlling the triggering of the crashlog + device node. When read, it indicates whether the crashlog has + been triggered. When written to, it can be used to either clear + the current trigger by writing false, or to trigger a new + event if the trigger is not currently set. diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 56deaceab6254a07cd6fcc86080c85b3b8b73447..671bbd90f15be5338e2f6a2359755a44f257d879 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6197,6 +6197,29 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space can then handle to implement model specific MSR handling and/or user notifications to inform a user that an MSR was not handled. +7.25 KVM_CAP_SGX_ATTRIBUTE +-------------------------- + +:Architectures: x86 +:Target: VM +:Parameters: args[0] is a file handle of an SGX attribute file in securityfs +:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested + attribute is not supported by KVM. + +KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or +more privileged enclave attributes. args[0] must hold a file handle to a valid +SGX attribute file corresponding to an attribute that is supported/restricted +by KVM (currently only PROVISIONKEY). + +The SGX subsystem restricts access to a subset of enclave attributes to provide +additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY +is restricted to deter malware from using the PROVISIONKEY to obtain a stable +system fingerprint. To prevent userspace from circumventing such restrictions +by running an enclave in a VM, KVM prevents access to privileged attributes by +default. + +See Documentation/x86/sgx.rst for more details. + 8. Other capabilities. ====================== diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst index eaee1368b4fd8b838afba5d69d626fad1ec9e39d..a608f667fb9532bdf100e87745fd9737dcbbf1dd 100644 --- a/Documentation/x86/sgx.rst +++ b/Documentation/x86/sgx.rst @@ -209,3 +209,79 @@ An application may be loaded into a container enclave which is specially configured with a library OS and run-time which permits the application to run. The enclave run-time and library OS work together to execute the application when a thread enters the enclave. + +Impact of Potential Kernel SGX Bugs +=================================== + +EPC leaks +--------- + +When EPC page leaks happen, a WARNING like this is shown in dmesg: + +"EREMOVE returned ... and an EPC page was leaked. SGX may become unusable..." + +This is effectively a kernel use-after-free of an EPC page, and due +to the way SGX works, the bug is detected at freeing. Rather than +adding the page back to the pool of available EPC pages, the kernel +intentionally leaks the page to avoid additional errors in the future. + +When this happens, the kernel will likely soon leak more EPC pages, and +SGX will likely become unusable because the memory available to SGX is +limited. However, while this may be fatal to SGX, the rest of the kernel +is unlikely to be impacted and should continue to work. 
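A minimal userspace sketch of the KVM_CAP_SGX_ATTRIBUTE flow documented above (assumptions: an already-created KVM VM fd, a kernel carrying this patch so that KVM_CAP_SGX_ATTRIBUTE is defined in linux/kvm.h, and the /dev/sgx_provision node registered later in this patch as the attribute file)::

    #include <fcntl.h>
    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    /* Grant the VM access to the PROVISIONKEY enclave attribute. */
    static int grant_provision_key(int vm_fd)
    {
        struct kvm_enable_cap cap;
        int attr_fd = open("/dev/sgx_provision", O_RDONLY);

        if (attr_fd < 0)
            return -1;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_SGX_ATTRIBUTE;
        cap.args[0] = attr_fd;

        /* KVM only inspects the file during the ioctl (fget/fput in
         * sgx_set_attribute()), so the fd can be closed afterwards. */
        if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0) {
            close(attr_fd);
            return -1;
        }

        close(attr_fd);
        return 0;
    }
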
+ +As a result, when this happens, the user should stop running any new +SGX workloads (or just any new workloads), and migrate all valuable +workloads. Although a machine reboot can recover all EPC memory, the bug +should be reported to Linux developers. + + +Virtual EPC +=========== + +The implementation also includes a virtual EPC driver to support SGX enclaves +in guests. Unlike the SGX driver, an EPC page allocated by the virtual +EPC driver doesn't have a specific enclave associated with it. This is +because KVM doesn't track how a guest uses EPC pages. + +As a result, the SGX core page reclaimer doesn't support reclaiming EPC +pages allocated to KVM guests through the virtual EPC driver. If the +user wants to deploy SGX applications both on the host and in guests +on the same machine, the user should reserve enough EPC (by subtracting +the total virtual EPC size of all SGX VMs from the physical EPC size) for +host SGX applications so they can run with acceptable performance. + +The architectural behavior is to restore all EPC pages to an uninitialized +state after a guest reboot as well. Because this state can be reached only +through the privileged ``ENCLS[EREMOVE]`` instruction, ``/dev/sgx_vepc`` +provides the ``SGX_IOC_VEPC_REMOVE_ALL`` ioctl to execute the instruction +on all pages in the virtual EPC. + +``EREMOVE`` can fail for three reasons. Userspace must pay attention +to expected failures and handle them as follows: + +1. Page removal will always fail when any thread is running in the + enclave to which the page belongs. In this case the ioctl will + return ``EBUSY`` independent of whether it has successfully removed + some pages; userspace can avoid these failures by preventing execution + of any vcpu which maps the virtual EPC. + +2. Page removal will cause a general protection fault if two calls to + ``EREMOVE`` happen concurrently for pages that refer to the same + "SECS" metadata pages. This can happen if there are concurrent + invocations to ``SGX_IOC_VEPC_REMOVE_ALL``, or if a ``/dev/sgx_vepc`` + file descriptor in the guest is closed at the same time as + ``SGX_IOC_VEPC_REMOVE_ALL``; it will also be reported as ``EBUSY``. + This can be avoided in userspace by serializing calls to the ioctl() + and to close(), but in general it should not be a problem. + +3. Finally, page removal will fail for SECS metadata pages which still + have child pages. Child pages can be removed by executing + ``SGX_IOC_VEPC_REMOVE_ALL`` on all ``/dev/sgx_vepc`` file descriptors + mapped into the guest. This means that the ioctl() must be called + twice: an initial set of calls to remove child pages and a subsequent + set of calls to remove SECS pages. The second set of calls is only + required for those mappings that returned a nonzero value from the + first call. It indicates a bug in the kernel or the userspace client + if any of the second round of ``SGX_IOC_VEPC_REMOVE_ALL`` calls has + a return code other than 0. diff --git a/MAINTAINERS b/MAINTAINERS index 23a23bd94c0033d95460f96faf799dc6ff9209d4..466b1c599848abe2282e9780fb34059c86f89414 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9038,6 +9038,12 @@ F: drivers/mfd/intel_soc_pmic* F: include/linux/mfd/intel_msic.h F: include/linux/mfd/intel_soc_pmic* +INTEL PMT DRIVER +M: "David E. 
Box" +S: Maintained +F: drivers/mfd/intel_pmt.c +F: drivers/platform/x86/intel_pmt_* + INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT M: Stanislav Yakovlev L: linux-wireless@vger.kernel.org diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1e9b13636f17b58150e5bf1045a1b2d9a4589644..5fd7c396e5eb5d041dce809e6e9bad57e2acbf2b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -291,6 +291,8 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4ab8f866e39d779e47f598a3c3ba5272bcb9300d..cf3e3bb5a1873085e4009b062f878719ec24594b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -216,6 +216,7 @@ enum x86_intercept_stage; #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 @@ -225,6 +226,7 @@ enum x86_intercept_stage; #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) @@ -992,6 +994,9 @@ struct kvm_arch { bool bus_lock_detection_enabled; + /* Guest can access the SGX PROVISIONKEY. */ + bool sgx_provisioning_allowed; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index d96e5451d28a1059387a2ff416ae44a06dc98d13..a16e2c9154a31262582fdf8df955394de8fbb17c 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -27,16 +27,36 @@ /* The bitmask for the EPC section type. */ #define SGX_CPUID_EPC_MASK GENMASK(3, 0) +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, + EAUG = 0x0D, + EMODPR = 0x0E, + EMODT = 0x0F, +}; + /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. + * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { SGX_NOT_TRACKED = 11, + SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, SGX_UNMASKED_EVENT = 128, }; @@ -345,4 +365,14 @@ struct sgx_sigstruct { * comment! 
*/ +#ifdef CONFIG_X86_SGX_KVM +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr); +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr); +#endif + +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd); + #endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 305bc1214aef96e170dfd2613447240eae2ce854..10b1de500ab1c298cfe9d092f6cc5435df3f0571 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -11,6 +11,7 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 15 == 1: SGX MMU page-fault */ enum x86_pf_error_code { X86_PF_PROT = 1 << 0, @@ -19,6 +20,7 @@ enum x86_pf_error_code { X86_PF_RSVD = 1 << 3, X86_PF_INSTR = 1 << 4, X86_PF_PK = 1 << 5, + X86_PF_SGX = 1 << 15, }; #endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f8ba5289ecb01ed24b149eb5fa5c0a120de7c5ae..c6f028bac3ff869e2e3a65a28575d663a39eda82 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -371,6 +371,7 @@ enum vmcs_field { #define GUEST_INTR_STATE_MOV_SS 0x00000002 #define GUEST_INTR_STATE_SMI 0x00000004 #define GUEST_INTR_STATE_NMI 0x00000008 +#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010 /* GUEST_ACTIVITY_STATE flags */ #define GUEST_ACTIVITY_ACTIVE 0 diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 791e45334a4a210c0ec6d2810980af454e549f86..c815a6fec9aaf9bd91722a4cb544c2432b3888b6 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -27,6 +27,8 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init) #define SGX_IOC_ENCLAVE_PROVISION \ _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) +#define SGX_IOC_VEPC_REMOVE_ALL \ + _IO(SGX_MAGIC, 0x04) /** * struct sgx_enclave_create - parameter structure for the diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b8ff9e8ac0d516b183eaf2fc64bcd0ddbaf3b15c..df6707a76a3d06424ac6102aa6dfbfa879136614 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -27,6 +27,7 @@ #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 +#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index d502241995a39df7f82ab16a723146d0eecb30d8..28c75066815c0db1089b80a9e9a9133999697d2a 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -71,6 +71,9 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, {} }; diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 3b1b01f2b248a57cd0dee06fbd35077e42ed1a76..da696eb4821a0b159e14ea0246beea0bcaf22cfc 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ -static void clear_sgx_caps(void) -{ - 
setup_clear_cpu_cap(X86_FEATURE_SGX); - setup_clear_cpu_cap(X86_FEATURE_SGX_LC); -} - static int __init nosgx(char *str) { - clear_sgx_caps(); + setup_clear_cpu_cap(X86_FEATURE_SGX); return 0; } @@ -110,23 +104,30 @@ early_param("nosgx", nosgx); void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { + bool enable_sgx_kvm = false, enable_sgx_driver = false; bool tboot = tboot_enabled(); - bool enable_sgx; + bool enable_vmx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); return; } - /* - * Enable SGX if and only if the kernel supports SGX and Launch Control - * is supported, i.e. disable SGX if the LE hash MSRs can't be written. - */ - enable_sgx = cpu_has(c, X86_FEATURE_SGX) && - cpu_has(c, X86_FEATURE_SGX_LC) && - IS_ENABLED(CONFIG_X86_SGX); + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. + */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector * for the kernel, e.g. using VMX to hide malicious code. */ - if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + if (enable_vmx) { msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; if (tboot) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } - if (enable_sgx) - msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } wrmsrl(MSR_IA32_FEAT_CTL, msr); @@ -173,10 +177,29 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) } update_sgx: - if (!(msr & FEAT_CTL_SGX_ENABLED) || - !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { - if (enable_sgx) - pr_err_once("SGX disabled by BIOS\n"); - clear_sgx_caps(); + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. 
Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } } } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 866c9a9bcdee7b1eb4c525aea2fb74ab5ec82882..839b54a08e09e0c4bbb86d00a1159885d10b379c 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index 91d3dc784a2925ec39d8a2251822df36d41ec017..9c1656779b2a058ff6f15d30bbac00ca557fd148 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -3,3 +3,4 @@ obj-y += \ encl.o \ ioctl.o \ main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 8ce6d8371cfbf74eba9a4eba624220350d43f1c9..aa9b8b8688676fc66ebc6fdd605fe4467991af13 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct miscdevice sgx_dev_enclave = { .minor = MISC_DYNAMIC_MINOR, .name = "sgx_enclave", @@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = { .fops = &sgx_encl_fops, }; -static struct miscdevice sgx_dev_provision = { - .minor = MISC_DYNAMIC_MINOR, - .name = "sgx_provision", - .nodename = "sgx_provision", - .fops = &sgx_provision_fops, -}; - int __init sgx_drv_init(void) { unsigned int eax, ebx, ecx, edx; @@ -187,11 +176,5 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = misc_register(&sgx_dev_provision); - if (ret) { - misc_deregister(&sgx_dev_enclave); - return ret; - } - return 0; } diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 97fb7efce224332c81fb713933b8ddc3a5a067c9..87fed32a30a83950d2010574ca18bb299760d9d6 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); if (ret) { - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(ret); } @@ -404,18 +404,20 @@ void sgx_encl_release(struct kref *ref) if (sgx_unmark_page_reclaimable(entry->epc_page)) continue; - sgx_free_epc_page(entry->epc_page); + sgx_encl_free_epc_page(entry->epc_page); encl->secs_child_cnt--; entry->epc_page = NULL; } kfree(entry); + /* Invoke scheduler to prevent soft lockups. 
*/ + cond_resched(); } xa_destroy(&encl->page_array); if (!encl->secs_child_cnt && encl->secs.epc_page) { - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; } @@ -423,7 +425,7 @@ void sgx_encl_release(struct kref *ref) va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); list_del(&va_page->list); - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); kfree(va_page); } @@ -686,7 +688,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void) ret = __epa(sgx_get_epc_virt_addr(epc_page)); if (ret) { WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(-EFAULT); } @@ -735,3 +737,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page) return slot == SGX_VA_SLOT_COUNT; } + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to the free page list. Otherwise, it + * gives a WARNING to indicate the page is leaked. + */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d8d30ccbef4c95017bec5926f1fab6d04fb12185..fec43ca65065b0caecf5e05dd261c62cf045494e 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *result; - result = find_vma(mm, addr); - if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) return -EINVAL; *vma = result; @@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 443188fe7e7057bb0dba10bda49acfdbf42757e4..9b204843b78d3ec7ff3903548358d39c4665dee8 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,21 +11,6 @@ #include #include "sgx.h" -enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, -}; - /** * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr * @@ -55,6 +40,19 @@ enum sgx_encls_function { } while (0); \ } +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. 
+ */ +static inline bool encls_faulted(int ret) +{ + return ret & ENCLS_FAULT_FLAG; +} + /** * encls_failed() - Check if an ENCLS function failed * @ret: the return value of an ENCLS function call @@ -65,7 +63,7 @@ enum sgx_encls_function { */ static inline bool encls_failed(int ret) { - if (ret & ENCLS_FAULT_FLAG) + if (encls_faulted(ret)) return ENCLS_TRAPNR(ret) != X86_TRAP_PF; return !!ret; diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 2e10367ea66cf0ea2d615e6210f3373962925757..83df20e3e633353ca0b707cda6cae51cb6f09714 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. */ #include +#include #include #include #include @@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) encl->page_cnt--; if (va_page) { - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); list_del(&va_page->list); kfree(va_page); } @@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) return 0; err_out: - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; err_out_backing: @@ -365,7 +366,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, mmap_read_unlock(current->mm); err_out_free: - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); kfree(encl_page); return ret; @@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { u64 mrsigner[4]; - int i, j, k; + int i, j; void *addr; int ret; @@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, preempt_disable(); - for (k = 0; k < 4; k++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + sgx_update_lepubkeyhash(mrsigner); ret = __einit(sigstruct, token, addr); @@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, } } - if (ret & ENCLS_FAULT_FLAG) { + if (encls_faulted(ret)) { if (encls_failed(ret)) ENCLS_WARN(ret, "EINIT"); @@ -667,24 +667,11 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) { struct sgx_enclave_provision params; - struct file *file; if (copy_from_user(¶ms, arg, sizeof(params))) return -EFAULT; - file = fget(params.fd); - if (!file) - return -EINVAL; - - if (file->f_op != &sgx_provision_fops) { - fput(file); - return -EINVAL; - } - - encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; - - fput(file); - return 0; + return sgx_set_attribute(&encl->attributes_mask, params.fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 13a7599ce7d401cf40c5f8b65dcb748a0240b067..379eeefa744a496a5f1beab8e73be5027f324c22 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ +#include #include #include #include +#include #include #include #include #include #include +#include #include "driver.h" #include "encl.h" #include "encls.h" @@ -25,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); static LIST_HEAD(sgx_active_page_list); static DEFINE_SPINLOCK(sgx_reclaimer_lock); -/* The free page list lock protected variables prepend the lock. 
*/ -static unsigned long sgx_nr_free_pages; +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); /* Nodes with one or more EPC sections. */ static nodemask_t sgx_numa_mask; @@ -294,7 +296,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(encl->secs.epc_page, &secs_backing); - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; sgx_encl_put_backing(&secs_backing, true); @@ -400,14 +402,15 @@ static void sgx_reclaim_pages(void) spin_lock(&node->lock); list_add_tail(&epc_page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); + return atomic_long_read(&sgx_nr_free_pages) < watermark && + !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) @@ -468,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - sgx_nr_free_pages--; spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); return page; } @@ -609,26 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) * sgx_free_epc_page() - Free an EPC page * @page: an EPC page * - * Call EREMOVE for an EPC page and insert it back to the list of free pages. + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in an uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. */ void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; struct sgx_numa_node *node = section->node; - int ret; - - WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); - - ret = __eremove(sgx_get_epc_virt_addr(page)); - if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) - return; spin_lock(&node->lock); list_add_tail(&page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, @@ -657,7 +656,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, list_add_tail(&section->pages[i].list, &sgx_dirty_page_list); } - sgx_nr_free_pages += nr_pages; return true; } @@ -731,6 +729,67 @@ static bool __init sgx_page_cache_init(void) return true; } +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller. + * The bare-metal driver requires them to be updated to the hash of the + * enclave's signer before EINIT. KVM needs to update them to the guest's + * virtual MSR values before doing EINIT on behalf of the guest. 
+ */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + static int __init sgx_init(void) { int ret; @@ -747,12 +806,28 @@ static int __init sgx_init(void) goto err_page_cache; } - ret = sgx_drv_init(); + ret = misc_register(&sgx_dev_provision); if (ret) goto err_kthread; + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ + ret = sgx_drv_init(); + + if (sgx_vepc_init() && ret) + goto err_provision; + return 0; +err_provision: + misc_deregister(&sgx_dev_provision); + err_kthread: kthread_stop(ksgxd_tsk); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 2a2b5c857451debb4d24ec1d38178359416e6732..4628acec000915e2d2b41895088c3edbe5033c14 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -13,6 +13,10 @@ #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." + #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 #define SGX_NR_TO_SCAN 16 @@ -80,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 0000000000000000000000000000000000000000..b4f9a50de776e29cf058afb1575da4fceec1ad0b --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having children in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) +{ + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests cannot be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + return __eremove(sgx_get_epc_virt_addr(epc_page)); +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret = sgx_vepc_remove_page(epc_page); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which happens when + * EREMOVE'ing an SECS page that still has children; it can + * be handled by EREMOVE'ing the SECS again after all pages in + * the virtual EPC have been EREMOVE'd. See comments below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee that no + * logical processor is still running in the enclave in the + * guest, otherwise EREMOVE will get SGX_ENCLAVE_ACT, which + * cannot be handled here. 
+ */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + return 0; +} + +static long sgx_vepc_remove_all(struct sgx_vepc *vepc) +{ + struct sgx_epc_page *entry; + unsigned long index; + long failures = 0; + + xa_for_each(&vepc->page_array, index, entry) { + int ret = sgx_vepc_remove_page(entry); + if (ret) { + if (ret == SGX_CHILD_PRESENT) { + /* The page is a SECS, userspace will retry. */ + failures++; + } else { + /* + * Report errors due to #GP or SGX_ENCLAVE_ACT; do not + * WARN, as userspace can induce said failures by + * calling the ioctl concurrently on multiple vEPCs or + * while one or more CPUs are running the enclave. Only + * a #PF on EREMOVE indicates a kernel/hardware issue. + */ + WARN_ON_ONCE(encls_faulted(ret) && + ENCLS_TRAPNR(ret) != X86_TRAP_GP); + return -EBUSY; + } + } + cond_resched(); + } + + /* + * Return the number of SECS pages that failed to be removed, so + * userspace knows that it has to retry. + */ + return failures; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies; + * if the page is successfully EREMOVE'd, it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. 
+ */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + xa_destroy(&vepc->page_array); + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static long sgx_vepc_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct sgx_vepc *vepc = file->private_data; + + switch (cmd) { + case SGX_IOC_VEPC_REMOVE_ALL: + if (arg) + return -EINVAL; + return sgx_vepc_remove_all(vepc); + + default: + return -ENOTTY; + } +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .unlocked_ioctl = sgx_vepc_ioctl, + .compat_ioctl = sgx_vepc_ioctl, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of the guest after KVM traps ECREATE for the purpose + * of enforcing policies on the guest's enclaves, and return the trap number + * which should be injected into the guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already a valid userspace pointer + * from the caller (KVM), who should already have handled the invalid + * pointer case (for instance, one made by a malicious guest). All other + * checks, such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code; it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from the caller (KVM) are valid. + * All other checks are deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). 
+ */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of the guest after KVM traps EINIT. If SGX_LC is available + * in the host, the SGX driver may rewrite the hardware values at will, so KVM + * needs to update the hardware values to the guest's virtual MSR values in order + * to ensure EINIT is executed with the expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f92dfd8ef10dbca080c0cf21a3bc45792cab4bea..71fb38d83e4ad0bcd74ea5082699b92c30bb4e53 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -84,6 +84,18 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config X86_SGX_KVM + bool "Software Guard eXtensions (SGX) Virtualization" + depends on X86_SGX && KVM_INTEL + help + Enables KVM guests to create SGX enclaves. + + This includes support to expose "raw" unreclaimable enclave memory to + guests via a device node, e.g. /dev/sgx_vepc. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b804444e16d47d017db4d24c9a52872701199128..1c6c9adcb730f64d0914162f45bc545cbe8e2cfc 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,8 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o +kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o + kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 41b0dc37720e0f0b572827c6e50d343aa6b43552..911c6efe60aa97683c586fe28c0afc62916c8110 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -28,7 +29,7 @@ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be * aligned to sizeof(unsigned long) because it's not accessed via bitops. 
*/ -u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); static u32 xstate_required_size(u64 xstate_bv, bool compacted) @@ -53,6 +54,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) } #define F feature_bit +#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) @@ -169,6 +171,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + /* + * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate + * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's + * requested XCR0 value. The enclave's XFRM must be a subset of XCR0 + * at the time of EENTER, thus adjust the allowed XFRM by the guest's + * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to + * '1' even on CPUs that don't support XSAVE. + */ + best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); + if (best) { + best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; + best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } + kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); @@ -330,13 +347,13 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return r; } -static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ +static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) { const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; reverse_cpuid_check(leaf); - kvm_cpu_caps[leaf] &= mask; cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); @@ -344,6 +361,27 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); } +static __always_inline +void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + BUILD_BUG_ON(leaf < NCAPINTS); + + kvm_cpu_caps[leaf] = mask; + + __kvm_cpu_cap_mask(leaf); +} + +static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + BUILD_BUG_ON(leaf >= NCAPINTS); + + kvm_cpu_caps[leaf] &= mask; + + __kvm_cpu_cap_mask(leaf); +} + void kvm_set_cpu_caps(void) { unsigned int f_nx = is_efer_nx() ? 
F(NX) : 0; @@ -354,12 +392,13 @@ void kvm_set_cpu_caps(void) unsigned int f_gbpages = 0; unsigned int f_lm = 0; #endif + memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); - BUILD_BUG_ON(sizeof(kvm_cpu_caps) > + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, - sizeof(kvm_cpu_caps)); + sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps))); kvm_cpu_cap_mask(CPUID_1_ECX, /* @@ -390,7 +429,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | @@ -401,7 +440,8 @@ void kvm_set_cpu_caps(void) F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | + F(SGX_LC) ); /* Set LA57 based on hardware capability. */ if (cpuid_ecx(7) & F(LA57)) @@ -440,6 +480,10 @@ void kvm_set_cpu_caps(void) F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) ); + kvm_cpu_cap_init_scattered(CPUID_12_EAX, + SF(SGX1) | SF(SGX2) + ); + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | @@ -763,6 +807,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; } break; + case 0x12: + /* Intel SGX */ + if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + /* + * Index 0: Sub-features, MISCSELECT (a.k.a extended features) + * and max enclave sizes. The SGX sub-features and MISCSELECT + * are restricted by kernel and KVM capabilities (like most + * feature flags), while enclave size is unrestricted. + */ + cpuid_entry_override(entry, CPUID_12_EAX); + entry->ebx &= SGX_MISC_EXINFO; + + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + /* + * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la + * feature flags. Advertise all supported flags, including + * privileged attributes that require explicit opt-in from + * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is + * expected to derive it from supported XCR0. + */ + entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | + SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY | + SGX_ATTR_KSS; + entry->ebx &= 0; + break; /* Intel PT */ case 0x14: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dc921d76e42e8b353989ea0a96962640085e95b8..9fbc0de7c337b8fb4c3e2b3ce27aa659eaf95438 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -7,7 +7,25 @@ #include #include -extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +/* + * Hardware-defined CPUID leafs that are scattered in the kernel, but need to + * be directly used by KVM. Note, these word values conflict with the kernel's + * "bug" caps, but KVM doesn't use those. 
+ */ +enum kvm_only_cpuid_leafs { + CPUID_12_EAX = NCAPINTS, + NR_KVM_CPU_CAPS, + + NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, +}; + +#define KVM_X86_FEATURE(w, f) ((w)*32 + (f)) + +/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */ +#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) +#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) + +extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); @@ -63,6 +81,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, + [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, }; /* @@ -83,6 +102,25 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); } +/* + * Translate feature bits that are scattered in the kernel's cpufeatures word + * into KVM feature words that align with hardware's definitions. + */ +static __always_inline u32 __feature_translate(int x86_feature) +{ + if (x86_feature == X86_FEATURE_SGX1) + return KVM_X86_FEATURE_SGX1; + else if (x86_feature == X86_FEATURE_SGX2) + return KVM_X86_FEATURE_SGX2; + + return x86_feature; +} + +static __always_inline u32 __feature_leaf(int x86_feature) +{ + return __feature_translate(x86_feature) / 32; +} + /* * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain * the hardware defined bit number (stored in bits 4:0) and a software defined @@ -91,6 +129,8 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) */ static __always_inline u32 __feature_bit(int x86_feature) { + x86_feature = __feature_translate(x86_feature); + reverse_cpuid_check(x86_feature / 32); return 1 << (x86_feature & 31); } @@ -99,7 +139,7 @@ static __always_inline u32 __feature_bit(int x86_feature) static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; @@ -178,7 +218,7 @@ static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, } static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, - enum cpuid_leafs leaf) + unsigned int leaf) { u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); @@ -291,7 +331,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); @@ -299,7 +339,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); @@ -307,7 +347,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); diff --git a/arch/x86/kvm/vmx/nested.c 
b/arch/x86/kvm/vmx/nested.c index 0c2389d0fdafe23c9e90cf4e05e893e100739f8e..d2a0464e95c7a8351b70809b4148277ec3d11c47 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -11,6 +11,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "x86.h" @@ -2326,6 +2327,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); + secondary_exec_controls_set(vmx, exec_control); } @@ -4142,6 +4146,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; + if (to_vmx(vcpu)->exit_reason.enclave_mode) + vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; @@ -5736,6 +5742,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 encls_leaf; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) + return false; + + encls_leaf = kvm_rax_read(vcpu); + if (encls_leaf > 62) + encls_leaf = 63; + return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); +} + static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, gpa_t bitmap) { @@ -5833,9 +5854,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return true; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return true; default: break; } @@ -5959,6 +5977,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); + case EXIT_REASON_ENCLS: + return nested_vmx_exit_handled_encls(vcpu, vmcs12); default: return true; } @@ -6534,6 +6554,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_high |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (enable_sgx) + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 197148d76b8fdb8ab8d0dc370e56525f28426608..184418baeb3cbaaf8bd70c2d2e3ea51f87fafdc2 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING); +} + /* * if fixed0[i] == 1: val[i] must be 1 * if fixed1[i] == 0: val[i] must be 0 diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c new file mode 100644 index 0000000000000000000000000000000000000000..6693ebdc07701449bb297de0ef68873f1f53d3b2 --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.c @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. 
*/ + +#include + +#include "cpuid.h" +#include "kvm_cache_regs.h" +#include "nested.h" +#include "sgx.h" +#include "vmx.h" +#include "x86.h" + +bool __read_mostly enable_sgx = 1; +module_param_named(sgx, enable_sgx, bool, 0444); + +/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ +static u64 sgx_pubkey_hash[4] __ro_after_init; + +/* + * ENCLS's memory operands use a fixed segment (DS) and a fixed + * address size based on the mode. Related prefixes are ignored. + */ +static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, + int size, int alignment, gva_t *gva) +{ + struct kvm_segment s; + bool fault; + + /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ + *gva = offset; + if (!is_long_mode(vcpu)) { + vmx_get_segment(vcpu, &s, VCPU_SREG_DS); + *gva += s.base; + } + + if (!IS_ALIGNED(*gva, alignment)) { + fault = true; + } else if (likely(is_long_mode(vcpu))) { + fault = is_noncanonical_address(*gva, vcpu); + } else { + *gva &= 0xffffffff; + fault = (s.unusable) || + (s.type != 2 && s.type != 3) || + (*gva > s.limit) || + ((s.base != 0 || s.limit != 0xffffffff) && + (((u64)*gva + size - 1) > s.limit + 1)); + } + if (fault) + kvm_inject_gp(vcpu, 0); + return fault ? -EINVAL : 0; +} + +static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, + unsigned int size) +{ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = addr; + vcpu->run->internal.data[1] = size; +} + +static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, + unsigned int size) +{ + if (__copy_from_user(data, (void __user *)hva, size)) { + sgx_handle_emulation_failure(vcpu, hva, size); + return -EFAULT; + } + + return 0; +} + +static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, + gpa_t *gpa) +{ + struct x86_exception ex; + + if (write) + *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex); + else + *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); + + if (*gpa == UNMAPPED_GVA) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return -EFAULT; + } + + return 0; +} + +static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva) +{ + *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa)); + if (kvm_is_error_hva(*hva)) { + sgx_handle_emulation_failure(vcpu, gpa, 1); + return -EFAULT; + } + + *hva |= gpa & ~PAGE_MASK; + + return 0; +} + +static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) +{ + struct x86_exception ex; + + /* + * A non-EPCM #PF indicates a bad userspace HVA. This *should* check + * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC, + * but the error code isn't (yet) plumbed through the ENCLS helpers. + */ + if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + /* + * If the guest thinks it's running on SGX2 hardware, inject an SGX + * #PF if the fault matches an EPCM fault signature (#GP on SGX1, + * #PF on SGX2). The assumption is that EPCM faults are much more + * likely than a bad userspace address. 
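+ * + * In short: a #PF on SGX2-capable hardware, or any fault on SGX1-only + * hardware, is treated as an EPCM fault and is reflected into an SGX2-aware + * guest as an SGX #PF with PFERR_SGX_MASK set; everything else becomes + * #GP(0).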
+ */ + if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && + guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + memset(&ex, 0, sizeof(ex)); + ex.vector = PF_VECTOR; + ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | + PFERR_SGX_MASK; + ex.address = gva; + ex.error_code_valid = true; + ex.nested_page_fault = false; + kvm_inject_page_fault(vcpu, &ex); + } else { + kvm_inject_gp(vcpu, 0); + } + return 1; +} + +static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, + struct sgx_pageinfo *pageinfo, + unsigned long secs_hva, + gva_t secs_gva) +{ + struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents; + struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1; + u64 attributes, xfrm, size; + u32 miscselect; + u8 max_size_log2; + int trapnr, ret; + + sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!sgx_12_0 || !sgx_12_1) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + miscselect = contents->miscselect; + attributes = contents->attributes; + xfrm = contents->xfrm; + size = contents->size; + + /* Enforce restriction of access to the PROVISIONKEY. */ + if (!vcpu->kvm->arch.sgx_provisioning_allowed && + (attributes & SGX_ATTR_PROVISIONKEY)) { + if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) + pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */ + if ((u32)miscselect & ~sgx_12_0->ebx || + (u32)attributes & ~sgx_12_1->eax || + (u32)(attributes >> 32) & ~sgx_12_1->ebx || + (u32)xfrm & ~sgx_12_1->ecx || + (u32)(xfrm >> 32) & ~sgx_12_1->edx) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restriction on max enclave size. */ + max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : + sgx_12_0->edx; + if (size >= BIT_ULL(max_size_log2)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* + * sgx_virt_ecreate() returns: + * 1) 0: ECREATE was successful + * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the + * exception number. + * 3) -EINVAL: access_ok() on @secs_hva failed. This should never + * happen as KVM checks host addresses at memslot creation. + * sgx_virt_ecreate() has already warned in this case. + */ + ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr); + if (!ret) + return kvm_skip_emulated_instruction(vcpu); + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + return ret; +} + +static int handle_encls_ecreate(struct kvm_vcpu *vcpu) +{ + gva_t pageinfo_gva, secs_gva; + gva_t metadata_gva, contents_gva; + gpa_t metadata_gpa, contents_gpa, secs_gpa; + unsigned long metadata_hva, contents_hva, secs_hva; + struct sgx_pageinfo pageinfo; + struct sgx_secs *contents; + struct x86_exception ex; + int r; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva)) + return 1; + + /* + * Copy the PAGEINFO to local memory, its pointers need to be + * translated, i.e. we need to do a deep copy/translate.
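+ * + * Concretely, pageinfo.metadata (SECINFO) and pageinfo.contents (SOURCE) + * are themselves guest virtual addresses; each is translated + * GVA->GPA->HVA below before ECREATE is executed against host addresses.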
+ */ + r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo, + sizeof(pageinfo), &ex); + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return 1; + } else if (r != X86EMUL_CONTINUE) { + sgx_handle_emulation_failure(vcpu, pageinfo_gva, + sizeof(pageinfo)); + return 0; + } + + if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) || + sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096, + &contents_gva)) + return 1; + + /* + * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) || + sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. + */ + if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) || + sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva)) + return 0; + + /* + * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the + * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and + * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to + * enforce restriction of access to the PROVISIONKEY. + */ + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!contents) + return -ENOMEM; + + /* Exit to userspace if copying from a host userspace address fails. */ + if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) { + free_page((unsigned long)contents); + return 0; + } + + pageinfo.metadata = metadata_hva; + pageinfo.contents = (u64)contents; + + r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva); + + free_page((unsigned long)contents); + + return r; +} + +static int handle_encls_einit(struct kvm_vcpu *vcpu) +{ + unsigned long sig_hva, secs_hva, token_hva, rflags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t sig_gva, secs_gva, token_gva; + gpa_t sig_gpa, secs_gpa, token_gpa; + int ret, trapnr; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) || + sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva)) + return 1; + + /* + * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) || + sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. Note, all structures are aligned and + * cannot split pages. + */ + if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) || + sgx_gpa_to_hva(vcpu, token_gpa, &token_hva)) + return 0; + + ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva, + (void __user *)secs_hva, + vmx->msr_ia32_sgxlepubkeyhash, &trapnr); + + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + /* + * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva, + * @token_hva or @secs_hva. This should never happen as KVM checks host + * addresses at memslot creation. 
sgx_virt_einit() has already warned + * in this case, so just return. + */ + if (ret < 0) + return ret; + + rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | + X86_EFLAGS_AF | X86_EFLAGS_SF | + X86_EFLAGS_OF); + if (ret) + rflags |= X86_EFLAGS_ZF; + else + rflags &= ~X86_EFLAGS_ZF; + vmx_set_rflags(vcpu, rflags); + + kvm_rax_write(vcpu, ret); + return kvm_skip_emulated_instruction(vcpu); +} + +static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) +{ + if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + return false; + + if (leaf >= ECREATE && leaf <= ETRACK) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX1); + + if (leaf >= EAUG && leaf <= EMODT) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + + return false; +} + +static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu) +{ + const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED; + + return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits; +} + +int handle_encls(struct kvm_vcpu *vcpu) +{ + u32 leaf = (u32)kvm_rax_read(vcpu); + + if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { + kvm_queue_exception(vcpu, UD_VECTOR); + } else if (!sgx_enabled_in_guest_bios(vcpu)) { + kvm_inject_gp(vcpu, 0); + } else { + if (leaf == ECREATE) + return handle_encls_ecreate(vcpu); + if (leaf == EINIT) + return handle_encls_einit(vcpu); + WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; + return 0; + } + return 1; +} + +void setup_default_sgx_lepubkeyhash(void) +{ + /* + * Use Intel's default value for Skylake hardware if Launch Control is + * not supported, i.e. Intel's hash is hardcoded into silicon, or if + * Launch Control is supported and enabled, i.e. mimic the reset value + * and let the guest write the MSRs at will. If Launch Control is + * supported but disabled, then use the current MSR values as the hash + * MSRs exist but are read-only (locked and not writable). + */ + if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || + rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; + sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; + sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; + sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; + } else { + /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + } +} + +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, + sizeof(sgx_pubkey_hash)); +} + +/* + * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM + * restrictions if the guest's allowed-1 settings diverge from hardware. 
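+ * + * E.g. if the VMM hides a MISCSELECT or ATTRIBUTES bit that the hardware + * supports, an unintercepted ECREATE could still set it in the real SECS; + * trapping ECREATE lets __handle_encls_ecreate() apply the guest's CPUID + * limits first.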
+ */ +static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *guest_cpuid; + u32 eax, ebx, ecx, edx; + + if (!vcpu->kvm->arch.sgx_provisioning_allowed) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx || + guest_cpuid->ecx != ecx || guest_cpuid->edx != edx) + return true; + + return false; +} + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + /* + * There is no software enable bit for SGX that is virtualized by + * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the + * guest (either by the host or by the guest's BIOS) but enabled in the + * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate + * the expected system behavior for ENCLS. + */ + u64 bitmap = -1ull; + + /* Nothing to do if hardware doesn't support SGX */ + if (!cpu_has_vmx_encls_vmexit()) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + sgx_enabled_in_guest_bios(vcpu)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); + if (sgx_intercept_encls_ecreate(vcpu)) + bitmap |= (1 << ECREATE); + } + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + bitmap &= ~GENMASK_ULL(EMODT, EAUG); + + /* + * Trap and execute EINIT if launch control is enabled in the + * host using the guest's values for launch control MSRs, even + * if the guest's values are fixed to hardware default values. + * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing + * the MSRs is extraordinarily expensive. 
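+ * + * E.g. for a guest exposed only SGX1, with ECREATE intercepted and Launch + * Control present on the host, the net effect is roughly: + * + * bitmap = ~0ULL & ~GENMASK_ULL(ETRACK, ECREATE); + * bitmap |= (1 << ECREATE) | (1 << EINIT); + * + * i.e. only the remaining SGX1 leafs run without a VM-Exit.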
+ */ + if (boot_cpu_has(X86_FEATURE_SGX_LC)) + bitmap |= (1 << EINIT); + + if (!vmcs12 && is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + if (vmcs12 && nested_cpu_has_encls_exit(vmcs12)) + bitmap |= vmcs12->encls_exiting_bitmap; + } + vmcs_write64(ENCLS_EXITING_BITMAP, bitmap); +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h new file mode 100644 index 0000000000000000000000000000000000000000..a400888b376d31527d8d2ffb0747e8eada015835 --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_SGX_H +#define __KVM_X86_SGX_H + +#include + +#include "capabilities.h" +#include "vmx_ops.h" + +#ifdef CONFIG_X86_SGX_KVM +extern bool __read_mostly enable_sgx; + +int handle_encls(struct kvm_vcpu *vcpu); + +void setup_default_sgx_lepubkeyhash(void); +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); +#else +#define enable_sgx 0 + +static inline void setup_default_sgx_lepubkeyhash(void) { } +static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } + +static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + /* Nothing to do if hardware doesn't support SGX */ + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); +} +#endif + +#endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index c8e51c004f78232a0e8ae66eb5d108e359112e8a..034adb6404dcaf0062998b43d2d7e88c4521e16a 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VMREAD_BITMAP, vmread_bitmap), FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), + FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 80232daf00ff1368df3e4f01a2732b49e678e86a..13494956d0e97f7cb6435de3e58c590b343c7093 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -69,7 +69,8 @@ struct __packed vmcs12 { u64 vm_function_control; u64 eptp_list_address; u64 pml_address; - u64 padding64[3]; /* room for future expansion */ + u64 encls_exiting_bitmap; + u64 padding64[2]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(vm_function_control, 296); CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(encls_exiting_bitmap, 320); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e208e54f5cad9156103e2e71a92b8d08547c5039..b737f696efd08112897cdbf3361fca124641f650 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -58,6 +58,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "vmcs.h" #include "vmcs12.h" @@ -1573,12 +1574,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { + /* + * Emulation of instructions 
in SGX enclaves is impossible as RIP does + * not point at the failing instruction, and even if it did, the code + * stream is inaccessible. Inject #UD instead of exiting to userspace + * so that guest userspace can't DoS the guest simply by triggering + * emulation (enclaves are CPL3 only). + */ + if (to_vmx(vcpu)->exit_reason.enclave_mode) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } return true; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { + union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; unsigned long rip, orig_rip; + u32 instr_len; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on @@ -1589,9 +1603,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + /* + * Emulating an enclave's instructions isn't supported as KVM + * cannot access the enclave's memory or its true RIP, e.g. the + * vmcs.GUEST_RIP points at the exit point of the enclave, not + * the RIP that actually triggered the VM-Exit. But, because + * most instructions that cause VM-Exit will #UD in an enclave, + * most instruction-based VM-Exits simply do not occur. + * + * There are a few exceptions, notably the debug instructions + * INT1ICEBRK and INT3, as they are allowed in debug enclaves + * and generate #DB/#BP as expected, which KVM might intercept. + * But again, the CPU does the dirty work and saves an instr + * length of zero so VMMs don't shoot themselves in the foot. + * WARN if KVM tries to skip a non-zero length instruction on + * a VM-Exit from an enclave. + */ + if (!instr_len) + goto rip_updated; + + WARN(exit_reason.enclave_mode, + "KVM: skipping instruction after SGX enclave VM-Exit"); + orig_rip = kvm_rip_read(vcpu); - rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit @@ -1607,6 +1645,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) return 0; } +rip_updated: /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); @@ -1868,6 +1907,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_FEAT_CTL: msr_info->data = vmx->msr_ia32_feature_control; break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + return 1; + msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash + [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; + break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2162,6 +2208,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); + + /* SGX may be enabled/disabled by guest's firmware */ + vmx_write_encls_bitmap(vcpu, NULL); + break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + /* + * On real hardware, the LE hash MSRs are writable before + * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), + * at which point SGX related bits in IA32_FEATURE_CONTROL + * become writable.
+ * + * KVM does not emulate SGX activation for simplicity, so + * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL + * is unlocked. This is technically not architectural + * behavior, but it's close enough. + */ + if (!msr_info->host_initiated && + (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && + !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) + return 1; + vmx->msr_ia32_sgxlepubkeyhash + [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!msr_info->host_initiated) @@ -4427,8 +4496,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } - if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmx_write_encls_bitmap(&vmx->vcpu, NULL); if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); @@ -5445,6 +5513,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; + if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + return 1; + /* * A nested guest cannot optimize MMIO vmexits, because we have an * nGPA here instead of the required GPA. @@ -5696,16 +5767,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) return 1; } +#ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { /* - * SGX virtualization is not yet supported. There is no software - * enable bit for SGX, so we have to trap ENCLS and inject a #UD - * to prevent the guest from executing ENCLS. + * SGX virtualization is disabled. There is no software enable bit for + * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent + * the guest from executing ENCLS (when SGX is supported by hardware). */ kvm_queue_exception(vcpu, UD_VECTOR); return 1; } +#endif /* CONFIG_X86_SGX_KVM */ /* * The exit handlers return 1 if the exit was handled fully and guest execution @@ -7031,6 +7104,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); + vcpu_setup_sgx_lepubkeyhash(vcpu); + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -7357,6 +7432,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) set_cr4_guest_host_mask(vmx); + vmx_write_encls_bitmap(vcpu, NULL); + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + vmx->msr_ia32_feature_control_valid_bits |= + FEAT_CTL_SGX_LC_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= + ~FEAT_CTL_SGX_LC_ENABLED; + /* Refresh #PF interception to account for MAXPHYADDR changes. 
*/ update_exception_bitmap(vcpu); } @@ -7377,6 +7465,13 @@ static __init void vmx_set_cpu_caps(void) if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (!enable_sgx) { + kvm_cpu_cap_clear(X86_FEATURE_SGX); + kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); + kvm_cpu_cap_clear(X86_FEATURE_SGX1); + kvm_cpu_cap_clear(X86_FEATURE_SGX2); + } + if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); @@ -7973,6 +8068,8 @@ static __init int hardware_setup(void) if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + setup_default_sgx_lepubkeyhash(); + if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c0b52498e4bb465c77c9f3a84b8cb32e778e158a..04f1f854417774e100d6a3dac9a9d8101b8e377b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -324,6 +324,9 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + /* SGX Launch Control public key hash */ + u64 msr_ia32_sgxlepubkeyhash[4]; + u64 ept_pointer; struct pt_desc pt_desc; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7da272aea5c4b203a5d65d353a909ef15702b2b0..e3896ba64eca41d111257de0200c48898e70e905 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -3839,6 +3840,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_USER_SPACE_MSR: case KVM_CAP_X86_MSR_FILTER: case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: +#endif r = 1; break; case KVM_CAP_SYNC_REGS: @@ -5391,6 +5395,23 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: { + unsigned long allowed_attributes = 0; + + r = sgx_set_attribute(&allowed_attributes, cap->args[0]); + if (r) + break; + + /* KVM only supports the PROVISIONKEY privileged attribute. */ + if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && + !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) + kvm->arch.sgx_provisioning_allowed = true; + else + r = -EINVAL; + break; + } +#endif default: r = -EINVAL; break; @@ -6002,6 +6023,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) @@ -6018,6 +6040,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 34112c63b34798d99e2ee6e672353d98e9a64614..058ff3f6944c840778d54f0377e873836b23b14b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1123,6 +1123,18 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) if (error_code & X86_PF_PK) return 1; + /* + * SGX hardware blocked the access. This usually happens + * when the enclave memory contents have been destroyed, like + * after a suspend/resume cycle. 
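(Hardware reports this case with the X86_PF_SGX bit, bit 15 of the page fault error code.)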
In any case, the kernel can't + * fix the cause of the fault. Handle the fault as an access + * error even in cases where no actual access violation + * occurred. This allows userspace to rebuild the enclave in + * response to the signal. + */ + if (unlikely(error_code & X86_PF_SGX)) + return 1; + /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 50a86fd89d00524023c7fae53ef7ced05a2270a0..9aa4a6ce9fbf0bd0a171cbc25c12440e54f753a3 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -230,7 +230,7 @@ vma_create(struct drm_i915_gem_object *obj, } static struct i915_vma * -vma_lookup(struct drm_i915_gem_object *obj, +i915_vma_lookup(struct drm_i915_gem_object *obj, struct i915_address_space *vm, const struct i915_ggtt_view *view) { @@ -278,7 +278,7 @@ i915_vma_instance(struct drm_i915_gem_object *obj, GEM_BUG_ON(!atomic_read(&vm->open)); spin_lock(&obj->vma.lock); - vma = vma_lookup(obj, vm, view); + vma = i915_vma_lookup(obj, vm, view); spin_unlock(&obj->vma.lock); /* vma_create() will resolve the race if another creates the vma */ diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index d79335506ecd3c3f5aa3beabde8e0dd74cef10a3..47551ab73ca8a03ab60448adb8fe0d8cee9aed5d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -37,7 +37,7 @@ */ /* un-comment DEBUG to enable pr_debug() statements */ -#define DEBUG +/* #define DEBUG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -64,11 +64,17 @@ static struct cpuidle_driver intel_idle_driver = { /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; +static unsigned int preferred_states_mask; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; -static bool disable_promotion_to_c1e; + +static enum { + C1E_PROMOTION_PRESERVE, + C1E_PROMOTION_ENABLE, + C1E_PROMOTION_DISABLE +} c1e_promotion = C1E_PROMOTION_PRESERVE; struct idle_cpu { struct cpuidle_state *state_table; @@ -88,6 +94,12 @@ static struct cpuidle_state *cpuidle_state_table __initdata; static unsigned int mwait_substates __initdata; +/* + * Enable interrupts before entering the C-state. On some platforms and for + * some C-states, this may measurably decrease interrupt latency. + */ +#define CPUIDLE_FLAG_IRQ_ENABLE BIT(14) + /* * Enable this state by default even if the ACPI _CST does not list it. */ @@ -115,9 +127,6 @@ static unsigned int mwait_substates __initdata; * If the local APIC timer is not known to be reliable in the target idle state, * enable one-shot tick broadcasting for the target CPU before executing MWAIT. * - * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to - * flushing user TLBs. - * * Must be called under local_irq_disable(). 
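* * If the target state has CPUIDLE_FLAG_IRQ_ENABLE set, interrupts are re-enabled before MWAIT is executed (see the flag's description above), trading idling with interrupts on for lower interrupt latency.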
*/ static __cpuidle int intel_idle(struct cpuidle_device *dev, @@ -127,6 +136,9 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, unsigned long eax = flg2MWAIT(state->flags); unsigned long ecx = 1; /* break on interrupt flag */ + if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) + local_irq_enable(); + mwait_idle_with_hints(eax, ecx); return index; @@ -698,7 +710,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, @@ -727,7 +739,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { { .name = "C1", .desc = "MWAIT 0x00", - .flags = MWAIT2flg(0x00), + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE, .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, @@ -744,8 +756,48 @@ static struct cpuidle_state icx_cstates[] __initdata = { .name = "C6", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 128, - .target_residency = 384, + .exit_latency = 170, + .target_residency = 600, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + +/* + * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice + * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in + * MSR_IA32_POWER_CTL. But in this case there is effectively no C1, because C1 + * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then + * both C1 and C1E requests end up with C1, so there is effectively no C1E. + * + * By default we enable C1 and disable C1E by marking it with + * 'CPUIDLE_FLAG_UNUSABLE'. + */ +static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | + CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 290, + .target_residency = 800, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { @@ -963,6 +1015,39 @@ static struct cpuidle_state dnv_cstates[] __initdata = { .enter = NULL } }; + +/* + * Note, depending on HW and FW revision, SnowRidge SoC may or may not support
+ */ +static struct cpuidle_state snr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 15, + .target_residency = 25, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 130, + .target_residency = 500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static const struct idle_cpu idle_cpu_nehalem __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, @@ -1062,6 +1147,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_spr __initconst = { + .state_table = spr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1084,6 +1175,12 @@ static const struct idle_cpu idle_cpu_dnv __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_snr __initconst = { + .state_table = snr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &idle_cpu_nhx), X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &idle_cpu_nehalem), @@ -1117,12 +1214,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &idle_cpu_skl), X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &idle_cpu_bxt), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &idle_cpu_dnv), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &idle_cpu_dnv), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &idle_cpu_snr), {} }; @@ -1444,6 +1543,68 @@ static void __init sklh_idle_state_table_update(void) skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE; /* C9-SKL */ } +/** + * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake + * idle states table. + */ +static void __init skx_idle_state_table_update(void) +{ + unsigned long long msr; + + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* + * 000b: C0/C1 (no package C-state support) + * 001b: C2 + * 010b: C6 (non-retention) + * 011b: C6 (retention) + * 111b: No Package C state limits. + */ + if ((msr & 0x7) < 2) { + /* + * Uses the CC6 + PC0 latency and 3 times of + * latency for target_residency if the PC6 + * is disabled in BIOS. This is consistent + * with how intel_idle driver uses _CST + * to set the target_residency. + */ + skx_cstates[2].exit_latency = 92; + skx_cstates[2].target_residency = 276; + } +} + +/** + * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. + */ +static void __init spr_idle_state_table_update(void) +{ + unsigned long long msr; + + /* Check if user prefers C1E over C1. 
*/ + if ((preferred_states_mask & BIT(2)) && + !(preferred_states_mask & BIT(1))) { + /* Disable C1 and enable C1E. */ + spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; + spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; + + /* Enable C1E using the "C1E promotion" bit. */ + c1e_promotion = C1E_PROMOTION_ENABLE; + } + + /* + * By default, the C6 state assumes the worst-case scenario of package + * C6. However, if PC6 is disabled, we update the numbers to match + * core C6. + */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr); + + /* Limit value 2 and above allow for PC6. */ + if ((msr & 0x7) < 2) { + spr_cstates[2].exit_latency = 190; + spr_cstates[2].target_residency = 600; + } +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1; @@ -1475,6 +1636,12 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SKYLAKE: sklh_idle_state_table_update(); break; + case INTEL_FAM6_SKYLAKE_X: + skx_idle_state_table_update(); + break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + spr_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { @@ -1547,6 +1714,15 @@ static void auto_demotion_disable(void) wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits); } +static void c1e_promotion_enable(void) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits |= 0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -1578,7 +1754,9 @@ static int intel_idle_cpu_init(unsigned int cpu) if (auto_demotion_disable_flags) auto_demotion_disable(); - if (disable_promotion_to_c1e) + if (c1e_promotion == C1E_PROMOTION_ENABLE) + c1e_promotion_enable(); + else if (c1e_promotion == C1E_PROMOTION_DISABLE) c1e_promotion_disable(); return 0; @@ -1657,7 +1835,8 @@ static int __init intel_idle_init(void) if (icpu) { cpuidle_state_table = icpu->state_table; auto_demotion_disable_flags = icpu->auto_demotion_disable_flags; - disable_promotion_to_c1e = icpu->disable_promotion_to_c1e; + if (icpu->disable_promotion_to_c1e) + c1e_promotion = C1E_PROMOTION_DISABLE; if (icpu->use_acpi || force_use_acpi) intel_idle_acpi_cst_extract(); } else if (!intel_idle_acpi_cst_extract()) { @@ -1716,3 +1895,14 @@ module_param(max_cstate, int, 0444); */ module_param_named(states_off, disabled_states_mask, uint, 0444); MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); +/* + * Some platforms come with mutually exclusive C-states, so that if one is + * enabled, the other C-states must not be used. Example: C1 and C1E on + * Sapphire Rapids platform. This parameter allows for selecting the + * preferred C-states among the groups of mutually exclusive C-states - the + * selected C-states will be registered, the other C-states from the mutually + * exclusive group won't be registered. If the platform has no mutually + * exclusive C-states, this parameter has no effect. + */ +module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); +MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 15680c3c92792aaca3b3b8e83b8b4e87bf82d1fe..3f9f84f9f2880c68c8cf0e0cac511585d7aa401e 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -699,6 +699,16 @@ config MFD_INTEL_PMC_BXT Register and P-unit access. In addition this creates devices for iTCO watchdog and telemetry that are part of the PMC. 
+config MFD_INTEL_PMT + tristate "Intel Platform Monitoring Technology (PMT) support" + depends on PCI + select MFD_CORE + help + The Intel Platform Monitoring Technology (PMT) is an interface that + provides access to hardware monitor registers. This driver supports + Telemetry, Watcher, and Crashlog PMT capabilities/devices for + platforms starting from Tiger Lake. + config MFD_IPAQ_MICRO bool "Atmel Micro ASIC (iPAQ h3100/h3600/h3700) Support" depends on SA1100_H3100 || SA1100_H3600 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index fb1df45a301e5d0d1ed21eaeb15419f79ede0558..ce8f1c0583d5ccf741f663bcb7d3563d09767b42 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -216,6 +216,7 @@ obj-$(CONFIG_MFD_INTEL_LPSS_PCI) += intel-lpss-pci.o obj-$(CONFIG_MFD_INTEL_LPSS_ACPI) += intel-lpss-acpi.o obj-$(CONFIG_MFD_INTEL_MSIC) += intel_msic.o obj-$(CONFIG_MFD_INTEL_PMC_BXT) += intel_pmc_bxt.o +obj-$(CONFIG_MFD_INTEL_PMT) += intel_pmt.o obj-$(CONFIG_MFD_PALMAS) += palmas.o obj-$(CONFIG_MFD_VIPERBOARD) += viperboard.o obj-$(CONFIG_MFD_RC5T583) += rc5t583.o rc5t583-irq.o diff --git a/drivers/mfd/intel_pmt.c b/drivers/mfd/intel_pmt.c new file mode 100644 index 0000000000000000000000000000000000000000..dd7eb614c28e47d56fd88fb80cf501e0e67fd12b --- /dev/null +++ b/drivers/mfd/intel_pmt.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology PMT driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: David E. Box + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Intel DVSEC capability vendor space offsets */ +#define INTEL_DVSEC_ENTRIES 0xA +#define INTEL_DVSEC_SIZE 0xB +#define INTEL_DVSEC_TABLE 0xC +#define INTEL_DVSEC_TABLE_BAR(x) ((x) & GENMASK(2, 0)) +#define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) +#define INTEL_DVSEC_ENTRY_SIZE 4 + +/* PMT capabilities */ +#define DVSEC_INTEL_ID_TELEMETRY 2 +#define DVSEC_INTEL_ID_WATCHER 3 +#define DVSEC_INTEL_ID_CRASHLOG 4 + +struct intel_dvsec_header { + u16 length; + u16 id; + u8 num_entries; + u8 entry_size; + u8 tbir; + u32 offset; +}; + +enum pmt_quirks { + /* Watcher capability not supported */ + PMT_QUIRK_NO_WATCHER = BIT(0), + + /* Crashlog capability not supported */ + PMT_QUIRK_NO_CRASHLOG = BIT(1), + + /* Use shift instead of mask to read discovery table offset */ + PMT_QUIRK_TABLE_SHIFT = BIT(2), + + /* DVSEC not present (provided in driver data) */ + PMT_QUIRK_NO_DVSEC = BIT(3), +}; + +struct pmt_platform_info { + unsigned long quirks; + struct intel_dvsec_header **capabilities; +}; + +static const struct pmt_platform_info tgl_info = { + .quirks = PMT_QUIRK_NO_WATCHER | PMT_QUIRK_NO_CRASHLOG | + PMT_QUIRK_TABLE_SHIFT, +}; + +/* DG1 Platform with DVSEC quirk */ +static struct intel_dvsec_header dg1_telemetry = { + .length = 0x10, + .id = 2, + .num_entries = 1, + .entry_size = 3, + .tbir = 0, + .offset = 0x466000, +}; + +static struct intel_dvsec_header *dg1_capabilities[] = { + &dg1_telemetry, + NULL +}; + +static const struct pmt_platform_info dg1_info = { + .quirks = PMT_QUIRK_NO_DVSEC, + .capabilities = dg1_capabilities, +}; + +static int pmt_add_dev(struct pci_dev *pdev, struct intel_dvsec_header *header, + unsigned long quirks) +{ + struct device *dev = &pdev->dev; + struct resource *res, *tmp; + struct mfd_cell *cell; + const char *name; + int count = header->num_entries; + int size = header->entry_size; + int id = header->id; + int i; + + switch (id) { + case
DVSEC_INTEL_ID_TELEMETRY: + name = "pmt_telemetry"; + break; + case DVSEC_INTEL_ID_WATCHER: + if (quirks & PMT_QUIRK_NO_WATCHER) { + dev_info(dev, "Watcher not supported\n"); + return -EINVAL; + } + name = "pmt_watcher"; + break; + case DVSEC_INTEL_ID_CRASHLOG: + if (quirks & PMT_QUIRK_NO_CRASHLOG) { + dev_info(dev, "Crashlog not supported\n"); + return -EINVAL; + } + name = "pmt_crashlog"; + break; + default: + return -EINVAL; + } + + if (!header->num_entries || !header->entry_size) { + dev_err(dev, "Invalid count or size for %s header\n", name); + return -EINVAL; + } + + cell = devm_kzalloc(dev, sizeof(*cell), GFP_KERNEL); + if (!cell) + return -ENOMEM; + + res = devm_kcalloc(dev, count, sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + if (quirks & PMT_QUIRK_TABLE_SHIFT) + header->offset >>= 3; + + /* + * The PMT DVSEC contains the starting offset and count for a block of + * discovery tables, each providing access to monitoring facilities for + * a section of the device. Create a resource list of these tables to + * provide to the driver. + */ + for (i = 0, tmp = res; i < count; i++, tmp++) { + tmp->start = pdev->resource[header->tbir].start + + header->offset + i * (size << 2); + tmp->end = tmp->start + (size << 2) - 1; + tmp->flags = IORESOURCE_MEM; + } + + cell->resources = res; + cell->num_resources = count; + cell->name = name; + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, cell, 1, NULL, 0, + NULL); +} + +static int pmt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct pmt_platform_info *info; + unsigned long quirks = 0; + bool found_devices = false; + int ret, pos = 0; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + info = (struct pmt_platform_info *)id->driver_data; + + if (info) + quirks = info->quirks; + + if (info && (info->quirks & PMT_QUIRK_NO_DVSEC)) { + struct intel_dvsec_header **header; + + header = info->capabilities; + while (*header) { + ret = pmt_add_dev(pdev, *header, quirks); + if (ret) + dev_warn(&pdev->dev, + "Failed to add device for DVSEC id %d\n", + (*header)->id); + else + found_devices = true; + + ++header; + } + } else { + do { + struct intel_dvsec_header header; + u32 table; + u16 vid; + + pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_DVSEC); + if (!pos) + break; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vid); + if (vid != PCI_VENDOR_ID_INTEL) + continue; + + pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, + &header.id); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_ENTRIES, + &header.num_entries); + pci_read_config_byte(pdev, pos + INTEL_DVSEC_SIZE, + &header.entry_size); + pci_read_config_dword(pdev, pos + INTEL_DVSEC_TABLE, + &table); + + header.tbir = INTEL_DVSEC_TABLE_BAR(table); + header.offset = INTEL_DVSEC_TABLE_OFFSET(table); + + ret = pmt_add_dev(pdev, &header, quirks); + if (ret) + continue; + + found_devices = true; + } while (true); + } + + if (!found_devices) + return -ENODEV; + + pm_runtime_put(&pdev->dev); + pm_runtime_allow(&pdev->dev); + + return 0; +} + +static void pmt_pci_remove(struct pci_dev *pdev) +{ + pm_runtime_forbid(&pdev->dev); + pm_runtime_get_sync(&pdev->dev); +} + +#define PCI_DEVICE_ID_INTEL_PMT_ADL 0x467d +#define PCI_DEVICE_ID_INTEL_PMT_DG1 0x490e +#define PCI_DEVICE_ID_INTEL_PMT_OOBMSM 0x09a7 +#define PCI_DEVICE_ID_INTEL_PMT_TGL 0x9a0d +static const struct pci_device_id pmt_pci_ids[] = { + { PCI_DEVICE_DATA(INTEL, PMT_ADL, &tgl_info) }, + { PCI_DEVICE_DATA(INTEL, PMT_DG1, &dg1_info) }, + { PCI_DEVICE_DATA(INTEL, 
PMT_OOBMSM, NULL) }, + { PCI_DEVICE_DATA(INTEL, PMT_TGL, &tgl_info) }, + { } +}; +MODULE_DEVICE_TABLE(pci, pmt_pci_ids); + +static struct pci_driver pmt_pci_driver = { + .name = "intel-pmt", + .id_table = pmt_pci_ids, + .probe = pmt_pci_probe, + .remove = pmt_pci_remove, +}; +module_pci_driver(pmt_pci_driver); + +MODULE_AUTHOR("David E. Box "); +MODULE_DESCRIPTION("Intel Platform Monitoring Technology PMT driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index a1858689d6e10b499ddbae04cc4f92b49a9999f9..a24783aa52eae475a78ffc768b93c95eca457c9e 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1362,6 +1362,42 @@ config INTEL_PMC_CORE - LTR Ignore - MPHY/PLL gating status (Sunrisepoint PCH only) +config INTEL_PMT_CLASS + tristate + help + The Intel Platform Monitoring Technology (PMT) class driver provides + the basic sysfs interface and file hierarchy used by PMT devices. + + For more information, see: + + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_class. + +config INTEL_PMT_TELEMETRY + tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver" + depends on MFD_INTEL_PMT + select INTEL_PMT_CLASS + help + The Intel Platform Monitoring Technology (PMT) Telemetry driver provides + access to hardware telemetry metrics on devices that support the + feature. + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_telemetry. + +config INTEL_PMT_CRASHLOG + tristate "Intel Platform Monitoring Technology (PMT) Crashlog driver" + depends on MFD_INTEL_PMT + select INTEL_PMT_CLASS + help + The Intel Platform Monitoring Technology (PMT) crashlog driver provides + access to hardware crashlog capabilities on devices that support the + feature. + + To compile this driver as a module, choose M here: the module + will be called intel_pmt_crashlog. + config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" help diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 5f823f7eff45280248f01197cd086fc81d3de66b..ca82c1344977caf10a56f6602315a5c54c3d8716 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -140,6 +140,9 @@ obj-$(CONFIG_INTEL_MFLD_THERMAL) += intel_mid_thermal.o obj-$(CONFIG_INTEL_MID_POWER_BUTTON) += intel_mid_powerbtn.o obj-$(CONFIG_INTEL_MRFLD_PWRBTN) += intel_mrfld_pwrbtn.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o +obj-$(CONFIG_INTEL_PMT_CLASS) += intel_pmt_class.o +obj-$(CONFIG_INTEL_PMT_TELEMETRY) += intel_pmt_telemetry.o +obj-$(CONFIG_INTEL_PMT_CRASHLOG) += intel_pmt_crashlog.o obj-$(CONFIG_INTEL_PUNIT_IPC) += intel_punit_ipc.o obj-$(CONFIG_INTEL_SCU_IPC) += intel_scu_ipc.o obj-$(CONFIG_INTEL_SCU_PCI) += intel_scu_pcidrv.o diff --git a/drivers/platform/x86/intel_pmt_class.c b/drivers/platform/x86/intel_pmt_class.c new file mode 100644 index 0000000000000000000000000000000000000000..c86ff15b1ed522be06f2c670a2f62c3bef70aa7d --- /dev/null +++ b/drivers/platform/x86/intel_pmt_class.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology (PMT) class driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved.
+ * + * Author: "Alexander Duyck" + */ + +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define PMT_XA_START 0 +#define PMT_XA_MAX INT_MAX +#define PMT_XA_LIMIT XA_LIMIT(PMT_XA_START, PMT_XA_MAX) + +/* + * Early implementations of PMT on client platforms have some + * differences from the server platforms (which use the Out Of Band + * Management Services Module OOBMSM). This list tracks those + * platforms as needed to handle those differences. Newer client + * platforms are expected to be fully compatible with server. + */ +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x467d) }, /* ADL */ + { PCI_VDEVICE(INTEL, 0x490e) }, /* DG1 */ + { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */ + { } +}; + +bool intel_pmt_is_early_client_hw(struct device *dev) +{ + struct pci_dev *parent = to_pci_dev(dev->parent); + + return !!pci_match_id(pmt_telem_early_client_pci_ids, parent); +} +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw); + +/* + * sysfs + */ +static ssize_t +intel_pmt_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, + size_t count) +{ + struct intel_pmt_entry *entry = container_of(attr, + struct intel_pmt_entry, + pmt_bin_attr); + + if (off < 0) + return -EINVAL; + + if (off >= entry->size) + return 0; + + if (count > entry->size - off) + count = entry->size - off; + + memcpy_fromio(buf, entry->base + off, count); + + return count; +} + +static int +intel_pmt_mmap(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) +{ + struct intel_pmt_entry *entry = container_of(attr, + struct intel_pmt_entry, + pmt_bin_attr); + unsigned long vsize = vma->vm_end - vma->vm_start; + struct device *dev = kobj_to_dev(kobj); + unsigned long phys = entry->base_addr; + unsigned long pfn = PFN_DOWN(phys); + unsigned long psize; + + if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) + return -EROFS; + + psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE; + if (vsize > psize) { + dev_err(dev, "Requested mmap size is too large\n"); + return -EINVAL; + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + vsize, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static ssize_t +guid_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "0x%x\n", entry->guid); +} +static DEVICE_ATTR_RO(guid); + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "%zu\n", entry->size); +} +static DEVICE_ATTR_RO(size); + +static ssize_t +offset_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + + return sprintf(buf, "%lu\n", offset_in_page(entry->base_addr)); +} +static DEVICE_ATTR_RO(offset); + +static struct attribute *intel_pmt_attrs[] = { + &dev_attr_guid.attr, + &dev_attr_size.attr, + &dev_attr_offset.attr, + NULL +}; +ATTRIBUTE_GROUPS(intel_pmt); + +static struct class intel_pmt_class = { + .name = "intel_pmt", + .owner = THIS_MODULE, + .dev_groups = intel_pmt_groups, +}; + +static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev, + struct resource *disc_res) +{ + struct pci_dev *pci_dev = to_pci_dev(dev->parent); + u8 bir; + + /* + * 
The base offset should always be 8 byte aligned. + * + * For non-local access types the lower 3 bits of base offset + * contain the index of the base address register where the + * telemetry can be found. + */ + bir = GET_BIR(header->base_offset); + + /* Local access and BARID only for now */ + switch (header->access_type) { + case ACCESS_LOCAL: + if (bir) { + dev_err(dev, + "Unsupported BAR index %d for access type %d\n", + bir, header->access_type); + return -EINVAL; + } + /* + * For access_type LOCAL, the base address is as follows: + * base address = end of discovery region + base offset + */ + entry->base_addr = disc_res->end + 1 + header->base_offset; + + /* + * Some hardware uses a different calculation for the base address + * when access_type == ACCESS_LOCAL. On these systems + * ACCESS_LOCAL refers to an address in the same BAR as the + * header but at a fixed offset. But as the header address was + * supplied to the driver, we don't know which BAR it was in. + * So search for the BAR whose range includes the header address. + */ + if (intel_pmt_is_early_client_hw(dev)) { + int i; + + entry->base_addr = 0; + for (i = 0; i < 6; i++) + if (disc_res->start >= pci_resource_start(pci_dev, i) && + (disc_res->start <= pci_resource_end(pci_dev, i))) { + entry->base_addr = pci_resource_start(pci_dev, i) + + header->base_offset; + break; + } + if (!entry->base_addr) + return -EINVAL; + } + + break; + case ACCESS_BARID: + /* + * If another BAR was specified then the base offset + * represents the offset within that BAR. So retrieve the + * address from the parent PCI device and add offset. + */ + entry->base_addr = pci_resource_start(pci_dev, bir) + + GET_ADDRESS(header->base_offset); + break; + default: + dev_err(dev, "Unsupported access type %d\n", + header->access_type); + return -EINVAL; + } + + entry->guid = header->guid; + entry->size = header->size; + + return 0; +} + +static int intel_pmt_dev_register(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct device *parent) +{ + struct resource res = {0}; + struct device *dev; + int ret; + + ret = xa_alloc(ns->xa, &entry->devid, entry, PMT_XA_LIMIT, GFP_KERNEL); + if (ret) + return ret; + + dev = device_create(&intel_pmt_class, parent, MKDEV(0, 0), entry, + "%s%d", ns->name, entry->devid); + + if (IS_ERR(dev)) { + dev_err(parent, "Could not create %s%d device node\n", + ns->name, entry->devid); + ret = PTR_ERR(dev); + goto fail_dev_create; + } + + entry->kobj = &dev->kobj; + + if (ns->attr_grp) { + ret = sysfs_create_group(entry->kobj, ns->attr_grp); + if (ret) + goto fail_sysfs; + } + + /* if size is 0 assume no data buffer, so no file needed */ + if (!entry->size) + return 0; + + res.start = entry->base_addr; + res.end = res.start + entry->size - 1; + res.flags = IORESOURCE_MEM; + + entry->base = devm_ioremap_resource(dev, &res); + if (IS_ERR(entry->base)) { + ret = PTR_ERR(entry->base); + goto fail_ioremap; + } + + sysfs_bin_attr_init(&entry->pmt_bin_attr); + entry->pmt_bin_attr.attr.name = ns->name; + entry->pmt_bin_attr.attr.mode = 0440; + entry->pmt_bin_attr.mmap = intel_pmt_mmap; + entry->pmt_bin_attr.read = intel_pmt_read; + entry->pmt_bin_attr.size = entry->size; + + ret = sysfs_create_bin_file(&dev->kobj, &entry->pmt_bin_attr); + if (!ret) + return 0; + +fail_ioremap: + if (ns->attr_grp) + sysfs_remove_group(entry->kobj, ns->attr_grp); +fail_sysfs: + device_unregister(dev); +fail_dev_create: + xa_erase(ns->xa, entry->devid); + + return ret; +} + +int intel_pmt_dev_create(struct intel_pmt_entry *entry, 
+ struct intel_pmt_namespace *ns, + struct platform_device *pdev, int idx) +{ + struct intel_pmt_header header; + struct resource *disc_res; + int ret = -ENODEV; + + disc_res = platform_get_resource(pdev, IORESOURCE_MEM, idx); + if (!disc_res) + return ret; + + entry->disc_table = devm_platform_ioremap_resource(pdev, idx); + if (IS_ERR(entry->disc_table)) + return PTR_ERR(entry->disc_table); + + ret = ns->pmt_header_decode(entry, &header, &pdev->dev); + if (ret) + return ret; + + ret = intel_pmt_populate_entry(entry, &header, &pdev->dev, disc_res); + if (ret) + return ret; + + return intel_pmt_dev_register(entry, ns, &pdev->dev); +} +EXPORT_SYMBOL_GPL(intel_pmt_dev_create); + +void intel_pmt_dev_destroy(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns) +{ + struct device *dev = kobj_to_dev(entry->kobj); + + if (entry->size) + sysfs_remove_bin_file(entry->kobj, &entry->pmt_bin_attr); + + if (ns->attr_grp) + sysfs_remove_group(entry->kobj, ns->attr_grp); + + device_unregister(dev); + xa_erase(ns->xa, entry->devid); +} +EXPORT_SYMBOL_GPL(intel_pmt_dev_destroy); + +static int __init pmt_class_init(void) +{ + return class_register(&intel_pmt_class); +} + +static void __exit pmt_class_exit(void) +{ + class_unregister(&intel_pmt_class); +} + +module_init(pmt_class_init); +module_exit(pmt_class_exit); + +MODULE_AUTHOR("Alexander Duyck <alexander.h.duyck@linux.intel.com>"); +MODULE_DESCRIPTION("Intel PMT Class driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel_pmt_class.h b/drivers/platform/x86/intel_pmt_class.h new file mode 100644 index 0000000000000000000000000000000000000000..1337019c2873eb37b3abda073700228437bb0b36 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_class.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _INTEL_PMT_CLASS_H +#define _INTEL_PMT_CLASS_H + +#include <linux/device.h> +#include <linux/io.h> +#include <linux/platform_device.h> +#include <linux/sysfs.h> +#include <linux/types.h> +#include <linux/xarray.h> + +/* PMT access types */ +#define ACCESS_BARID 2 +#define ACCESS_LOCAL 3 + +/* PMT discovery base address/offset register layout */ +#define GET_BIR(v) ((v) & GENMASK(2, 0)) +#define GET_ADDRESS(v) ((v) & GENMASK(31, 3)) + +struct intel_pmt_entry { + struct bin_attribute pmt_bin_attr; + struct kobject *kobj; + void __iomem *disc_table; + void __iomem *base; + unsigned long base_addr; + size_t size; + u32 guid; + int devid; +}; + +struct intel_pmt_header { + u32 base_offset; + u32 size; + u32 guid; + u8 access_type; +}; + +struct intel_pmt_namespace { + const char *name; + struct xarray *xa; + const struct attribute_group *attr_grp; + int (*pmt_header_decode)(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev); +}; + +bool intel_pmt_is_early_client_hw(struct device *dev); +int intel_pmt_dev_create(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns, + struct platform_device *pdev, int idx); +void intel_pmt_dev_destroy(struct intel_pmt_entry *entry, + struct intel_pmt_namespace *ns); +#endif diff --git a/drivers/platform/x86/intel_pmt_crashlog.c b/drivers/platform/x86/intel_pmt_crashlog.c new file mode 100644 index 0000000000000000000000000000000000000000..56963ceb6345f88408be9d686ac7f1c3fa057251 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_crashlog.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology Crashlog driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. 
+ * + * Author: "Alexander Duyck" <alexander.h.duyck@linux.intel.com> + */ + +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/overflow.h> +#include <linux/platform_device.h> + +#include "intel_pmt_class.h" + +#define DRV_NAME "pmt_crashlog" + +/* Crashlog discovery header types */ +#define CRASH_TYPE_OOBMSM 1 + +/* Control Flags */ +#define CRASHLOG_FLAG_DISABLE BIT(28) + +/* + * Bits 29 and 30 control the state of bit 31. + * + * Bit 29 will clear bit 31, if set, allowing a new crashlog to be captured. + * Bit 30 will immediately trigger a crashlog to be generated, setting bit 31. + * Bit 31 is the read-only status with a 1 indicating log is complete. + */ +#define CRASHLOG_FLAG_TRIGGER_CLEAR BIT(29) +#define CRASHLOG_FLAG_TRIGGER_EXECUTE BIT(30) +#define CRASHLOG_FLAG_TRIGGER_COMPLETE BIT(31) +#define CRASHLOG_FLAG_TRIGGER_MASK GENMASK(31, 28) + +/* Crashlog Discovery Header */ +#define CONTROL_OFFSET 0x0 +#define GUID_OFFSET 0x4 +#define BASE_OFFSET 0x8 +#define SIZE_OFFSET 0xC +#define GET_ACCESS(v) ((v) & GENMASK(3, 0)) +#define GET_TYPE(v) (((v) & GENMASK(7, 4)) >> 4) +#define GET_VERSION(v) (((v) & GENMASK(19, 16)) >> 16) +/* size is measured in DWORDS, but accessor returns bytes */ +#define GET_SIZE(v) ((v) * sizeof(u32)) + +struct crashlog_entry { + /* entry must be first member of struct */ + struct intel_pmt_entry entry; + struct mutex control_mutex; +}; + +struct pmt_crashlog_priv { + int num_entries; + struct crashlog_entry entry[]; +}; + +/* + * I/O + */ +static bool pmt_crashlog_complete(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* return current value of the crashlog complete flag */ + return !!(control & CRASHLOG_FLAG_TRIGGER_COMPLETE); +} + +static bool pmt_crashlog_disabled(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* return current value of the crashlog disabled flag */ + return !!(control & CRASHLOG_FLAG_DISABLE); +} + +static bool pmt_crashlog_supported(struct intel_pmt_entry *entry) +{ + u32 discovery_header = readl(entry->disc_table + CONTROL_OFFSET); + u32 crash_type, version; + + crash_type = GET_TYPE(discovery_header); + version = GET_VERSION(discovery_header); + + /* + * Currently we only recognize OOBMSM version 0 devices. + * We can ignore all other crashlog devices in the system. 
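+ * + * CRASH_TYPE_OOBMSM identifies the crashlog implementation of the + * server Out Of Band Management Services Module (OOBMSM); any other + * type or version would need explicit support before being exposed + * through this driver.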
+ */ + return crash_type == CRASH_TYPE_OOBMSM && version == 0; +} + +static void pmt_crashlog_set_disable(struct intel_pmt_entry *entry, + bool disable) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + /* clear trigger bits so we are only modifying disable flag */ + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + + if (disable) + control |= CRASHLOG_FLAG_DISABLE; + else + control &= ~CRASHLOG_FLAG_DISABLE; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +static void pmt_crashlog_set_clear(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + control |= CRASHLOG_FLAG_TRIGGER_CLEAR; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +static void pmt_crashlog_set_execute(struct intel_pmt_entry *entry) +{ + u32 control = readl(entry->disc_table + CONTROL_OFFSET); + + control &= ~CRASHLOG_FLAG_TRIGGER_MASK; + control |= CRASHLOG_FLAG_TRIGGER_EXECUTE; + + writel(control, entry->disc_table + CONTROL_OFFSET); +} + +/* + * sysfs + */ +static ssize_t +enable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry = dev_get_drvdata(dev); + int enabled = !pmt_crashlog_disabled(entry); + + return sprintf(buf, "%d\n", enabled); +} + +static ssize_t +enable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct crashlog_entry *entry; + bool enabled; + int result; + + entry = dev_get_drvdata(dev); + + result = kstrtobool(buf, &enabled); + if (result) + return result; + + mutex_lock(&entry->control_mutex); + pmt_crashlog_set_disable(&entry->entry, !enabled); + mutex_unlock(&entry->control_mutex); + + return count; +} +static DEVICE_ATTR_RW(enable); + +static ssize_t +trigger_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct intel_pmt_entry *entry; + int trigger; + + entry = dev_get_drvdata(dev); + trigger = pmt_crashlog_complete(entry); + + return sprintf(buf, "%d\n", trigger); +} + +static ssize_t +trigger_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct crashlog_entry *entry; + bool trigger; + int result; + + entry = dev_get_drvdata(dev); + + result = kstrtobool(buf, &trigger); + if (result) + return result; + + mutex_lock(&entry->control_mutex); + + if (!trigger) { + pmt_crashlog_set_clear(&entry->entry); + } else if (pmt_crashlog_complete(&entry->entry)) { + /* we cannot trigger a new crash if one is still pending */ + result = -EEXIST; + goto err; + } else if (pmt_crashlog_disabled(&entry->entry)) { + /* if device is currently disabled, return busy */ + result = -EBUSY; + goto err; + } else { + pmt_crashlog_set_execute(&entry->entry); + } + + result = count; +err: + mutex_unlock(&entry->control_mutex); + return result; +} +static DEVICE_ATTR_RW(trigger); + +static struct attribute *pmt_crashlog_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_trigger.attr, + NULL +}; + +static const struct attribute_group pmt_crashlog_group = { + .attrs = pmt_crashlog_attrs, +}; + +static int pmt_crashlog_header_decode(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev) +{ + void __iomem *disc_table = entry->disc_table; + struct crashlog_entry *crashlog; + + if (!pmt_crashlog_supported(entry)) + return 1; + + /* initialize control mutex */ + crashlog = container_of(entry, struct crashlog_entry, entry); + mutex_init(&crashlog->control_mutex); + + header->access_type = 
GET_ACCESS(readl(disc_table)); + header->guid = readl(disc_table + GUID_OFFSET); + header->base_offset = readl(disc_table + BASE_OFFSET); + + /* Size is measured in DWORDS, but accessor returns bytes */ + header->size = GET_SIZE(readl(disc_table + SIZE_OFFSET)); + + return 0; +} + +static DEFINE_XARRAY_ALLOC(crashlog_array); +static struct intel_pmt_namespace pmt_crashlog_ns = { + .name = "crashlog", + .xa = &crashlog_array, + .attr_grp = &pmt_crashlog_group, + .pmt_header_decode = pmt_crashlog_header_decode, +}; + +/* + * initialization + */ +static int pmt_crashlog_remove(struct platform_device *pdev) +{ + struct pmt_crashlog_priv *priv = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < priv->num_entries; i++) + intel_pmt_dev_destroy(&priv->entry[i].entry, &pmt_crashlog_ns); + + return 0; +} + +static int pmt_crashlog_probe(struct platform_device *pdev) +{ + struct pmt_crashlog_priv *priv; + size_t size; + int i, ret; + + size = struct_size(priv, entry, pdev->num_resources); + priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!priv) + return -ENOMEM; + + platform_set_drvdata(pdev, priv); + + for (i = 0; i < pdev->num_resources; i++) { + struct intel_pmt_entry *entry = &priv->entry[i].entry; + + ret = intel_pmt_dev_create(entry, &pmt_crashlog_ns, pdev, i); + if (ret < 0) + goto abort_probe; + if (ret) + continue; + + priv->num_entries++; + } + + return 0; +abort_probe: + pmt_crashlog_remove(pdev); + return ret; +} + +static struct platform_driver pmt_crashlog_driver = { + .driver = { + .name = DRV_NAME, + }, + .remove = pmt_crashlog_remove, + .probe = pmt_crashlog_probe, +}; + +static int __init pmt_crashlog_init(void) +{ + return platform_driver_register(&pmt_crashlog_driver); +} + +static void __exit pmt_crashlog_exit(void) +{ + platform_driver_unregister(&pmt_crashlog_driver); + xa_destroy(&crashlog_array); +} + +module_init(pmt_crashlog_init); +module_exit(pmt_crashlog_exit); + +MODULE_AUTHOR("Alexander Duyck <alexander.h.duyck@linux.intel.com>"); +MODULE_DESCRIPTION("Intel PMT Crashlog driver"); +MODULE_ALIAS("platform:" DRV_NAME); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel_pmt_telemetry.c b/drivers/platform/x86/intel_pmt_telemetry.c new file mode 100644 index 0000000000000000000000000000000000000000..9f845e70a1f84f9d54fbf07896a786ecc74b7307 --- /dev/null +++ b/drivers/platform/x86/intel_pmt_telemetry.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Platform Monitoring Technology Telemetry driver + * + * Copyright (c) 2020, Intel Corporation. + * All Rights Reserved. + * + * Author: "David E. 
Box" + */ + +#include +#include +#include +#include +#include +#include + +#include "intel_pmt_class.h" + +#define TELEM_DEV_NAME "pmt_telemetry" + +#define TELEM_SIZE_OFFSET 0x0 +#define TELEM_GUID_OFFSET 0x4 +#define TELEM_BASE_OFFSET 0x8 +#define TELEM_ACCESS(v) ((v) & GENMASK(3, 0)) +/* size is in bytes */ +#define TELEM_SIZE(v) (((v) & GENMASK(27, 12)) >> 10) + +/* Used by client hardware to identify a fixed telemetry entry*/ +#define TELEM_CLIENT_FIXED_BLOCK_GUID 0x10000000 + +struct pmt_telem_priv { + int num_entries; + struct intel_pmt_entry entry[]; +}; + +static bool pmt_telem_region_overlaps(struct intel_pmt_entry *entry, + struct device *dev) +{ + u32 guid = readl(entry->disc_table + TELEM_GUID_OFFSET); + + if (guid != TELEM_CLIENT_FIXED_BLOCK_GUID) + return false; + + return intel_pmt_is_early_client_hw(dev); +} + +static int pmt_telem_header_decode(struct intel_pmt_entry *entry, + struct intel_pmt_header *header, + struct device *dev) +{ + void __iomem *disc_table = entry->disc_table; + + if (pmt_telem_region_overlaps(entry, dev)) + return 1; + + header->access_type = TELEM_ACCESS(readl(disc_table)); + header->guid = readl(disc_table + TELEM_GUID_OFFSET); + header->base_offset = readl(disc_table + TELEM_BASE_OFFSET); + + /* Size is measured in DWORDS, but accessor returns bytes */ + header->size = TELEM_SIZE(readl(disc_table)); + + /* + * Some devices may expose non-functioning entries that are + * reserved for future use. They have zero size. Do not fail + * probe for these. Just ignore them. + */ + if (header->size == 0) + return 1; + + return 0; +} + +static DEFINE_XARRAY_ALLOC(telem_array); +static struct intel_pmt_namespace pmt_telem_ns = { + .name = "telem", + .xa = &telem_array, + .pmt_header_decode = pmt_telem_header_decode, +}; + +static int pmt_telem_remove(struct platform_device *pdev) +{ + struct pmt_telem_priv *priv = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < priv->num_entries; i++) + intel_pmt_dev_destroy(&priv->entry[i], &pmt_telem_ns); + + return 0; +} + +static int pmt_telem_probe(struct platform_device *pdev) +{ + struct pmt_telem_priv *priv; + size_t size; + int i, ret; + + size = struct_size(priv, entry, pdev->num_resources); + priv = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!priv) + return -ENOMEM; + + platform_set_drvdata(pdev, priv); + + for (i = 0; i < pdev->num_resources; i++) { + struct intel_pmt_entry *entry = &priv->entry[i]; + + ret = intel_pmt_dev_create(entry, &pmt_telem_ns, pdev, i); + if (ret < 0) + goto abort_probe; + if (ret) + continue; + + priv->num_entries++; + } + + return 0; +abort_probe: + pmt_telem_remove(pdev); + return ret; +} + +static struct platform_driver pmt_telem_driver = { + .driver = { + .name = TELEM_DEV_NAME, + }, + .remove = pmt_telem_remove, + .probe = pmt_telem_probe, +}; + +static int __init pmt_telem_init(void) +{ + return platform_driver_register(&pmt_telem_driver); +} +module_init(pmt_telem_init); + +static void __exit pmt_telem_exit(void) +{ + platform_driver_unregister(&pmt_telem_driver); + xa_destroy(&telem_array); +} +module_exit(pmt_telem_exit); + +MODULE_AUTHOR("David E. 
Box "); +MODULE_DESCRIPTION("Intel PMT Telemetry driver"); +MODULE_ALIAS("platform:" TELEM_DEV_NAME); +MODULE_LICENSE("GPL v2"); diff --git a/find_commits_in_repo.sh b/find_commits_in_repo.sh new file mode 100755 index 0000000000000000000000000000000000000000..6527d10ae53e768d22bf73c1988b6a55c5de3d0b --- /dev/null +++ b/find_commits_in_repo.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +#set -x + +PATCH_DIR=/local/work/SGX/SGX-host-virt-Backport-UniCloud-20220317 +PATCH_DIR_MISS="${PATCH_DIR}.miss" +REPO_DIR="$(pwd)" + +function copy_missing_patches() +{ + local missing_commits=$1 + + rm -rf ${PATCH_DIR_MISS} + mkdir -p ${PATCH_DIR_MISS} + + OLDIFS=$IFS + IFS=$'\n' + for i in $(echo -e "$missing_commits"); do + local p=$(echo $i | cut -d':' -f1) + cp ${PATCH_DIR}/${p} ${PATCH_DIR_MISS} + done + IFS=$OLDIFS +} + +function main() +{ + local line="" + local missing_commits="" + local commit_list="$(grep -o "^commit .\+ upstream" ${PATCH_DIR}/*)" + local total_commit_nr=0 + local find_commit_nr=0 + + cd $REPO_DIR + echo "Find commits:" + echo "----------------------------------------+------------+--------------------" + echo "Old Commit | New Commit | Subject" + echo "----------------------------------------|------------|--------------------" + OLDIFS=$IFS + IFS=$'\n' + for i in $commit_list; do + local c=$(echo $i | cut -f2 -d" ") + (( ++total_commit_nr )) + if git merge-base --is-ancestor $c HEAD 2>/dev/null ; then + # Current branch contains the commit directly + line="$(git log $c -1 --oneline)" + else + # The commit might have been backported to current branch + line="$(git log --grep=$c --oneline)" + fi + if [ -n "$line" ]; then + (( ++find_commit_nr )) + # Replace the first space to "|" + line=${line/ /|} + echo "$c|$line" + else + missing_commits="${missing_commits}\n$(basename ${i})" + fi + done + IFS=$OLDIFS + + local missing_commit_nr=$(( $total_commit_nr - $find_commit_nr )) + echo -e "\nMissing commits (${missing_commit_nr}/${total_commit_nr}):" + echo -n "-------------------------------" + echo -e "$missing_commits" + + read -p "Do you want to copy the missing patches to [${PATCH_DIR_MISS}]? [y/N]" answer + if [ X"$answer" == X"y" ]; then + copy_missing_patches "$missing_commits" + else + echo "Do nothing" + fi +} + +main "$*" diff --git a/include/linux/mm.h b/include/linux/mm.h index a886f48b6a0e293aa734a8242732e2300c1e8dcb..b0ddfac0486b7da51f8e97a0e75dae6e3343f251 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2712,17 +2712,45 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. 
+ */ +static inline +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) { - struct vm_area_struct * vma = find_vma(mm,start_addr); + struct vm_area_struct *vma = find_vma(mm, start_addr); if (vma && end_addr <= vma->vm_start) vma = NULL; return vma; } +/** + * vma_lookup() - Find a VMA at a specific address + * @mm: The process address space. + * @addr: The user address. + * + * Return: The vm_area_struct at the given address, %NULL otherwise. + */ +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && addr < vma->vm_start) + vma = NULL; + + return vma; +} + static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long vm_start = vma->vm_start; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3af8b0164f1e6b2fa07469778aa31f47d1e39645..4e0ebd66368fbeb804aa74fd3d2a015b4c770f29 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1061,6 +1061,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 +#define KVM_CAP_SGX_ATTRIBUTE 196 #define KVM_CAP_ARM_CPU_FEATURE 555 diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 7e0d526dd96f3308aa33038b41280a06956e68c2..9e257fe9efa536e05044608d5718580be730d80b 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -729,6 +729,7 @@ #define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ +#define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT @@ -1079,6 +1080,10 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +/* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ +#define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ +#define PCI_DVSEC_HEADER2 0x8 /* Designated Vendor-Specific Header2 */ + /* Data Link Feature */ #define PCI_DLF_CAP 0x04 /* Capabilities Register */ #define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */
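For reference, the class interface created by intel_pmt_dev_register() can be exercised entirely from user space. The following is a minimal sketch, not part of the patch set: "telem0" is an assumed instance name (the class driver creates telemN/crashlogN nodes as instances are discovered), and the caller is assumed to have read access to the 0440 binary attribute.

	/* pmt_dump.c - illustrative only; "telem0" is an assumed instance */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static unsigned long read_ulong(const char *path)
	{
		unsigned long val = 0;
		FILE *f = fopen(path, "r");

		if (f) {
			if (fscanf(f, "%lu", &val) != 1)
				val = 0;
			fclose(f);
		}
		return val;
	}

	int main(void)
	{
		/* "size" and "offset" describe the mapping of the telem file */
		unsigned long size = read_ulong("/sys/class/intel_pmt/telem0/size");
		unsigned long off = read_ulong("/sys/class/intel_pmt/telem0/offset");
		int fd = open("/sys/class/intel_pmt/telem0/telem", O_RDONLY);
		uint32_t *map;

		if (fd < 0 || !size)
			return 1;

		/* PROT_READ only: intel_pmt_mmap() returns -EROFS for writable maps */
		map = mmap(NULL, off + size, PROT_READ, MAP_SHARED, fd, 0);
		if (map == MAP_FAILED)
			return 1;

		/* telemetry data starts "off" bytes into the first mapped page */
		printf("first dword: 0x%08x\n", (unsigned int)map[off / sizeof(uint32_t)]);

		munmap(map, off + size);
		close(fd);
		return 0;
	}

Plain reads of the same file go through intel_pmt_read(), which applies the base address itself, so a simple pread() of the telem file needs none of the page-offset bookkeeping shown above.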