diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 00000000000000..78a085327976cb --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,41 @@ +# This CI will only work for trusted contributors. CI for public contributors +# runs via a webhook on the merge requests. There's nothing you have to do if +# you want your changes tested -- created pipelines will be automatically +# linked in the merge request and appropriate labels will be added to it. +# Changes to this file will NOT be reflected in the testing. + +workflow: + rules: + - if: $CI_MERGE_REQUEST_ID + +trigger_pipeline: + variables: + git_url: ${CI_MERGE_REQUEST_PROJECT_URL} + branch: ${CI_MERGE_REQUEST_TARGET_BRANCH_NAME} + commit_hash: ${CI_COMMIT_SHA} + name: kernel-rhel8 + mr_id: ${CI_MERGE_REQUEST_IID} + mr_url: ${CI_MERGE_REQUEST_PROJECT_URL}/-/merge_requests/${CI_MERGE_REQUEST_IID} + title: ${CI_COMMIT_TITLE} + + send_ready_for_test_pre: 'True' + send_ready_for_test_post: 'True' + kernel_type: internal + make_target: rpm + builder_image: quay.io/cki/builder-rhel8 + architectures: 'x86_64 ppc64le aarch64 s390x' + build_kabi_whitelist: 'true' + tree_yaml_name: rhel + kpet_tree_family: rhel8 + package_name: kernel + publish_elsewhere: 'true' + native_tools: 'true' + # Skip testing for now to save resources. Uncomment ystream_distro and remove + # the skip_ beaker line when ready for production. + #skip_beaker: 'true' + disttag_override: '.el8_3' + trigger: + project: redhat/red-hat-ci-tools/kernel/cki-internal-pipelines/cki-internal-contributors + branch: rhel8 + strategy: depend + diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 4e9443d6764245..3722c5c0fd93b3 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1260,6 +1260,9 @@ field userspace_addr, which must point at user addressable memory for the entire memory slot size. Any object may back this memory, including anonymous memory, ordinary files, and hugetlbfs. +On architectures that support a form of address tagging, userspace_addr must +be an untagged address. + It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr be identical. This allows large pages in the guest to be backed by large pages in the host. diff --git a/Makefile.rhelver b/Makefile.rhelver index 894cfc81a882e8..af64165a5ccc29 100644 --- a/Makefile.rhelver +++ b/Makefile.rhelver @@ -12,7 +12,7 @@ RHEL_MINOR = 3 # # Use this spot to avoid future merge conflicts. # Do not trim this comment. 
-RHEL_RELEASE = 240.15.1 +RHEL_RELEASE = 240.22.1 # # Early y+1 numbering diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 7647c5debd810d..1cbeecd96565e4 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -498,7 +498,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), - MEMMAP_EARLY, NULL); + MEMINIT_EARLY, NULL); return 0; } @@ -507,8 +507,8 @@ memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { if (!vmem_map) { - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, - NULL); + memmap_init_zone(size, nid, zone, start_pfn, + MEMINIT_EARLY, NULL); } else { struct page *start; struct memmap_init_callback_data args; diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index dd95cdbd22ce8a..1ddc3a861df172 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -2,7 +2,7 @@ /* * s390 arch random implementation. * - * Copyright IBM Corp. 2017, 2018 + * Copyright IBM Corp. 2017, 2020 * Author(s): Harald Freudenberger * * The s390_arch_random_generate() function may be called from random.c @@ -33,9 +33,11 @@ #include #include #include +#include #include DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); +EXPORT_SYMBOL(s390_arch_random_available); atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0); EXPORT_SYMBOL(s390_arch_random_counter); @@ -99,6 +101,113 @@ static void arch_rng_refill_buffer(struct work_struct *unused) queue_delayed_work(system_long_wq, &arch_rng_work, delay); } +/* + * Here follows the implementation of s390_arch_get_random_long(). + * + * The random longs to be pulled by arch_get_random_long() are + * prepared in an 4K buffer which is filled from the NIST 800-90 + * compliant s390 drbg. By default the random long buffer is refilled + * 256 times before the drbg itself needs a reseed. The reseed of the + * drbg is done with 64 bytes fetched from the high quality (but slow) + * trng which is assumed to deliver 100% entropy. So the 64 * 8 = 512 + * bits of entropy are spread over 256 * 4KB = 1MB serving 131072 + * arch_get_random_long() invocations before reseeded. + * + * How often the 4K random long buffer is refilled with the drbg + * before the drbg is reseeded can be adjusted. There is a module + * parameter 's390_arch_rnd_long_drbg_reseed' accessible via + * /sys/module/arch_random/parameters/rndlong_drbg_reseed + * or as kernel command line parameter + * arch_random.rndlong_drbg_reseed= + * This parameter tells how often the drbg fills the 4K buffer before + * it is re-seeded by fresh entropy from the trng. + * A value of 16 results in reseeding the drbg at every 16 * 4 KB = 64 + * KB with 32 bytes of fresh entropy pulled from the trng. So a value + * of 16 would result in 512 bits entropy per 64 KB. + * A value of 256 results in 1MB of drbg output before a reseed of the + * drbg is done. So this would spread the 512 bits of entropy among 1MB. + * Setting this parameter to 0 forces the reseed to take place every + * time the 4K buffer is depleted, so the entropy rises to 512 bits + * entropy per 4K or 1 bit entropy per arch_get_random_long(). With + * setting this parameter to negative values all this effort is + * disabled, arch_get_random long() returns false and thus indicating + * that the arch_get_random_long() feature is disabled at all. 
+ */ + +static unsigned long rndlong_buf[512]; +static DEFINE_SPINLOCK(rndlong_lock); +static int rndlong_buf_index; + +static int rndlong_drbg_reseed = 256; +module_param_named(rndlong_drbg_reseed, rndlong_drbg_reseed, int, 0600); +MODULE_PARM_DESC(rndlong_drbg_reseed, "s390 arch_get_random_long() drbg reseed"); + +static inline void refill_rndlong_buf(void) +{ + static u8 prng_ws[240]; + static int drbg_counter; + + if (--drbg_counter < 0) { + /* need to re-seed the drbg */ + u8 seed[64]; + + /* fetch seed from trng */ + cpacf_trng(NULL, 0, seed, sizeof(seed)); + /* seed drbg */ + memset(prng_ws, 0, sizeof(prng_ws)); + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, + &prng_ws, NULL, 0, seed, sizeof(seed)); + /* re-init counter for drbg */ + drbg_counter = rndlong_drbg_reseed; + } + + /* fill the arch_get_random_long buffer from drbg */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, &prng_ws, + (u8 *) rndlong_buf, sizeof(rndlong_buf), + NULL, 0); +} + +bool s390_arch_get_random_long(unsigned long *v) +{ + bool rc = false; + unsigned long flags; + + /* arch_get_random_long() disabled ? */ + if (rndlong_drbg_reseed < 0) + return false; + + /* try to lock the random long lock */ + if (!spin_trylock_irqsave(&rndlong_lock, flags)) + return false; + + if (--rndlong_buf_index >= 0) { + /* deliver next long value from the buffer */ + *v = rndlong_buf[rndlong_buf_index]; + rc = true; + goto out; + } + + /* buffer is depleted and needs refill */ + if (in_interrupt()) { + /* delay refill in interrupt context to next caller */ + rndlong_buf_index = 0; + goto out; + } + + /* refill random long buffer */ + refill_rndlong_buf(); + rndlong_buf_index = ARRAY_SIZE(rndlong_buf); + + /* and provide one random long */ + *v = rndlong_buf[--rndlong_buf_index]; + rc = true; + +out: + spin_unlock_irqrestore(&rndlong_lock, flags); + return rc; +} +EXPORT_SYMBOL(s390_arch_get_random_long); + static int __init s390_arch_random_init(void) { /* all the needed PRNO subfunctions available ? */ diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index c67b82dfa558ee..0b64e1dbcb640a 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -2,7 +2,7 @@ /* * Kernel interface for the s390 arch_random_* functions * - * Copyright IBM Corp. 2017 + * Copyright IBM Corp. 2017, 2020 * * Author: Harald Freudenberger * @@ -19,6 +19,7 @@ DECLARE_STATIC_KEY_FALSE(s390_arch_random_available); extern atomic64_t s390_arch_random_counter; +bool s390_arch_get_random_long(unsigned long *v); bool s390_arch_random_generate(u8 *buf, unsigned int nbytes); static inline bool arch_has_random(void) @@ -35,6 +36,8 @@ static inline bool arch_has_random_seed(void) static inline bool arch_get_random_long(unsigned long *v) { + if (static_branch_likely(&s390_arch_random_available)) + return s390_arch_get_random_long(v); return false; } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index cfe02135ba2291..0916e0e786779f 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -63,10 +63,10 @@ static noinline __init void detect_machine_type(void) if (stsi(vmms, 3, 2, 2) || !vmms->count) return; - /* Running under KVM? 
If not we assume z/VM */ + /* Detect known hypervisors */ if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; - else + else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) S390_lowcore.machine_flags |= MACHINE_FLAG_VM; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 2e3b2741210efa..b226e2d2675231 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1027,6 +1027,8 @@ void __init setup_arch(char **cmdline_p) pr_info("Linux is running under KVM in 64-bit mode\n"); else if (MACHINE_IS_LPAR) pr_info("Linux is running natively in 64-bit mode\n"); + else + pr_info("Linux is running as a guest in 64-bit mode\n"); log_component_list(); init_lockdown(); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8368a4b6f627b7..d964b0077ce771 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -218,7 +218,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bdcf958b4de8c1..2e9a291feafe9b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -889,7 +889,8 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; - case 0x17: init_amd_zn(c); break; + case 0x17: fallthrough; + case 0x19: init_amd_zn(c); break; } /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d2ce3812633ba7..0f9036a8ff4523 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -244,7 +244,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) goto out; return 0; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index df211f5de2add1..789bc591160a92 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -255,6 +255,20 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)); +} + +static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)); +} + static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index d057376bd3d33c..698969e18fe352 100644 --- a/arch/x86/kvm/ioapic.c +++ 
b/arch/x86/kvm/ioapic.c @@ -197,12 +197,9 @@ static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq) /* * If no longer has pending EOI in LAPICs, update - * EOI for this vetor. + * EOI for this vector. */ rtc_irq_eoi(ioapic, vcpu, entry->fields.vector); - kvm_ioapic_update_eoi_one(vcpu, ioapic, - entry->fields.trig_mode, - irq); break; } } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 28f015b419fb1a..975e12e42f9fe4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -43,12 +43,19 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 -static inline u64 rsvd_bits(int s, int e) +static __always_inline u64 rsvd_bits(int s, int e) { + BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s); + + if (__builtin_constant_p(e)) + BUILD_BUG_ON(e > 63); + else + e &= 63; + if (e < s) return 0; - return ((1ULL << (e - s + 1)) - 1) << s; + return ((2ULL << (e - s)) - 1) << s; } void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a41617fb27509f..e77c5da4dc5b03 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -203,6 +203,10 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + + if (WARN_ON(!is_guest_mode(vcpu))) + return true; + if (!nested_svm_vmrun_msrpm(svm)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -574,6 +578,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->nested.vmcb = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -693,6 +699,7 @@ void svm_leave_nested(struct vcpu_svm *svm) leave_guest_mode(&svm->vcpu); copy_vmcb_control_area(&vmcb->control, &hsave->control); nested_svm_uninit_mmu_context(&svm->vcpu); + vmcb_mark_all_dirty(svm->vmcb); } kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); @@ -1135,6 +1142,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * in the registers, the save area of the nested state instead * contains saved L1 state. */ + + svm->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); hsave->save = *save; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 77b0ef428dcc92..fb83e118c8ad34 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -638,8 +638,8 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, * Its safe to read more than we are asked, caller should ensure that * destination has enough space. 
*/ - src_paddr = round_down(src_paddr, 16); offset = src_paddr & 15; + src_paddr = round_down(src_paddr, 16); sz = round_up(sz + offset, 16); return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5ab0ad9bf18cfb..9e5b4dd70f998a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2431,10 +2431,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = svm->spec_ctrl; @@ -2518,10 +2515,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; if (kvm_spec_ctrl_test_value(data)) @@ -2546,12 +2540,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; - if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) + if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index fa81aefb906e42..2bc4e8a74bb8c4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3105,13 +3105,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_host_map *map; - struct page *page; - u64 hpa; /* * hv_evmcs may end up being not mapped after migration (when @@ -3134,6 +3130,17 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } } + return true; +} + +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_host_map *map; + struct page *page; + u64 hpa; + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * Translate L1 physical address to host physical @@ -3202,6 +3209,18 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + + return true; +} + +static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + if (!nested_get_evmcs_page(vcpu)) + return false; + + if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) + return false; + return true; } @@ -4352,6 +4371,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + /* Service the TLB flush request for L2 before switching to L1. 
*/ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); @@ -5992,11 +6013,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - } else if (!vmx->nested.need_vmcs12_to_shadow_sync) { - if (vmx->nested.hv_evmcs) - copy_enlightened_to_vmcs12(vmx); - else if (enable_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); + } else { + copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); + if (!vmx->nested.need_vmcs12_to_shadow_sync) { + if (vmx->nested.hv_evmcs) + copy_enlightened_to_vmcs12(vmx); + else if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + } } BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); @@ -6079,6 +6103,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; + if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) + return -EINVAL; + /* * SMM temporarily disables VMX, so we cannot be in guest mode, * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags @@ -6108,9 +6135,16 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (ret) return ret; - /* Empty 'VMXON' state is permitted */ - if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) - return 0; + /* Empty 'VMXON' state is permitted if no VMCS loaded */ + if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { + /* See vmx_has_valid_vmcs12. */ + if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || + (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || + (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) + return -EINVAL; + else + return 0; + } if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || @@ -6515,7 +6549,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .hv_timer_pending = nested_vmx_preemption_timer_pending, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, - .get_nested_state_pages = nested_get_vmcs12_pages, + .get_nested_state_pages = vmx_get_nested_state_pages, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, }; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 758bccc26cf989..197148d76b8fdb 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -47,6 +47,11 @@ static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_shadow_vmcs12; } +/* + * Note: the same condition is checked against the state provided by userspace + * in vmx_set_nested_state; if it is satisfied, the nested state must include + * the VMCS12. 
+ */ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e1a303fefc1671..858efffebf04cd 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -27,7 +27,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, + [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; /* mapping between fixed pmc index and intel_arch_events array */ @@ -318,7 +318,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, x86_pmu.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -328,6 +330,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, x86_pmu.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, + edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 40d0f38d7e222d..f8148ff62cfd23 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1856,7 +1856,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = to_vmx(vcpu)->spec_ctrl; @@ -2058,7 +2058,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; if (kvm_spec_ctrl_test_value(data)) @@ -2093,12 +2093,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) goto find_shared_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; @@ -6873,11 +6873,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) switch (index) { case MSR_IA32_TSX_CTRL: /* - * No need to pass TSX_CTRL_CPUID_CLEAR through, so - * let's avoid changing CPUID bits under the host - * kernel's feet. + * TSX_CTRL_CPUID_CLEAR is handled in the CPUID + * interception. Keep the host value unchanged to avoid + * changing CPUID bits under the host kernel's feet. + * + * hle=0, rtm=0, tsx_ctrl=1 can be found with some + * combinations of new kernel and old userspace. If + * those guests run on a tsx=off host, do allow guests + * to use TSX_CTRL, but do not change the value on the + * host so that TSX remains always disabled. 
*/ - vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + if (boot_cpu_has(X86_FEATURE_RTM)) + vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + else + vmx->guest_msrs[j].mask = 0; break; default: vmx->guest_msrs[j].mask = -1ull; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc41a5f8b0bd85..fe96cefc6ddb98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -103,6 +103,7 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void process_smi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); @@ -1354,16 +1355,24 @@ static u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; - /* - * On TAA affected systems: - * - nothing to do if TSX is disabled on the host. - * - we emulate TSX_CTRL if present on the host. - * This lets the guest use VERW to clear CPU buffers. - */ - if (!boot_cpu_has(X86_FEATURE_RTM)) - data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); - else if (!boot_cpu_has_bug(X86_BUG_TAA)) + if (!boot_cpu_has(X86_FEATURE_RTM)) { + /* + * If RTM=0 because the kernel has disabled TSX, the host might + * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 + * and therefore knows that there cannot be TAA) but keep + * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, + * and we want to allow migrating those guests to tsx=off hosts. + */ + data &= ~ARCH_CAP_TAA_NO; + } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { data |= ARCH_CAP_TAA_NO; + } else { + /* + * Nothing to do here; we emulate TSX_CTRL if present on the + * host so the guest can choose between disabling TSX or + * using VERW to clear CPU buffers. + */ + } return data; } @@ -3875,6 +3884,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, { process_nmi(vcpu); + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); + /* * In guest mode, payload delivery should be deferred, * so that the L1 hypervisor can intercept #PF before diff --git a/drivers/base/node.c b/drivers/base/node.c index d75862546c4e6b..140f8720a4e60e 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -748,11 +748,40 @@ static int __ref get_nid_for_pfn(unsigned long pfn) return pfn_to_nid(pfn); } +static void do_register_memory_block_under_node(int nid, + struct memory_block *mem_blk) +{ + int ret; + + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. 
+ */ + mem_blk->nid = nid; + + ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, + &mem_blk->dev.kobj, + kobject_name(&mem_blk->dev.kobj)); + if (ret && ret != -EEXIST) + dev_err_ratelimited(&node_devices[nid]->dev, + "can't create link to %s in sysfs (%d)\n", + kobject_name(&mem_blk->dev.kobj), ret); + + ret = sysfs_create_link_nowarn(&mem_blk->dev.kobj, + &node_devices[nid]->dev.kobj, + kobject_name(&node_devices[nid]->dev.kobj)); + if (ret && ret != -EEXIST) + dev_err_ratelimited(&mem_blk->dev, + "can't create link to %s in sysfs (%d)\n", + kobject_name(&node_devices[nid]->dev.kobj), + ret); +} + /* register memory section under specified node if it spans that node */ -static int register_mem_sect_under_node(struct memory_block *mem_blk, - void *arg) +static int register_mem_block_under_node_early(struct memory_block *mem_blk, + void *arg) { - int ret, nid = *(int *)arg; + int nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn; sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); @@ -772,38 +801,35 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, } /* - * We need to check if page belongs to nid only for the boot - * case, during hotplug we know that all pages in the memory - * block belong to the same node. - */ - if (system_state == SYSTEM_BOOTING) { - page_nid = get_nid_for_pfn(pfn); - if (page_nid < 0) - continue; - if (page_nid != nid) - continue; - } - - /* - * If this memory block spans multiple nodes, we only indicate - * the last processed node. + * We need to check if page belongs to nid only at the boot + * case because node's ranges can be interleaved. */ - mem_blk->nid = nid; - - ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, - &mem_blk->dev.kobj, - kobject_name(&mem_blk->dev.kobj)); - if (ret) - return ret; + page_nid = get_nid_for_pfn(pfn); + if (page_nid < 0) + continue; + if (page_nid != nid) + continue; - return sysfs_create_link_nowarn(&mem_blk->dev.kobj, - &node_devices[nid]->dev.kobj, - kobject_name(&node_devices[nid]->dev.kobj)); + do_register_memory_block_under_node(nid, mem_blk); + return 0; } /* mem section does not span the specified node */ return 0; } +/* + * During hotplug we know that all pages in the memory block belong to the same + * node. + */ +static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, + void *arg) +{ + int nid = *(int *)arg; + + do_register_memory_block_under_node(nid, mem_blk); + return 0; +} + /* * Unregister a memory block device under the node it spans. Memory blocks * with multiple nodes cannot be offlined and therefore also never be removed. 
@@ -819,11 +845,19 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); } -int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) +void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, + enum meminit_context context) { - return walk_memory_blocks(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), (void *)&nid, - register_mem_sect_under_node); + walk_memory_blocks_func_t func; + + if (context == MEMINIT_HOTPLUG) + func = register_mem_block_under_node_hotplug; + else + func = register_mem_block_under_node_early; + + walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), + (void *)&nid, func); + return; } #ifdef CONFIG_HUGETLBFS diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 378698f5b8cee2..cd308315755453 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -151,6 +151,12 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, dma_addr_t mask; int i; + /* rdma_for_each_block() has a bug if the page size is smaller than the + * page size used to build the umem. For now prevent smaller page sizes + * from being returned. + */ + pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT); + /* At minimum, drivers must support PAGE_SIZE or smaller */ if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0)))) return 0; diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 0c5254cf581aa5..12149e60bc22b4 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1033,8 +1033,8 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) { struct intel_iommu *iommu; u32 ver, sts; - int agaw = 0; - int msagaw = 0; + int agaw = -1; + int msagaw = -1; int err; if (!drhd->reg_base_addr) { @@ -1059,17 +1059,28 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) } err = -EINVAL; - agaw = iommu_calculate_agaw(iommu); - if (agaw < 0) { - pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n", - iommu->seq_id); - goto err_unmap; + if (cap_sagaw(iommu->cap) == 0) { + pr_info("%s: No supported address widths. Not attempting DMA translation.\n", + iommu->name); + drhd->ignored = 1; } - msagaw = iommu_calculate_max_sagaw(iommu); - if (msagaw < 0) { - pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n", - iommu->seq_id); - goto err_unmap; + + if (!drhd->ignored) { + agaw = iommu_calculate_agaw(iommu); + if (agaw < 0) { + pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n", + iommu->seq_id); + drhd->ignored = 1; + } + } + if (!drhd->ignored) { + msagaw = iommu_calculate_max_sagaw(iommu); + if (msagaw < 0) { + pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n", + iommu->seq_id); + drhd->ignored = 1; + agaw = -1; + } } iommu->agaw = agaw; iommu->msagaw = msagaw; @@ -1096,7 +1107,12 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) raw_spin_lock_init(&iommu->register_lock); - if (intel_iommu_enabled) { + /* + * This is only for hotplug; at boot time intel_iommu_enabled won't + * be set yet. When intel_iommu_init() runs, it registers the units + * present at boot time, then sets intel_iommu_enabled. 
+ */ + if (intel_iommu_enabled && !drhd->ignored) { err = iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); @@ -1111,6 +1127,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) } drhd->iommu = iommu; + iommu->drhd = drhd; return 0; @@ -1125,7 +1142,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) static void free_iommu(struct intel_iommu *iommu) { - if (intel_iommu_enabled) { + if (intel_iommu_enabled && !iommu->drhd->ignored) { iommu_device_unregister(&iommu->iommu); iommu_device_sysfs_remove(&iommu->iommu); } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f9d2e91ce7c42a..918b6f44f85b1e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -369,6 +369,7 @@ static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int intel_no_bounce; +static int iommu_skip_te_disable; #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA 4 @@ -1604,6 +1605,10 @@ static void iommu_disable_translation(struct intel_iommu *iommu) u32 sts; unsigned long flag; + if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && + (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) + return; + raw_spin_lock_irqsave(&iommu->register_lock, flag); iommu->gcmd &= ~DMA_GCMD_TE; writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); @@ -3865,6 +3870,7 @@ static void __init init_no_remapping_devices(void) /* This IOMMU has *only* gfx devices. Either bypass it or set the gfx_mapped flag, as appropriate */ + drhd->gfx_dedicated = 1; if (!dmar_map_gfx) { drhd->ignored = 1; for_each_active_dev_scope(drhd->devices, @@ -5777,6 +5783,27 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_g DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); +static void quirk_igfx_skip_te_disable(struct pci_dev *dev) +{ + unsigned short ver; + + if (!IS_GFX_DEVICE(dev)) + return; + + ver = (dev->device >> 8) & 0xff; + if (ver != 0x45 && ver != 0x46 && ver != 0x4c && + ver != 0x4e && ver != 0x8a && ver != 0x98 && + ver != 0x9a) + return; + + if (risky_device(dev)) + return; + + pci_info(dev, "Skip IOMMU disabling for graphics\n"); + iommu_skip_te_disable = 1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); + /* On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH DMAR unit for the Azalia sound device, but not give it any TLB entries, which causes it to deadlock. Check for that. 
We do diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index bb930f70d5f98c..3fdc793e8a99e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1094,7 +1094,6 @@ int mlx5e_update_nic_rx(struct mlx5e_priv *priv); void mlx5e_update_carrier(struct mlx5e_priv *priv); int mlx5e_close(struct net_device *netdev); int mlx5e_open(struct net_device *netdev); -void mlx5e_update_ndo_stats(struct mlx5e_priv *priv); void mlx5e_queue_update_stats(struct mlx5e_priv *priv); int mlx5e_bits_invert(unsigned long a, int size); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c index 7cd5b02e0f109a..ab92e8aab95d28 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c @@ -52,7 +52,7 @@ static void mlx5e_monitor_counters_work(struct work_struct *work) monitor_counters_work); mutex_lock(&priv->state_lock); - mlx5e_update_ndo_stats(priv); + mlx5e_stats_update_ndo_stats(priv); mutex_unlock(&priv->state_lock); mlx5e_monitor_counter_arm(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1f87ed48a4a7af..932b80239dcdc4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -159,16 +159,6 @@ static void mlx5e_update_carrier_work(struct work_struct *work) mutex_unlock(&priv->state_lock); } -void mlx5e_update_ndo_stats(struct mlx5e_priv *priv) -{ - int i; - - for (i = mlx5e_nic_stats_grps_num(priv) - 1; i >= 0; i--) - if (mlx5e_nic_stats_grps[i]->update_stats_mask & - MLX5E_NDO_UPDATE_STATS) - mlx5e_nic_stats_grps[i]->update_stats(priv); -} - static void mlx5e_update_stats_work(struct work_struct *work) { struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, @@ -5229,7 +5219,7 @@ static const struct mlx5e_profile mlx5e_nic_profile = { .enable = mlx5e_nic_enable, .disable = mlx5e_nic_disable, .update_rx = mlx5e_update_nic_rx, - .update_stats = mlx5e_update_ndo_stats, + .update_stats = mlx5e_stats_update_ndo_stats, .update_carrier = mlx5e_update_carrier, .rx_handlers.handle_rx_cqe = mlx5e_handle_rx_cqe, .rx_handlers.handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 23b80085a30415..efc7b94a061146 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1127,7 +1127,7 @@ static const struct mlx5e_profile mlx5e_rep_profile = { .cleanup_tx = mlx5e_cleanup_rep_tx, .enable = mlx5e_rep_enable, .update_rx = mlx5e_update_rep_rx, - .update_stats = mlx5e_update_ndo_stats, + .update_stats = mlx5e_stats_update_ndo_stats, .rx_handlers.handle_rx_cqe = mlx5e_handle_rx_cqe_rep, .rx_handlers.handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq_rep, .max_tc = 1, @@ -1146,7 +1146,7 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { .enable = mlx5e_uplink_rep_enable, .disable = mlx5e_uplink_rep_disable, .update_rx = mlx5e_update_rep_rx, - .update_stats = mlx5e_update_ndo_stats, + .update_stats = mlx5e_stats_update_ndo_stats, .update_carrier = mlx5e_update_carrier, .rx_handlers.handle_rx_cqe = mlx5e_handle_rx_cqe_rep, .rx_handlers.handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq_rep, diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 30b216d9284c34..192118a6f64dc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -54,6 +54,18 @@ unsigned int mlx5e_stats_total_num(struct mlx5e_priv *priv) return total; } +void mlx5e_stats_update_ndo_stats(struct mlx5e_priv *priv) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + int i; + + for (i = num_stats_grps - 1; i >= 0; i--) + if (stats_grps[i]->update_stats && + stats_grps[i]->update_stats_mask & MLX5E_NDO_UPDATE_STATS) + stats_grps[i]->update_stats(priv); +} + void mlx5e_stats_update(struct mlx5e_priv *priv) { mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 092b39ffa32a6d..dc2c350845f602 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -103,6 +103,7 @@ unsigned int mlx5e_stats_total_num(struct mlx5e_priv *priv); void mlx5e_stats_update(struct mlx5e_priv *priv); void mlx5e_stats_fill(struct mlx5e_priv *priv, u64 *data, int idx); void mlx5e_stats_fill_strings(struct mlx5e_priv *priv, u8 *data); +void mlx5e_stats_update_ndo_stats(struct mlx5e_priv *priv); /* Concrete NIC Stats */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c index 269eddc3d38bba..ba44bf08c106ea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c @@ -122,6 +122,7 @@ mlx5_eswitch_termtbl_get_create(struct mlx5_eswitch *esw, tt->dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; tt->dest.vport.num = dest->vport.num; tt->dest.vport.vhca_id = dest->vport.vhca_id; + tt->dest.vport.flags = dest->vport.flags; memcpy(&tt->flow_act, flow_act, sizeof(*flow_act)); err = mlx5_eswitch_termtbl_create(esw->dev, tt, flow_act); diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index b60795893994ca..1630fae6af1c1d 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -1401,7 +1401,7 @@ static void fnic_cleanup_io(struct fnic *fnic, int exclude_id) } if (!io_req) { spin_unlock_irqrestore(io_lock, flags); - goto cleanup_scsi_cmd; + continue; } CMD_SP(sc) = NULL; @@ -1416,7 +1416,6 @@ static void fnic_cleanup_io(struct fnic *fnic, int exclude_id) fnic_release_ioreq_buf(fnic, io_req, sc); mempool_free(io_req, fnic->io_req_pool); -cleanup_scsi_cmd: sc->result = DID_TRANSPORT_DISRUPTED << 16; FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, "%s: tag:0x%x : sc:0x%p duration = %lu DID_TRANSPORT_DISRUPTED\n", diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 1e7127ca6aec49..75ed1a74ac7563 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -3331,125 +3331,125 @@ int iscsi_session_get_param(struct iscsi_cls_session *cls_session, switch(param) { case ISCSI_PARAM_FAST_ABORT: - len = sprintf(buf, "%d\n", session->fast_abort); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->fast_abort); break; case ISCSI_PARAM_ABORT_TMO: - len = sprintf(buf, "%d\n", session->abort_timeout); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->abort_timeout); break; case ISCSI_PARAM_LU_RESET_TMO: - len = sprintf(buf, 
"%d\n", session->lu_reset_timeout); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->lu_reset_timeout); break; case ISCSI_PARAM_TGT_RESET_TMO: - len = sprintf(buf, "%d\n", session->tgt_reset_timeout); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->tgt_reset_timeout); break; case ISCSI_PARAM_INITIAL_R2T_EN: - len = sprintf(buf, "%d\n", session->initial_r2t_en); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->initial_r2t_en); break; case ISCSI_PARAM_MAX_R2T: - len = sprintf(buf, "%hu\n", session->max_r2t); + len = scnprintf(buf, PAGE_SIZE, "%hu\n", session->max_r2t); break; case ISCSI_PARAM_IMM_DATA_EN: - len = sprintf(buf, "%d\n", session->imm_data_en); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->imm_data_en); break; case ISCSI_PARAM_FIRST_BURST: - len = sprintf(buf, "%u\n", session->first_burst); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->first_burst); break; case ISCSI_PARAM_MAX_BURST: - len = sprintf(buf, "%u\n", session->max_burst); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->max_burst); break; case ISCSI_PARAM_PDU_INORDER_EN: - len = sprintf(buf, "%d\n", session->pdu_inorder_en); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->pdu_inorder_en); break; case ISCSI_PARAM_DATASEQ_INORDER_EN: - len = sprintf(buf, "%d\n", session->dataseq_inorder_en); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->dataseq_inorder_en); break; case ISCSI_PARAM_DEF_TASKMGMT_TMO: - len = sprintf(buf, "%d\n", session->def_taskmgmt_tmo); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->def_taskmgmt_tmo); break; case ISCSI_PARAM_ERL: - len = sprintf(buf, "%d\n", session->erl); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->erl); break; case ISCSI_PARAM_TARGET_NAME: - len = sprintf(buf, "%s\n", session->targetname); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->targetname); break; case ISCSI_PARAM_TARGET_ALIAS: - len = sprintf(buf, "%s\n", session->targetalias); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->targetalias); break; case ISCSI_PARAM_TPGT: - len = sprintf(buf, "%d\n", session->tpgt); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->tpgt); break; case ISCSI_PARAM_USERNAME: - len = sprintf(buf, "%s\n", session->username); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->username); break; case ISCSI_PARAM_USERNAME_IN: - len = sprintf(buf, "%s\n", session->username_in); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->username_in); break; case ISCSI_PARAM_PASSWORD: - len = sprintf(buf, "%s\n", session->password); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->password); break; case ISCSI_PARAM_PASSWORD_IN: - len = sprintf(buf, "%s\n", session->password_in); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->password_in); break; case ISCSI_PARAM_IFACE_NAME: - len = sprintf(buf, "%s\n", session->ifacename); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->ifacename); break; case ISCSI_PARAM_INITIATOR_NAME: - len = sprintf(buf, "%s\n", session->initiatorname); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->initiatorname); break; case ISCSI_PARAM_BOOT_ROOT: - len = sprintf(buf, "%s\n", session->boot_root); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->boot_root); break; case ISCSI_PARAM_BOOT_NIC: - len = sprintf(buf, "%s\n", session->boot_nic); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->boot_nic); break; case ISCSI_PARAM_BOOT_TARGET: - len = sprintf(buf, "%s\n", session->boot_target); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->boot_target); break; case ISCSI_PARAM_AUTO_SND_TGT_DISABLE: - len = sprintf(buf, "%u\n", 
session->auto_snd_tgt_disable); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->auto_snd_tgt_disable); break; case ISCSI_PARAM_DISCOVERY_SESS: - len = sprintf(buf, "%u\n", session->discovery_sess); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->discovery_sess); break; case ISCSI_PARAM_PORTAL_TYPE: - len = sprintf(buf, "%s\n", session->portal_type); + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->portal_type); break; case ISCSI_PARAM_CHAP_AUTH_EN: - len = sprintf(buf, "%u\n", session->chap_auth_en); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->chap_auth_en); break; case ISCSI_PARAM_DISCOVERY_LOGOUT_EN: - len = sprintf(buf, "%u\n", session->discovery_logout_en); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->discovery_logout_en); break; case ISCSI_PARAM_BIDI_CHAP_EN: - len = sprintf(buf, "%u\n", session->bidi_chap_en); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->bidi_chap_en); break; case ISCSI_PARAM_DISCOVERY_AUTH_OPTIONAL: - len = sprintf(buf, "%u\n", session->discovery_auth_optional); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->discovery_auth_optional); break; case ISCSI_PARAM_DEF_TIME2WAIT: - len = sprintf(buf, "%d\n", session->time2wait); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->time2wait); break; case ISCSI_PARAM_DEF_TIME2RETAIN: - len = sprintf(buf, "%d\n", session->time2retain); + len = scnprintf(buf, PAGE_SIZE, "%d\n", session->time2retain); break; case ISCSI_PARAM_TSID: - len = sprintf(buf, "%u\n", session->tsid); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->tsid); break; case ISCSI_PARAM_ISID: - len = sprintf(buf, "%02x%02x%02x%02x%02x%02x\n", + len = scnprintf(buf, PAGE_SIZE, "%02x%02x%02x%02x%02x%02x\n", session->isid[0], session->isid[1], session->isid[2], session->isid[3], session->isid[4], session->isid[5]); break; case ISCSI_PARAM_DISCOVERY_PARENT_IDX: - len = sprintf(buf, "%u\n", session->discovery_parent_idx); + len = scnprintf(buf, PAGE_SIZE, "%u\n", session->discovery_parent_idx); break; case ISCSI_PARAM_DISCOVERY_PARENT_TYPE: if (session->discovery_parent_type) - len = sprintf(buf, "%s\n", + len = scnprintf(buf, PAGE_SIZE, "%s\n", session->discovery_parent_type); else - len = sprintf(buf, "\n"); + len = scnprintf(buf, PAGE_SIZE, "\n"); break; default: return -ENOSYS; @@ -3481,16 +3481,16 @@ int iscsi_conn_get_addr_param(struct sockaddr_storage *addr, case ISCSI_PARAM_CONN_ADDRESS: case ISCSI_HOST_PARAM_IPADDRESS: if (sin) - len = sprintf(buf, "%pI4\n", &sin->sin_addr.s_addr); + len = scnprintf(buf, PAGE_SIZE, "%pI4\n", &sin->sin_addr.s_addr); else - len = sprintf(buf, "%pI6\n", &sin6->sin6_addr); + len = scnprintf(buf, PAGE_SIZE, "%pI6\n", &sin6->sin6_addr); break; case ISCSI_PARAM_CONN_PORT: case ISCSI_PARAM_LOCAL_PORT: if (sin) - len = sprintf(buf, "%hu\n", be16_to_cpu(sin->sin_port)); + len = scnprintf(buf, PAGE_SIZE, "%hu\n", be16_to_cpu(sin->sin_port)); else - len = sprintf(buf, "%hu\n", + len = scnprintf(buf, PAGE_SIZE, "%hu\n", be16_to_cpu(sin6->sin6_port)); break; default: @@ -3509,88 +3509,88 @@ int iscsi_conn_get_param(struct iscsi_cls_conn *cls_conn, switch(param) { case ISCSI_PARAM_PING_TMO: - len = sprintf(buf, "%u\n", conn->ping_timeout); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->ping_timeout); break; case ISCSI_PARAM_RECV_TMO: - len = sprintf(buf, "%u\n", conn->recv_timeout); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->recv_timeout); break; case ISCSI_PARAM_MAX_RECV_DLENGTH: - len = sprintf(buf, "%u\n", conn->max_recv_dlength); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->max_recv_dlength); 
break; case ISCSI_PARAM_MAX_XMIT_DLENGTH: - len = sprintf(buf, "%u\n", conn->max_xmit_dlength); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->max_xmit_dlength); break; case ISCSI_PARAM_HDRDGST_EN: - len = sprintf(buf, "%d\n", conn->hdrdgst_en); + len = scnprintf(buf, PAGE_SIZE, "%d\n", conn->hdrdgst_en); break; case ISCSI_PARAM_DATADGST_EN: - len = sprintf(buf, "%d\n", conn->datadgst_en); + len = scnprintf(buf, PAGE_SIZE, "%d\n", conn->datadgst_en); break; case ISCSI_PARAM_IFMARKER_EN: - len = sprintf(buf, "%d\n", conn->ifmarker_en); + len = scnprintf(buf, PAGE_SIZE, "%d\n", conn->ifmarker_en); break; case ISCSI_PARAM_OFMARKER_EN: - len = sprintf(buf, "%d\n", conn->ofmarker_en); + len = scnprintf(buf, PAGE_SIZE, "%d\n", conn->ofmarker_en); break; case ISCSI_PARAM_EXP_STATSN: - len = sprintf(buf, "%u\n", conn->exp_statsn); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->exp_statsn); break; case ISCSI_PARAM_PERSISTENT_PORT: - len = sprintf(buf, "%d\n", conn->persistent_port); + len = scnprintf(buf, PAGE_SIZE, "%d\n", conn->persistent_port); break; case ISCSI_PARAM_PERSISTENT_ADDRESS: - len = sprintf(buf, "%s\n", conn->persistent_address); + len = scnprintf(buf, PAGE_SIZE, "%s\n", conn->persistent_address); break; case ISCSI_PARAM_STATSN: - len = sprintf(buf, "%u\n", conn->statsn); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->statsn); break; case ISCSI_PARAM_MAX_SEGMENT_SIZE: - len = sprintf(buf, "%u\n", conn->max_segment_size); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->max_segment_size); break; case ISCSI_PARAM_KEEPALIVE_TMO: - len = sprintf(buf, "%u\n", conn->keepalive_tmo); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->keepalive_tmo); break; case ISCSI_PARAM_LOCAL_PORT: - len = sprintf(buf, "%u\n", conn->local_port); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->local_port); break; case ISCSI_PARAM_TCP_TIMESTAMP_STAT: - len = sprintf(buf, "%u\n", conn->tcp_timestamp_stat); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->tcp_timestamp_stat); break; case ISCSI_PARAM_TCP_NAGLE_DISABLE: - len = sprintf(buf, "%u\n", conn->tcp_nagle_disable); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->tcp_nagle_disable); break; case ISCSI_PARAM_TCP_WSF_DISABLE: - len = sprintf(buf, "%u\n", conn->tcp_wsf_disable); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->tcp_wsf_disable); break; case ISCSI_PARAM_TCP_TIMER_SCALE: - len = sprintf(buf, "%u\n", conn->tcp_timer_scale); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->tcp_timer_scale); break; case ISCSI_PARAM_TCP_TIMESTAMP_EN: - len = sprintf(buf, "%u\n", conn->tcp_timestamp_en); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->tcp_timestamp_en); break; case ISCSI_PARAM_IP_FRAGMENT_DISABLE: - len = sprintf(buf, "%u\n", conn->fragment_disable); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->fragment_disable); break; case ISCSI_PARAM_IPV4_TOS: - len = sprintf(buf, "%u\n", conn->ipv4_tos); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->ipv4_tos); break; case ISCSI_PARAM_IPV6_TC: - len = sprintf(buf, "%u\n", conn->ipv6_traffic_class); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->ipv6_traffic_class); break; case ISCSI_PARAM_IPV6_FLOW_LABEL: - len = sprintf(buf, "%u\n", conn->ipv6_flow_label); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->ipv6_flow_label); break; case ISCSI_PARAM_IS_FW_ASSIGNED_IPV6: - len = sprintf(buf, "%u\n", conn->is_fw_assigned_ipv6); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->is_fw_assigned_ipv6); break; case ISCSI_PARAM_TCP_XMIT_WSF: - len = sprintf(buf, "%u\n", conn->tcp_xmit_wsf); + len = scnprintf(buf, 
PAGE_SIZE, "%u\n", conn->tcp_xmit_wsf); break; case ISCSI_PARAM_TCP_RECV_WSF: - len = sprintf(buf, "%u\n", conn->tcp_recv_wsf); + len = scnprintf(buf, PAGE_SIZE, "%u\n", conn->tcp_recv_wsf); break; case ISCSI_PARAM_LOCAL_IPADDR: - len = sprintf(buf, "%s\n", conn->local_ipaddr); + len = scnprintf(buf, PAGE_SIZE, "%s\n", conn->local_ipaddr); break; default: return -ENOSYS; @@ -3608,13 +3608,13 @@ int iscsi_host_get_param(struct Scsi_Host *shost, enum iscsi_host_param param, switch (param) { case ISCSI_HOST_PARAM_NETDEV_NAME: - len = sprintf(buf, "%s\n", ihost->netdev); + len = scnprintf(buf, PAGE_SIZE, "%s\n", ihost->netdev); break; case ISCSI_HOST_PARAM_HWADDRESS: - len = sprintf(buf, "%s\n", ihost->hwaddress); + len = scnprintf(buf, PAGE_SIZE, "%s\n", ihost->hwaddress); break; case ISCSI_HOST_PARAM_INITIATOR_NAME: - len = sprintf(buf, "%s\n", ihost->initiatorname); + len = scnprintf(buf, PAGE_SIZE, "%s\n", ihost->initiatorname); break; default: return -ENOSYS; diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index a8f4ced0fc4172..36ef13ada073a7 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -73,6 +73,7 @@ static void lpfc_disc_timeout_handler(struct lpfc_vport *); static void lpfc_disc_flush_list(struct lpfc_vport *vport); static void lpfc_unregister_fcfi_cmpl(struct lpfc_hba *, LPFC_MBOXQ_t *); static int lpfc_fcf_inuse(struct lpfc_hba *); +static void lpfc_mbx_cmpl_read_sparam(struct lpfc_hba *, LPFC_MBOXQ_t *); void lpfc_terminate_rport_io(struct fc_rport *rport) @@ -1134,11 +1135,13 @@ lpfc_mbx_cmpl_clear_la(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) return; } - void lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) { struct lpfc_vport *vport = pmb->vport; + LPFC_MBOXQ_t *sparam_mb; + struct lpfc_dmabuf *sparam_mp; + int rc; if (pmb->u.mb.mbxStatus) goto out; @@ -1163,12 +1166,42 @@ lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) } /* Start discovery by sending a FLOGI. port_state is identically - * LPFC_FLOGI while waiting for FLOGI cmpl. Check if sending - * the FLOGI is being deferred till after MBX_READ_SPARAM completes. + * LPFC_FLOGI while waiting for FLOGI cmpl. */ if (vport->port_state != LPFC_FLOGI) { - if (!(phba->hba_flag & HBA_DEFER_FLOGI)) + /* Issue MBX_READ_SPARAM to update CSPs before FLOGI if + * bb-credit recovery is in place. 
+ */
+ if (phba->bbcredit_support && phba->cfg_enable_bbcr &&
+ !(phba->link_flag & LS_LOOPBACK_MODE)) {
+ sparam_mb = mempool_alloc(phba->mbox_mem_pool,
+ GFP_KERNEL);
+ if (!sparam_mb)
+ goto sparam_out;
+
+ rc = lpfc_read_sparam(phba, sparam_mb, 0);
+ if (rc) {
+ mempool_free(sparam_mb, phba->mbox_mem_pool);
+ goto sparam_out;
+ }
+ sparam_mb->vport = vport;
+ sparam_mb->mbox_cmpl = lpfc_mbx_cmpl_read_sparam;
+ rc = lpfc_sli_issue_mbox(phba, sparam_mb, MBX_NOWAIT);
+ if (rc == MBX_NOT_FINISHED) {
+ sparam_mp = (struct lpfc_dmabuf *)
+ sparam_mb->ctx_buf;
+ lpfc_mbuf_free(phba, sparam_mp->virt,
+ sparam_mp->phys);
+ kfree(sparam_mp);
+ sparam_mb->ctx_buf = NULL;
+ mempool_free(sparam_mb, phba->mbox_mem_pool);
+ goto sparam_out;
+ }
+
+ phba->hba_flag |= HBA_DEFER_FLOGI;
+ } else {
lpfc_initial_flogi(vport);
+ }
} else {
if (vport->fc_flag & FC_PT2PT)
lpfc_disc_start(vport);
@@ -1180,6 +1213,7 @@ lpfc_mbx_cmpl_local_config_link(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
"0306 CONFIG_LINK mbxStatus error x%x "
"HBA state x%x\n",
pmb->u.mb.mbxStatus, vport->port_state);
+sparam_out:
mempool_free(pmb, phba->mbox_mem_pool);
lpfc_linkdown(phba);
@@ -3235,21 +3269,6 @@ lpfc_mbx_process_link_up(struct lpfc_hba *phba, struct lpfc_mbx_read_top *la)
lpfc_linkup(phba);
sparam_mbox = NULL;
- if (!(phba->hba_flag & HBA_FCOE_MODE)) {
- cfglink_mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
- if (!cfglink_mbox)
- goto out;
- vport->port_state = LPFC_LOCAL_CFG_LINK;
- lpfc_config_link(phba, cfglink_mbox);
- cfglink_mbox->vport = vport;
- cfglink_mbox->mbox_cmpl = lpfc_mbx_cmpl_local_config_link;
- rc = lpfc_sli_issue_mbox(phba, cfglink_mbox, MBX_NOWAIT);
- if (rc == MBX_NOT_FINISHED) {
- mempool_free(cfglink_mbox, phba->mbox_mem_pool);
- goto out;
- }
- }
-
sparam_mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
if (!sparam_mbox)
goto out;
@@ -3270,7 +3289,20 @@ lpfc_mbx_process_link_up(struct lpfc_hba *phba, struct lpfc_mbx_read_top *la)
goto out;
}
- if (phba->hba_flag & HBA_FCOE_MODE) {
+ if (!(phba->hba_flag & HBA_FCOE_MODE)) {
+ cfglink_mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
+ if (!cfglink_mbox)
+ goto out;
+ vport->port_state = LPFC_LOCAL_CFG_LINK;
+ lpfc_config_link(phba, cfglink_mbox);
+ cfglink_mbox->vport = vport;
+ cfglink_mbox->mbox_cmpl = lpfc_mbx_cmpl_local_config_link;
+ rc = lpfc_sli_issue_mbox(phba, cfglink_mbox, MBX_NOWAIT);
+ if (rc == MBX_NOT_FINISHED) {
+ mempool_free(cfglink_mbox, phba->mbox_mem_pool);
+ goto out;
+ }
+ } else {
vport->port_state = LPFC_VPORT_UNKNOWN;
/*
* Add the driver's default FCF record at FCF index 0 now. This
@@ -3327,10 +3359,6 @@ lpfc_mbx_process_link_up(struct lpfc_hba *phba, struct lpfc_mbx_read_top *la)
}
/* Reset FCF roundrobin bmask for new discovery */
lpfc_sli4_clear_fcf_rr_bmask(phba);
- } else {
- if (phba->bbcredit_support && phba->cfg_enable_bbcr &&
- !(phba->link_flag & LS_LOOPBACK_MODE))
- phba->hba_flag |= HBA_DEFER_FLOGI;
}
/* Prepare for LINK up registrations */
diff --git a/drivers/scsi/qla2xxx/qla_dbg.c b/drivers/scsi/qla2xxx/qla_dbg.c
index 88a56e8480f710..269608a9643ec6 100644
--- a/drivers/scsi/qla2xxx/qla_dbg.c
+++ b/drivers/scsi/qla2xxx/qla_dbg.c
@@ -207,6 +207,7 @@ qla24xx_dump_ram(struct qla_hw_data *ha, uint32_t addr, uint32_t *ram,
WRT_REG_WORD(&reg->mailbox0, MBC_DUMP_RISC_RAM_EXTENDED);
WRT_REG_WORD(&reg->mailbox1, LSW(addr));
WRT_REG_WORD(&reg->mailbox8, MSW(addr));
+ WRT_REG_WORD(&reg->mailbox10, 0);
WRT_REG_WORD(&reg->mailbox2, MSW(LSD(dump_dma)));
WRT_REG_WORD(&reg->mailbox3, LSW(LSD(dump_dma)));
diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
index df5b1a7aa3daf8..4a04ed97b90af4 100644
--- a/drivers/scsi/qla2xxx/qla_mbx.c
+++ b/drivers/scsi/qla2xxx/qla_mbx.c
@@ -4176,7 +4176,8 @@ qla2x00_dump_ram(scsi_qla_host_t *vha, dma_addr_t req_dma, uint32_t addr,
if (MSW(addr) || IS_FWI2_CAPABLE(vha->hw)) {
mcp->mb[0] = MBC_DUMP_RISC_RAM_EXTENDED;
mcp->mb[8] = MSW(addr);
- mcp->out_mb = MBX_8|MBX_0;
+ mcp->mb[10] = 0;
+ mcp->out_mb = MBX_10|MBX_8|MBX_0;
} else {
mcp->mb[0] = MBC_DUMP_RISC_RAM;
mcp->out_mb = MBX_0;
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 2f3fc234cc54e5..f251019d59423f 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -132,7 +132,11 @@ show_transport_handle(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct iscsi_internal *priv = dev_to_iscsi_internal(dev);
- return sprintf(buf, "%llu\n", (unsigned long long)iscsi_handle(priv->iscsi_transport));
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)iscsi_handle(priv->iscsi_transport));
}
static DEVICE_ATTR(handle, S_IRUGO, show_transport_handle, NULL);
@@ -142,7 +146,7 @@ show_transport_##name(struct device *dev, \
struct device_attribute *attr,char *buf) \
{ \
struct iscsi_internal *priv = dev_to_iscsi_internal(dev); \
- return sprintf(buf, format"\n", priv->iscsi_transport->name); \
+ return scnprintf(buf, PAGE_SIZE, format"\n", priv->iscsi_transport->name);\
} \
static DEVICE_ATTR(name, S_IRUGO, show_transport_##name, NULL);
@@ -183,7 +187,7 @@ static ssize_t
show_ep_handle(struct device *dev, struct device_attribute *attr, char *buf)
{
struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev);
- return sprintf(buf, "%llu\n", (unsigned long long) ep->id);
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long) ep->id);
}
static ISCSI_ATTR(ep, handle, S_IRUGO, show_ep_handle, NULL);
@@ -2868,6 +2872,9 @@ iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev)
struct iscsi_cls_session *session;
int err = 0, value = 0;
+ if (ev->u.set_param.len > PAGE_SIZE)
+ return -EINVAL;
+
session = iscsi_session_lookup(ev->u.set_param.sid);
conn = iscsi_conn_lookup(ev->u.set_param.sid, ev->u.set_param.cid);
if (!conn || !session)
@@ -3015,6 +3022,9 @@ iscsi_set_host_param(struct iscsi_transport *transport,
if (!transport->set_host_param)
return -ENOSYS;
+ if (ev->u.set_host_param.len > PAGE_SIZE)
+ return -EINVAL;
+
shost = scsi_host_lookup(ev->u.set_host_param.host_no);
if (!shost) {
printk(KERN_ERR "set_host_param could not find host no %u\n",
@@ -3602,6 +3612,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
{
int err = 0;
u32 portid;
+ u32 pdu_len;
struct iscsi_uevent *ev = nlmsg_data(nlh);
struct iscsi_transport *transport = NULL;
struct iscsi_internal *priv;
@@ -3609,6 +3620,9 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
struct iscsi_cls_conn *conn;
struct iscsi_endpoint *ep = NULL;
+ if (!netlink_capable(skb, CAP_SYS_ADMIN))
+ return -EPERM;
+
if (nlh->nlmsg_type == ISCSI_UEVENT_PATH_UPDATE)
*group = ISCSI_NL_GRP_UIP;
else
@@ -3741,6 +3755,14 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
err = -EINVAL;
break;
case ISCSI_UEVENT_SEND_PDU:
+ pdu_len = nlh->nlmsg_len - sizeof(*nlh) - sizeof(*ev);
+
+ if ((ev->u.send_pdu.hdr_size > pdu_len) ||
+ (ev->u.send_pdu.data_size > (pdu_len - ev->u.send_pdu.hdr_size))) {
+ err = -EINVAL;
+ break;
+ }
+
conn = iscsi_conn_lookup(ev->u.send_pdu.sid, ev->u.send_pdu.cid);
if (conn) {
mutex_lock(&conn_mutex);
@@ -3945,7 +3967,7 @@ static ssize_t show_conn_state(struct device *dev,
conn->state < ARRAY_SIZE(connection_state_names))
state = connection_state_names[conn->state];
- return sprintf(buf, "%s\n", state);
+ return scnprintf(buf, PAGE_SIZE, "%s\n", state);
}
static ISCSI_CLASS_ATTR(conn, state, S_IRUGO, show_conn_state, NULL);
@@ -4173,7 +4195,7 @@ show_priv_session_state(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct iscsi_cls_session *session = iscsi_dev_to_session(dev->parent);
- return sprintf(buf, "%s\n", iscsi_session_state_name(session->state));
+ return scnprintf(buf, PAGE_SIZE, "%s\n", iscsi_session_state_name(session->state));
}
static ISCSI_CLASS_ATTR(priv_sess, state, S_IRUGO, show_priv_session_state,
NULL);
@@ -4182,7 +4204,7 @@ show_priv_session_creator(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct iscsi_cls_session *session = iscsi_dev_to_session(dev->parent);
- return sprintf(buf, "%d\n", session->creator);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", session->creator);
}
static ISCSI_CLASS_ATTR(priv_sess, creator, S_IRUGO, show_priv_session_creator,
NULL);
@@ -4191,7 +4213,7 @@ show_priv_session_target_id(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct iscsi_cls_session *session = iscsi_dev_to_session(dev->parent);
- return sprintf(buf, "%d\n", session->target_id);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", session->target_id);
}
static ISCSI_CLASS_ATTR(priv_sess, target_id, S_IRUGO,
show_priv_session_target_id, NULL);
@@ -4204,8 +4226,8 @@ show_priv_session_##field(struct device *dev, \
struct iscsi_cls_session *session = \
iscsi_dev_to_session(dev->parent); \
if (session->field == -1) \
- return sprintf(buf, "off\n"); \
- return sprintf(buf, format"\n", session->field); \
+ return scnprintf(buf, PAGE_SIZE, "off\n"); \
+ return scnprintf(buf, PAGE_SIZE, format"\n", session->field); \
}
#define iscsi_priv_session_attr_store(field) \
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 5cca98c98bdb41..9746dad8f9168a 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -491,8 +491,7 @@ EXPORT_SYMBOL(iscsit_queue_rsp);
void iscsit_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
{
spin_lock_bh(&conn->cmd_lock);
- if (!list_empty(&cmd->i_conn_node) &&
- !(cmd->se_cmd.transport_state & CMD_T_FABRIC_STOP))
+ if (!list_empty(&cmd->i_conn_node))
list_del_init(&cmd->i_conn_node); spin_unlock_bh(&conn->cmd_lock); @@ -4075,12 +4074,22 @@ static void iscsit_release_commands_from_conn(struct iscsi_conn *conn) spin_lock_bh(&conn->cmd_lock); list_splice_init(&conn->conn_cmd_list, &tmp_list); - list_for_each_entry(cmd, &tmp_list, i_conn_node) { + list_for_each_entry_safe(cmd, cmd_tmp, &tmp_list, i_conn_node) { struct se_cmd *se_cmd = &cmd->se_cmd; if (se_cmd->se_tfo != NULL) { spin_lock_irq(&se_cmd->t_state_lock); - se_cmd->transport_state |= CMD_T_FABRIC_STOP; + if (se_cmd->transport_state & CMD_T_ABORTED) { + /* + * LIO's abort path owns the cleanup for this, + * so put it back on the list and let + * aborted_task handle it. + */ + list_move_tail(&cmd->i_conn_node, + &conn->conn_cmd_list); + } else { + se_cmd->transport_state |= CMD_T_FABRIC_STOP; + } spin_unlock_irq(&se_cmd->t_state_lock); } } diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index 076c167b88992a..14d326990bffd3 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -129,14 +129,15 @@ void core_tmr_abort_task( struct se_tmr_req *tmr, struct se_session *se_sess) { - struct se_cmd *se_cmd; + struct se_cmd *se_cmd, *next; unsigned long flags; + bool rc; u64 ref_tag; - spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); - list_for_each_entry(se_cmd, &se_sess->sess_cmd_list, se_cmd_list) { + spin_lock_irqsave(&dev->execute_task_lock, flags); + list_for_each_entry_safe(se_cmd, next, &dev->state_list, state_list) { - if (dev != se_cmd->se_dev) + if (se_sess != se_cmd->se_sess) continue; /* skip task management functions, including tmr->task_cmd */ @@ -150,11 +151,16 @@ void core_tmr_abort_task( printk("ABORT_TASK: Found referenced %s task_tag: %llu\n", se_cmd->se_tfo->fabric_name, ref_tag); - if (!__target_check_io_state(se_cmd, se_sess, - dev->dev_attrib.emulate_tas)) + spin_lock(&se_sess->sess_cmd_lock); + rc = __target_check_io_state(se_cmd, se_sess, 0); + spin_unlock(&se_sess->sess_cmd_lock); + if (!rc) continue; - spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags); + list_del_init(&se_cmd->state_list); + se_cmd->state_active = false; + + spin_unlock_irqrestore(&dev->execute_task_lock, flags); /* * Ensure that this ABORT request is visible to the LU RESET @@ -172,7 +178,7 @@ void core_tmr_abort_task( atomic_long_inc(&dev->aborts_complete); return; } - spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags); + spin_unlock_irqrestore(&dev->execute_task_lock, flags); printk("ABORT_TASK: Sending TMR_TASK_DOES_NOT_EXIST for ref_tag: %lld\n", tmr->ref_task_tag); diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 4a0ff380b31ab6..7ce631df31bb3b 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -55,60 +55,83 @@ static int target_xcopy_gen_naa_ieee(struct se_device *dev, unsigned char *buf) return 0; } -struct xcopy_dev_search_info { - const unsigned char *dev_wwn; - struct se_device *found_dev; -}; - +/** + * target_xcopy_locate_se_dev_e4_iter - compare XCOPY NAA device identifiers + * + * @se_dev: device being considered for match + * @dev_wwn: XCOPY requested NAA dev_wwn + * @return: 1 on match, 0 on no-match + */ static int target_xcopy_locate_se_dev_e4_iter(struct se_device *se_dev, - void *data) + const unsigned char *dev_wwn) { - struct xcopy_dev_search_info *info = data; unsigned char tmp_dev_wwn[XCOPY_NAA_IEEE_REGEX_LEN]; int rc; - if (!se_dev->dev_attrib.emulate_3pc) + if (!se_dev->dev_attrib.emulate_3pc) { + pr_debug("XCOPY: 
emulate_3pc disabled on se_dev %p\n", se_dev); return 0; + } memset(&tmp_dev_wwn[0], 0, XCOPY_NAA_IEEE_REGEX_LEN); target_xcopy_gen_naa_ieee(se_dev, &tmp_dev_wwn[0]); - rc = memcmp(&tmp_dev_wwn[0], info->dev_wwn, XCOPY_NAA_IEEE_REGEX_LEN); - if (rc != 0) - return 0; - - info->found_dev = se_dev; - pr_debug("XCOPY 0xe4: located se_dev: %p\n", se_dev); - - rc = target_depend_item(&se_dev->dev_group.cg_item); + rc = memcmp(&tmp_dev_wwn[0], dev_wwn, XCOPY_NAA_IEEE_REGEX_LEN); if (rc != 0) { - pr_err("configfs_depend_item attempt failed: %d for se_dev: %p\n", - rc, se_dev); - return rc; + pr_debug("XCOPY: skip non-matching: %*ph\n", + XCOPY_NAA_IEEE_REGEX_LEN, tmp_dev_wwn); + return 0; } + pr_debug("XCOPY 0xe4: located se_dev: %p\n", se_dev); - pr_debug("Called configfs_depend_item for se_dev: %p se_dev->se_dev_group: %p\n", - se_dev, &se_dev->dev_group); return 1; } -static int target_xcopy_locate_se_dev_e4(const unsigned char *dev_wwn, - struct se_device **found_dev) +static int target_xcopy_locate_se_dev_e4(struct se_session *sess, + const unsigned char *dev_wwn, + struct se_device **_found_dev, + struct percpu_ref **_found_lun_ref) { - struct xcopy_dev_search_info info; - int ret; - - memset(&info, 0, sizeof(info)); - info.dev_wwn = dev_wwn; - - ret = target_for_each_device(target_xcopy_locate_se_dev_e4_iter, &info); - if (ret == 1) { - *found_dev = info.found_dev; - return 0; - } else { - pr_debug_ratelimited("Unable to locate 0xe4 descriptor for EXTENDED_COPY\n"); - return -EINVAL; + struct se_dev_entry *deve; + struct se_node_acl *nacl; + struct se_lun *this_lun = NULL; + struct se_device *found_dev = NULL; + + /* cmd with NULL sess indicates no associated $FABRIC_MOD */ + if (!sess) + goto err_out; + + pr_debug("XCOPY 0xe4: searching for: %*ph\n", + XCOPY_NAA_IEEE_REGEX_LEN, dev_wwn); + + nacl = sess->se_node_acl; + rcu_read_lock(); + hlist_for_each_entry_rcu(deve, &nacl->lun_entry_hlist, link) { + struct se_device *this_dev; + int rc; + + this_lun = rcu_dereference(deve->se_lun); + this_dev = rcu_dereference_raw(this_lun->lun_se_dev); + + rc = target_xcopy_locate_se_dev_e4_iter(this_dev, dev_wwn); + if (rc) { + if (percpu_ref_tryget_live(&this_lun->lun_ref)) + found_dev = this_dev; + break; + } } + rcu_read_unlock(); + if (found_dev == NULL) + goto err_out; + + pr_debug("lun_ref held for se_dev: %p se_dev->se_dev_group: %p\n", + found_dev, &found_dev->dev_group); + *_found_dev = found_dev; + *_found_lun_ref = &this_lun->lun_ref; + return 0; +err_out: + pr_debug_ratelimited("Unable to locate 0xe4 descriptor for EXTENDED_COPY\n"); + return -EINVAL; } static int target_xcopy_parse_tiddesc_e4(struct se_cmd *se_cmd, struct xcopy_op *xop, @@ -255,12 +278,16 @@ static int target_xcopy_parse_target_descriptors(struct se_cmd *se_cmd, switch (xop->op_origin) { case XCOL_SOURCE_RECV_OP: - rc = target_xcopy_locate_se_dev_e4(xop->dst_tid_wwn, - &xop->dst_dev); + rc = target_xcopy_locate_se_dev_e4(se_cmd->se_sess, + xop->dst_tid_wwn, + &xop->dst_dev, + &xop->remote_lun_ref); break; case XCOL_DEST_RECV_OP: - rc = target_xcopy_locate_se_dev_e4(xop->src_tid_wwn, - &xop->src_dev); + rc = target_xcopy_locate_se_dev_e4(se_cmd->se_sess, + xop->src_tid_wwn, + &xop->src_dev, + &xop->remote_lun_ref); break; default: pr_err("XCOPY CSCD descriptor IDs not found in CSCD list - " @@ -405,18 +432,12 @@ static int xcopy_pt_get_cmd_state(struct se_cmd *se_cmd) static void xcopy_pt_undepend_remotedev(struct xcopy_op *xop) { - struct se_device *remote_dev; - if (xop->op_origin == XCOL_SOURCE_RECV_OP) - remote_dev = 
xop->dst_dev; + pr_debug("putting dst lun_ref for %p\n", xop->dst_dev); else - remote_dev = xop->src_dev; - - pr_debug("Calling configfs_undepend_item for" - " remote_dev: %p remote_dev->dev_group: %p\n", - remote_dev, &remote_dev->dev_group.cg_item); + pr_debug("putting src lun_ref for %p\n", xop->src_dev); - target_undepend_item(&remote_dev->dev_group.cg_item); + percpu_ref_put(xop->remote_lun_ref); } static void xcopy_pt_release_cmd(struct se_cmd *se_cmd) diff --git a/drivers/target/target_core_xcopy.h b/drivers/target/target_core_xcopy.h index 26ba4c3c9cffda..974bc1e19ff2be 100644 --- a/drivers/target/target_core_xcopy.h +++ b/drivers/target/target_core_xcopy.h @@ -29,6 +29,7 @@ struct xcopy_op { struct se_device *dst_dev; unsigned char dst_tid_wwn[XCOPY_NAA_IEEE_REGEX_LEN]; unsigned char local_dev_wwn[XCOPY_NAA_IEEE_REGEX_LEN]; + struct percpu_ref *remote_lun_ref; sector_t src_lba; sector_t dst_lba; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2bf8841ec490af..96f95f47afd34b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1876,9 +1876,11 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) * not already there, and calling reverse_path_check() * during ep_insert(). */ - if (list_empty(&epi->ffd.file->f_tfile_llink)) - list_add(&epi->ffd.file->f_tfile_llink, - &tfile_check_list); + if (list_empty(&epi->ffd.file->f_tfile_llink)) { + if (get_file_rcu(epi->ffd.file)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); + } } } mutex_unlock(&ep->mtx); @@ -1922,6 +1924,7 @@ static void clear_tfile_check_list(void) file = list_first_entry(&tfile_check_list, struct file, f_tfile_llink); list_del_init(&file->f_tfile_llink); + fput(file); } INIT_LIST_HEAD(&tfile_check_list); } @@ -2082,25 +2085,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; - if (ep_loop_check(ep, tf.file) != 0) { - clear_tfile_check_list(); + if (ep_loop_check(ep, tf.file) != 0) goto error_tgt_fput; - } - } else + } else { + get_file(tf.file); list_add(&tf.file->f_tfile_llink, &tfile_check_list); + } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); - if (error) { -out_del: - list_del(&tf.file->f_tfile_llink); + if (error) goto error_tgt_fput; - } if (is_file_epoll(tf.file)) { tep = tf.file->private_data; error = epoll_mutex_lock(&tep->mtx, 1, nonblock); if (error) { mutex_unlock(&ep->mtx); - goto out_del; + goto error_tgt_fput; } } } @@ -2121,8 +2121,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; - if (full_check) - clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -2145,8 +2143,10 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, mutex_unlock(&ep->mtx); error_tgt_fput: - if (full_check) + if (full_check) { + clear_tfile_check_list(); mutex_unlock(&epmutex); + } fdput(tf); error_fput: diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6a38fbfbe6f872..94b90cf045d4ad 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1041,7 +1041,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); - INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) + INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { @@ -1893,7 +1894,12 @@ bool gfs2_delete_work_queued(const 
struct gfs2_glock *gl) static void flush_delete_work(struct gfs2_glock *gl) { - flush_delayed_work(&gl->gl_delete); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { + if (cancel_delayed_work(&gl->gl_delete)) { + queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, 0); + } + } gfs2_glock_queue_work(gl, 0); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8fa28ceae25c00..0863eb8fc7242b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -152,6 +152,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) goto fail; + if (blktype != GFS2_BLKST_UNLINKED) + gfs2_cancel_delete_work(io_gl); if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) { /* @@ -182,7 +184,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail; - gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -726,13 +727,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, flush_delayed_work(&ip->i_gl->gl_work); glock_set_object(ip->i_gl, ip); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_free_inode; + gfs2_cancel_delete_work(io_gl); + glock_set_object(io_gl, ip); + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock2; error = gfs2_trans_begin(sdp, blocks, 0); if (error) - goto fail_free_inode; + goto fail_gunlock2; if (blocks > 1) { ip->i_eattr = ip->i_no_addr + 1; @@ -741,18 +748,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, init_dinode(dip, ip, symname); gfs2_trans_end(sdp); - error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (error) - goto fail_free_inode; - BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags)); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (error) goto fail_gunlock2; - gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); - glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -804,6 +805,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_gunlock2: clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + glock_clear_object(io_gl, ip); gfs2_glock_put(io_gl); fail_free_inode: if (ip->i_gl) { diff --git a/include/linux/audit.h b/include/linux/audit.h index d7e5e0ceea2944..d054416c187e8d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -279,7 +279,6 @@ extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); - extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); @@ -566,14 +565,6 @@ static inline struct filename *audit_reusename(const __user char *name) } static inline void audit_getname(struct filename *name) { } -static inline void __audit_inode(struct filename *name, - const struct dentry *dentry, - unsigned int flags) -{ } -static inline void 
__audit_inode_child(struct inode *parent, - const struct dentry *dentry, - const unsigned char type) -{ } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) diff --git a/include/linux/dmar.h b/include/linux/dmar.h index efcfd229ab8344..368a86c1e0e55c 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -60,6 +60,7 @@ struct dmar_drhd_unit { u16 segment; /* PCI domain */ u8 ignored:1; /* ignore drhd */ u8 include_all:1; + u8 gfx_dedicated:1; /* graphic dedicated */ struct intel_iommu *iommu; }; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 6fd0e91bbeb907..bf6f2a61a15ed0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -569,6 +569,8 @@ struct intel_iommu { struct iommu_device iommu; /* IOMMU core code handle */ int node; u32 flags; /* Software defined flags */ + + struct dmar_drhd_unit *drhd; }; /* PCI domain-device relationship */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 00d82280d71489..e01936a944904b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2110,7 +2110,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum memmap_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 31bf67ac9b4d46..9c8d3f85a66481 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -859,10 +859,15 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx); -enum memmap_context { - MEMMAP_EARLY, - MEMMAP_HOTPLUG, +/* + * Memory initialization context, use to differentiate memory added by + * the platform statically or via memory hotplug interface. 
+ */ +enum meminit_context { + MEMINIT_EARLY, + MEMINIT_HOTPLUG, }; + extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); diff --git a/include/linux/node.h b/include/linux/node.h index 4866f32a02d8d4..8e5a29897936c8 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -99,13 +99,14 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) -extern int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn); +void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else -static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn) +static inline void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context) { - return 0; } #endif @@ -128,7 +129,7 @@ static inline int register_one_node(int nid) if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, start_pfn, end_pfn); + link_mem_sections(nid, start_pfn, end_pfn, MEMINIT_EARLY); } return error; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3d427e9b6d6e4a..397b093b625cf6 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -226,8 +226,9 @@ void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long low_limit, unsigned long high_limit); -bool inet_ehash_insert(struct sock *sk, struct sock *osk); -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, + bool *found_dup_sk); int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/kernel/audit.c b/kernel/audit.c index 1d141178e6c588..3f65235f68407c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1864,7 +1864,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_clear_dummy(ab->ctx); + /* cancel dummy context to enable supporting records */ + if (ctx) + ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); diff --git a/kernel/audit.h b/kernel/audit.h index dde1e47f3afcc2..5992c7dc939c36 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -304,13 +304,6 @@ extern int audit_signal_info_syscall(struct task_struct *t); extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); - -static inline void audit_clear_dummy(struct audit_context *ctx) -{ - if (ctx) - ctx->dummy = 0; -} - #else /* CONFIG_AUDITSYSCALL */ #define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} @@ -345,7 +338,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) } #define audit_filter_inodes(t, c) AUDIT_DISABLED -#define audit_clear_dummy(c) {} #endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 468a2339045784..d567ef4e0de477 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -911,6 +911,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) context->prio = state 
== AUDIT_RECORD_CONTEXT ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); + context->fds[0] = -1; return context; } @@ -1349,7 +1350,10 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); + if (context->pwd.dentry && context->pwd.mnt) + audit_log_d_path(ab, " name=", &context->pwd); + else + audit_log_format(ab, " name=(null)"); break; default: /* log the name's directory component */ @@ -1845,6 +1849,8 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, list_add_tail(&aname->list, &context->names_list); context->name_count++; + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); return aname; } @@ -1896,9 +1902,6 @@ void __audit_getname(struct filename *name) n->name_len = AUDIT_NAME_FULL; name->aname = n; name->refcnt++; - - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); } static inline int audit_copy_fcaps(struct audit_names *name, diff --git a/kernel/futex.c b/kernel/futex.c index 1dfb63ed84b1ed..1cf4a98ac54a45 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -489,7 +489,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -510,8 +510,8 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. */ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) +static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -548,7 +548,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a again: /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(fshared))) + if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); @@ -636,7 +636,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a * A RO anonymous page will never change and thus doesn't make * sense for futex operations. 
*/
- if (unlikely(should_fail_futex(fshared)) || ro) {
+ if (unlikely(should_fail_futex(true)) || ro) {
err = -EFAULT;
goto out;
}
@@ -687,10 +687,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
return err;
}
-static inline void put_futex_key(union futex_key *key)
-{
-}
-
/**
* fault_in_user_writeable() - Fault in user address and verify RW access
* @uaddr: pointer to faulting user space address
@@ -795,6 +791,29 @@ static struct futex_pi_state *alloc_pi_state(void)
return pi_state;
}
+static void pi_state_update_owner(struct futex_pi_state *pi_state,
+ struct task_struct *new_owner)
+{
+ struct task_struct *old_owner = pi_state->owner;
+
+ lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
+
+ if (old_owner) {
+ raw_spin_lock(&old_owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&old_owner->pi_lock);
+ }
+
+ if (new_owner) {
+ raw_spin_lock(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ raw_spin_unlock(&new_owner->pi_lock);
+ }
+}
+
static void get_pi_state(struct futex_pi_state *pi_state)
{
WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
@@ -817,17 +836,12 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
- struct task_struct *owner;
+ unsigned long flags;
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- owner = pi_state->owner;
- if (owner) {
- raw_spin_lock(&owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock(&owner->pi_lock);
- }
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
+ pi_state_update_owner(pi_state, NULL);
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex);
+ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
}
if (current->pi_state_cache) {
@@ -972,7 +986,8 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
* FUTEX_OWNER_DIED bit. See [4]
*
* [10] There is no transient state which leaves owner and user space
- * TID out of sync.
+ * TID out of sync. Except one error case where the kernel is denied
+ * write access to the user address, see fixup_pi_state_owner().
*
*
* Serialization and lifetime rules:
@@ -1533,8 +1548,10 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- if (unlikely(should_fail_futex(true)))
+ if (unlikely(should_fail_futex(true))) {
ret = -EFAULT;
+ goto out_unlock;
+ }
ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
if (!ret && (curval != uval)) {
@@ -1550,26 +1567,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
ret = -EINVAL;
}
- if (ret)
- goto out_unlock;
-
- /*
- * This is a point of no return; once we modify the uval there is no
- * going back and subsequent operations must not fail.
- */ - - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - - raw_spin_lock(&new_owner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); - pi_state->owner = new_owner; - raw_spin_unlock(&new_owner->pi_lock); - - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + if (!ret) { + /* + * This is a point of no return; once we modified the uval + * there is no going back and subsequent operations must + * not fail. + */ + pi_state_update_owner(pi_state, new_owner); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + } out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); @@ -1621,13 +1627,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out_put_key; + return ret; spin_lock(&hb->lock); @@ -1650,9 +1656,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); -out: return ret; } @@ -1720,10 +1723,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out_put_key1; + return ret; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1741,13 +1744,13 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, * an MMU, but we might get them from range checking */ ret = op_ret; - goto out_put_keys; + return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out_put_keys; + return ret; } if (!(flags & FLAGS_SHARED)) { @@ -1755,8 +1758,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, goto retry_private; } - put_futex_key(&key2); - put_futex_key(&key1); cond_resched(); goto retry; } @@ -1792,11 +1793,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret; } @@ -2003,20 +1999,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out_put_key1; + return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. 
We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out_put_keys; - } + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2036,13 +2030,11 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ret = get_user(curval, uaddr1); if (ret) - goto out_put_keys; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&key2); - put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2101,12 +2093,10 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - goto out; + return ret; case -EBUSY: case -EAGAIN: /* @@ -2117,8 +2107,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2227,12 +2215,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); - -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret ? ret : task_count; } @@ -2377,18 +2359,13 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *argowner) +static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) { struct futex_pi_state *pi_state = q->pi_state; - u32 uval, uninitialized_var(curval), newval; struct task_struct *oldowner, *newowner; - u32 newtid; - int ret, err = 0; - - lockdep_assert_held(q->lock_ptr); - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + u32 uval, curval, newval, newtid; + int err = 0; oldowner = pi_state->owner; @@ -2422,14 +2399,12 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 0; - goto out_unlock; + return 0; } if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { - /* We got the lock after all, nothing to fix. */ - ret = 0; - goto out_unlock; + /* We got the lock. pi_state is correct. Tell caller. */ + return 1; } /* @@ -2444,8 +2419,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 0; - goto out_unlock; + return 1; } newowner = argowner; } @@ -2475,22 +2449,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * We fixed up user space. Now we need to fix the pi_state * itself. 
*/ - if (pi_state->owner != NULL) { - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - } - - pi_state->owner = newowner; + pi_state_update_owner(pi_state, newowner); - raw_spin_lock(&newowner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock(&newowner->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - - return 0; + return argowner == current; /* * In order to reschedule or handle a page fault, we need to drop the @@ -2511,17 +2472,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, switch (err) { case -EFAULT: - ret = fault_in_user_writeable(uaddr); + err = fault_in_user_writeable(uaddr); break; case -EAGAIN: cond_resched(); - ret = 0; + err = 0; break; default: WARN_ON_ONCE(1); - ret = err; break; } @@ -2531,17 +2491,44 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * Check if someone else fixed it for us: */ - if (pi_state->owner != oldowner) { - ret = 0; - goto out_unlock; - } + if (pi_state->owner != oldowner) + return argowner == current; - if (ret) - goto out_unlock; + /* Retry if err was -EAGAIN or the fault in succeeded */ + if (!err) + goto retry; - goto retry; + /* + * fault_in_user_writeable() failed so user state is immutable. At + * best we can make the kernel state consistent but user state will + * be most likely hosed and any subsequent unlock operation will be + * rejected due to PI futex rule [10]. + * + * Ensure that the rtmutex owner is also the pi_state owner despite + * the user space value claiming something different. There is no + * point in unlocking the rtmutex if current is the owner as it + * would need to wait until the next waiter has taken the rtmutex + * to guarantee consistent state. Keep it simple. Userspace asked + * for this wreckaged state. + * + * The rtmutex has an owner - either current or some other + * task. See the EAGAIN loop above. + */ + pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); -out_unlock: + return err; +} + +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) +{ + struct futex_pi_state *pi_state = q->pi_state; + int ret; + + lockdep_assert_held(q->lock_ptr); + + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + ret = __fixup_pi_state_owner(uaddr, q, argowner); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } @@ -2565,8 +2552,6 @@ static long futex_wait_restart(struct restart_block *restart); */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { - int ret = 0; - if (locked) { /* * Got the lock. We might not be the anticipated owner if we @@ -2577,8 +2562,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current); - goto out; + return fixup_pi_state_owner(uaddr, q, current); + return 1; } /* @@ -2589,24 +2574,17 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Another speculative read; pi_state->owner == current is unstable * but needs our attention. */ - if (q->pi_state->owner == current) { - ret = fixup_pi_state_owner(uaddr, q, NULL); - goto out; - } + if (q->pi_state->owner == current) + return fixup_pi_state_owner(uaddr, q, NULL); /* * Paranoia check. 
If we did not take the lock, then we should not be - * the owner of the rt_mutex. + * the owner of the rt_mutex. Warn and establish consistent state. */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { - printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " - "pi-state %p\n", ret, - q->pi_state->pi_mutex.owner, - q->pi_state->owner); - } + if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) + return fixup_pi_state_owner(uaddr, q, current); -out: - return ret ? ret : locked; + return 0; } /** @@ -2703,12 +2681,11 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ret = get_user(uval, uaddr); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q->key); goto retry; } @@ -2717,9 +2694,6 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ret = -EWOULDBLOCK; } -out: - if (ret) - put_futex_key(&q->key); return ret; } @@ -2818,7 +2792,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -2864,7 +2837,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, * - EAGAIN: The user space value changed. */ queue_unlock(hb); - put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2955,30 +2927,13 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; - /* - * If fixup_owner() faulted and was unable to handle the fault, unlock - * it and return the fault to userspace. - */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* Unqueue and drop the lock */ unqueue_me_pi(&q); - - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - - goto out_put_key; + goto out; out_unlock_put_key: queue_unlock(hb); -out_put_key: - put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2991,12 +2946,11 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ret = fault_in_user_writeable(uaddr); if (ret) - goto out_put_key; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q.key); goto retry; } @@ -3125,16 +3079,13 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) out_unlock: spin_unlock(&hb->lock); out_putkey: - put_futex_key(&key); return ret; pi_retry: - put_futex_key(&key); cond_resched(); goto retry; pi_faulted: - put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3237,7 +3188,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; @@ -3276,7 +3226,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -3285,7 +3235,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out_put_keys; + goto out; } /* Queue the futex_q, drop the 
hb lock, wait for wakeup. */ @@ -3295,7 +3245,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out_put_keys; + goto out; /* * In order for us to be here, we know our q.key == key2, and since @@ -3315,16 +3265,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. */ put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. + */ + ret = ret < 0 ? ret : 0; } } else { struct rt_mutex *pi_mutex; @@ -3355,25 +3306,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; - /* - * If fixup_pi_state_owner() faulted and was unable to handle - * the fault, unlock the rt_mutex and return the fault to - * userspace. - */ - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling @@ -3385,11 +3321,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - out: if (to) { hrtimer_cancel(&to->timer); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index d40dfc9676f415..a1b056f604f1ab 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1717,8 +1717,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, * possible because it belongs to the pi_state which is about to be freed * and it is not longer visible to other tasks. 
*/ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) +void rt_mutex_proxy_unlock(struct rt_mutex *lock) { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index d1d62f942be228..ca6fb489007b6b 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -133,8 +133,7 @@ enum rtmutex_chainwalk { extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3523ddcaf49ecf..c6ce010beb882c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -243,8 +243,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -269,7 +270,6 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -277,9 +277,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); - - hrtimer_set_expires(timer, time); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); if (rq == this_rq()) { __hrtick_restart(rq); @@ -3538,7 +3536,7 @@ static void __sched notrace __schedule(bool preempt) schedule_debug(prev); - if (sched_feat(HRTICK)) + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b018ceb6853b55..f9ae07b9f7e96e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1783,7 +1783,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, p); if (rq->curr->sched_class != &dl_sched_class) @@ -1846,7 +1846,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * not being the leftmost task anymore. In that case NEED_RESCHED will * be set and schedule() will start a new hrtick for the next task. 
*/ - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a583f7a56653c..a8f9fccb57bc72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5164,7 +5164,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -6758,7 +6758,7 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif - if (hrtick_enabled(rq)) + if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); update_misfit_status(p, rq); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7481cd96f39155..d9aedf241ba541 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -38,6 +38,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) +SCHED_FEAT(HRTICK_DL, false) SCHED_FEAT(DOUBLE_TICK, false) /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 848f875debb562..253a03757e801e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -959,7 +959,11 @@ struct rq { struct cpuidle_state *idle_state; #endif +#if defined(CONFIG_SCHED_HRTICK) && defined(CONFIG_SMP) + RH_KABI_USE(1, ktime_t hrtick_time) +#else RH_KABI_RESERVE(1) +#endif RH_KABI_RESERVE(2) #ifdef CONFIG_NUMA_BALANCING RH_KABI_EXTEND(unsigned int numa_migrate_on) @@ -1940,17 +1944,39 @@ extern const_debug unsigned int sysctl_sched_migration_cost; */ static inline int hrtick_enabled(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } +static inline int hrtick_enabled_fair(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtick_enabled(rq); +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + if (!sched_feat(HRTICK_DL)) + return 0; + return hrtick_enabled(rq); +} + void hrtick_start(struct rq *rq, u64 delay); #else +static inline int hrtick_enabled_fair(struct rq *rq) +{ + return 0; +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + return 0; +} + static inline int hrtick_enabled(struct rq *rq) { return 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ffe9399788d2da..b95e26d1b4c962 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -743,7 +743,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMMAP_HOTPLUG, altmap); + MEMINIT_HOTPLUG, altmap); set_zone_contiguous(zone); } @@ -1117,8 +1117,8 @@ int __ref add_memory_resource(int nid, struct resource *res) } /* link memory sections under this node.*/ - ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); - BUG_ON(ret); + link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); diff --git a/mm/mmap.c b/mm/mmap.c index 0dd353d446ade5..df7fab1c1145b7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3072,9 +3072,7 @@ void exit_mmap(struct mm_struct *mm) * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. 
*/ - mutex_lock(&oom_lock); __oom_reap_task_mm(mm); - mutex_unlock(&oom_lock); set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e09692e9bf2540..28fad5c96903eb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -530,28 +530,9 @@ void __oom_reap_task_mm(struct mm_struct *mm) static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { - bool ret = true; - - /* - * We have to make sure to not race with the victim exit path - * and cause premature new oom victim selection: - * oom_reap_task_mm exit_mm - * mmget_not_zero - * mmput - * atomic_dec_and_test - * exit_oom_victim - * [...] - * out_of_memory - * select_bad_process - * # no TIF_MEMDIE task selects new victim - * unmap_page_range # frees some memory - */ - mutex_lock(&oom_lock); - if (!down_read_trylock(&mm->mmap_sem)) { - ret = false; trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + return false; } /* @@ -563,7 +544,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) if (mm_has_blockable_invalidate_notifiers(mm)) { up_read(&mm->mmap_sem); schedule_timeout_idle(HZ); - goto unlock_oom; + return true; } /* @@ -575,7 +556,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) if (test_bit(MMF_OOM_SKIP, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + return true; } trace_start_task_reaping(tsk->pid); @@ -590,9 +571,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) up_read(&mm->mmap_sem); trace_finish_task_reaping(tsk->pid); -unlock_oom: - mutex_unlock(&oom_lock); - return ret; + return true; } #define MAX_OOM_REAP_RETRIES 10 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e8629750b09e5..5198110a21229a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5565,7 +5565,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat) * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum memmap_context context, + unsigned long start_pfn, enum meminit_context context, struct vmem_altmap *altmap) { unsigned long end_pfn = start_pfn + size; @@ -5603,7 +5603,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context != MEMMAP_EARLY) + if (context != MEMINIT_EARLY) goto not_early; if (!early_pfn_valid(pfn)) @@ -5638,7 +5638,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); - if (context == MEMMAP_HOTPLUG) + if (context == MEMINIT_HOTPLUG) __SetPageReserved(page); /* @@ -5723,7 +5723,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. 
* - * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ if (!(pfn & (pageblock_nr_pages - 1))) { @@ -5748,7 +5748,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) + memmap_init_zone((size), (nid), (zone), (start_pfn), MEMINIT_EARLY, NULL) #endif static int zone_batchsize(struct zone *zone) diff --git a/mm/slub.c b/mm/slub.c index 38af38099f1550..d7ab0b7a1ee05d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2705,7 +2705,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) { + if (unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); stat(s, ALLOC_SLOWPATH); } else { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 176bddacc16eb7..7e93087d136674 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -428,7 +428,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); if (*own_req) ireq->ireq_opt = NULL; else diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 945435bac38a84..a34e4be166ccd0 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -533,7 +533,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, dccp_done(newsk); goto out; } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 87a178665c2f83..5372f5b068cdc4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -761,7 +761,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); - inet_ehash_insert(req_to_sk(req), NULL); + inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 411dd7a90046ef..b0d7b81cf489de 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -24,6 +24,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif #include #include #include @@ -511,10 +514,52 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -/* insert a socket into ehash, and eventually remove another one - * (The another one can be a SYN_RECV or TIMEWAIT +/* Searches for an exsiting socket in the ehash bucket list. + * Returns true if found, false otherwise. 
*/ -bool inet_ehash_insert(struct sock *sk, struct sock *osk) +static bool inet_ehash_lookup_by_sk(struct sock *sk, + struct hlist_nulls_head *list) +{ + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); + const int sdif = sk->sk_bound_dev_if; + const int dif = sk->sk_bound_dev_if; + const struct hlist_nulls_node *node; + struct net *net = sock_net(sk); + struct sock *esk; + + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); + + sk_nulls_for_each_rcu(esk, node, list) { + if (esk->sk_hash != sk->sk_hash) + continue; + if (sk->sk_family == AF_INET) { + if (unlikely(INET_MATCH(esk, net, acookie, + sk->sk_daddr, + sk->sk_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#if IS_ENABLED(CONFIG_IPV6) + else if (sk->sk_family == AF_INET6) { + if (unlikely(INET6_MATCH(esk, net, + &sk->sk_v6_daddr, + &sk->sk_v6_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#endif + } + return false; +} + +/* Insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT) + * If an existing socket already exists, socket sk is not inserted, + * and sets found_dup_sk parameter to true. + */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; @@ -533,16 +578,23 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk) if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); + } else if (found_dup_sk) { + *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); + if (*found_dup_sk) + ret = false; } + if (ret) __sk_nulls_add_node_rcu(sk, list); + spin_unlock(lock); + return ret; } -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - bool ok = inet_ehash_insert(sk, osk); + bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -585,7 +637,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { - inet_ehash_nolisten(sk, osk); + inet_ehash_nolisten(sk, osk, NULL); return 0; } WARN_ON(!sk_unhashed(sk)); @@ -682,7 +734,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); spin_unlock_bh(&head->lock); return 0; } @@ -758,7 +810,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cec58dbdfbe309..c6e89afa1ce57f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1404,6 +1404,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bool *own_req) { struct inet_request_sock *ireq; + bool found_dup_sk = false; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; @@ -1474,12 +1475,22 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = 
inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (likely(*own_req)) { tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { newinet->inet_opt = NULL; + + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } } return newsk; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 46e60f6a053188..f6c6f74cd9bc5f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1088,6 +1088,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct ipv6_txoptions *opt; struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; + bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG @@ -1258,7 +1259,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_done(newsk); goto out; } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); @@ -1273,6 +1275,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * skb_set_owner_r(newnp->pktoptions, newsk); } } + } else { + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } } return newsk; diff --git a/net/sctp/proc.c b/net/sctp/proc.c index a644292f9fafd6..84f79ac4b9842a 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -230,6 +230,12 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; + if (v && v != SEQ_START_TOKEN) { + struct sctp_transport *transport = v; + + sctp_transport_put(transport); + } + sctp_transport_walk_stop(&iter->hti); } @@ -237,6 +243,12 @@ static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; + if (v && v != SEQ_START_TOKEN) { + struct sctp_transport *transport = v; + + sctp_transport_put(transport); + } + ++*pos; return sctp_transport_get_next(seq_file_net(seq), &iter->hti); @@ -292,8 +304,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) sk->sk_rcvbuf); seq_printf(seq, "\n"); - sctp_transport_put(transport); - return 0; } @@ -369,8 +379,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - sctp_transport_put(transport); - return 0; } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7de3364f7394da..5d613bb568edfe 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -926,10 +926,12 @@ static int vsock_shutdown(struct socket *sock, int mode) */ sk = sock->sk; + + lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sk->sk_type == SOCK_STREAM) - return err; + goto out; } else { sock->state = SS_DISCONNECTING; err = 0; @@ -938,10 +940,8 @@ static int vsock_shutdown(struct socket *sock, int mode) /* Receive and send shutdowns are treated alike. 
*/ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { - lock_sock(sk); sk->sk_shutdown |= mode; sk->sk_state_change(sk); - release_sock(sk); if (sk->sk_type == SOCK_STREAM) { sock_reset_flag(sk, SOCK_DONE); @@ -949,6 +949,8 @@ static int vsock_shutdown(struct socket *sock, int mode) } } +out: + release_sock(sk); return err; } @@ -997,9 +999,12 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock->type == SOCK_STREAM) { - const struct vsock_transport *transport = vsk->transport; + const struct vsock_transport *transport; + lock_sock(sk); + transport = vsk->transport; + /* Listening sockets that have connections in their accept * queue can be read. */ @@ -1082,10 +1087,11 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = 0; sk = sock->sk; vsk = vsock_sk(sk); - transport = vsk->transport; lock_sock(sk); + transport = vsk->transport; + err = vsock_auto_bind(vsk); if (err) goto out; @@ -1214,7 +1220,7 @@ static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { const struct vsock_transport *transport = vsk->transport; - if (!transport->cancel_pkt) + if (!transport || !transport->cancel_pkt) return -EOPNOTSUPP; return transport->cancel_pkt(vsk); @@ -1224,7 +1230,6 @@ static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; - int cancel = 0; vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); @@ -1235,11 +1240,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); - cancel = 1; + vsock_transport_cancel_pkt(vsk); } release_sock(sk); - if (cancel) - vsock_transport_cancel_pkt(vsk); sock_put(sk); } @@ -1546,10 +1549,11 @@ static int vsock_stream_setsockopt(struct socket *sock, err = 0; sk = sock->sk; vsk = vsock_sk(sk); - transport = vsk->transport; lock_sock(sk); + transport = vsk->transport; + switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); @@ -1682,7 +1686,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, sk = sock->sk; vsk = vsock_sk(sk); - transport = vsk->transport; total_written = 0; err = 0; @@ -1691,6 +1694,8 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); + transport = vsk->transport; + /* Callers should not provide a destination with stream sockets. */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; @@ -1825,11 +1830,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sk = sock->sk; vsk = vsock_sk(sk); - transport = vsk->transport; err = 0; lock_sock(sk); + transport = vsk->transport; + if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. 
Differentiate between that case and when a diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 92a56803d62bfb..9e98fe9c66cb06 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -515,14 +515,10 @@ static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode) static int hvs_shutdown(struct vsock_sock *vsk, int mode) { - struct sock *sk = sk_vsock(vsk); - if (!(mode & SEND_SHUTDOWN)) return 0; - lock_sock(sk); hvs_shutdown_lock_held(vsk->trans, mode); - release_sock(sk); return 0; } diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index e5db5d4ea4138f..5af957d0b1fefe 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -218,7 +218,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ /* Virtualization flags: Linux defined, word 8 */ diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index 54cdefdfb49de2..d59f3eb67c8f55 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -76,10 +76,8 @@ void set_default_state(struct kvm_nested_state *state) void set_default_vmx_state(struct kvm_nested_state *state, int size) { memset(state, 0, size); - state->flags = KVM_STATE_NESTED_GUEST_MODE | - KVM_STATE_NESTED_RUN_PENDING; if (have_evmcs) - state->flags |= KVM_STATE_NESTED_EVMCS; + state->flags = KVM_STATE_NESTED_EVMCS; state->format = 0; state->size = size; state->hdr.vmx.vmxon_pa = 0x1000; @@ -148,6 +146,11 @@ void test_vmx_nested_state(struct kvm_vm *vm) state->hdr.vmx.smm.flags = 1; test_nested_state_expect_einval(vm, state); + /* Invalid flags are rejected. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vm, state); + /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = -1ull; @@ -185,20 +188,41 @@ void test_vmx_nested_state(struct kvm_vm *vm) state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; test_nested_state_expect_einval(vm, state); - /* Size must be large enough to fit kvm_nested_state and vmcs12. */ + /* + * Size must be large enough to fit kvm_nested_state and vmcs12 + * if VMCS12 physical address is set + */ set_default_vmx_state(state, state_sz); state->size = sizeof(*state); + state->flags = 0; + test_nested_state_expect_einval(vm, state); + + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; test_nested_state(vm, state); - /* vmxon_pa cannot be the same address as vmcs_pa. */ + /* + * KVM_SET_NESTED_STATE succeeds with invalid VMCS + * contents but L2 not running. + */ set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 0; - state->hdr.vmx.vmcs12_pa = 0; + state->flags = 0; + test_nested_state(vm, state); + + /* Invalid flags are rejected, even if no VMCS loaded. 
*/ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + state->hdr.vmx.flags = ~0; test_nested_state_expect_einval(vm, state); - /* The revision id for vmcs12 must be VMCS12_REVISION. */ + /* vmxon_pa cannot be the same address as vmcs_pa. */ set_default_vmx_state(state, state_sz); - set_revision_id_for_vmcs12(state, 0); + state->hdr.vmx.vmxon_pa = 0; + state->hdr.vmx.vmcs12_pa = 0; test_nested_state_expect_einval(vm, state); /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 51572037eb0cc5..b5b16eacda35c8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1239,6 +1239,7 @@ int __kvm_set_memory_region(struct kvm *kvm, return -EINVAL; /* We can read the guest memory with __xxx_user() later on. */ if ((mem->userspace_addr & (PAGE_SIZE - 1)) || + (mem->userspace_addr != untagged_addr(mem->userspace_addr)) || !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size)) return -EINVAL;
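
The kvm_main.c hunk above tightens __kvm_set_memory_region() so that a memslot whose userspace_addr still carries an architecture address tag (for example arm64 top-byte tags or SPARC ADI) is rejected with -EINVAL. Below is a minimal userspace sketch of the caller-side consequence; it is illustrative only, not part of the patch, and it assumes a VM file descriptor vm_fd already obtained through KVM_CREATE_VM.

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Register anonymous memory as guest RAM at guest physical address gpa.
 * The address handed to KVM must be the plain (untagged) pointer returned
 * by mmap(); passing a tagged alias of the same mapping now fails with
 * EINVAL because of the untagged_addr() check added above.
 */
static int add_memslot(int vm_fd, uint32_t slot, uint64_t gpa, uint64_t size)
{
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED)
		return -1;

	struct kvm_userspace_memory_region region = {
		.slot            = slot,
		.flags           = 0,
		.guest_phys_addr = gpa,
		.memory_size     = size,
		.userspace_addr  = (uintptr_t)mem,	/* untagged address */
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}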