This file allows to turn off the disk entropy contribution. Default
value of this file is '1'(on).
+dax (RO)
+--------
+This file indicates whether the device supports Direct Access (DAX),
+used by CPU-addressable storage to bypass the pagecache. It shows '1'
+if true, '0' if not.
+
discard_granularity (RO)
-----------------------
This shows the size of internal allocation of the device in bytes, if
-------------------
This is the hardware sector size of the device, in bytes.
+io_poll (RW)
+------------
+When read, this file shows the total number of block IO polls and how
+many returned success. Writing '0' to this file will disable polling
+for this device. Writing any non-zero value will enable this feature.
+
iostats (RW)
-------------
This file is used to control (on/off) the iostats accounting of the
setting from "write back" to "write through", since that will also
eliminate cache flushes issued by the kernel.
+write_same_max_bytes (RO)
+-------------------------
+This is the number of bytes the device can write in a single write-same
+command. A value of '0' means write-same is not supported by this
+device.
+
Jens Axboe <jens.axboe@oracle.com>, February 2009
switch (ioctl) {
case KVM_CREATE_IRQCHIP: {
+ int ret;
if (!vgic_present)
return -ENXIO;
- return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+ mutex_lock(&kvm->lock);
+ ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+ mutex_unlock(&kvm->lock);
+ return ret;
}
case KVM_ARM_SET_DEVICE_ADDR: {
struct kvm_arm_device_addr dev_addr;
-# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_AUDIT=y
CONFIG_LOG_BUF_SHIFT=14
CONFIG_MEMCG=y
CONFIG_MEMCG_SWAP=y
+CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_PIDS=y
CONFIG_CGROUP_HUGETLB=y
-# CONFIG_UTS_NS is not set
-# CONFIG_IPC_NS is not set
-# CONFIG_NET_NS is not set
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_USER_NS=y
CONFIG_SCHED_AUTOGROUP=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_KALLSYMS_ALL=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_CMA=y
+CONFIG_SECCOMP=y
CONFIG_XEN=y
CONFIG_KEXEC=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
-# CONFIG_IPV6 is not set
+CONFIG_IPV6=m
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
+CONFIG_NETFILTER_XT_TARGET_LOG=m
+CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NF_CONNTRACK_IPV4=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_NF_CONNTRACK_IPV6=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_NAT=m
+CONFIG_IP6_NF_TARGET_MASQUERADE=m
+CONFIG_BRIDGE=m
+CONFIG_BRIDGE_VLAN_FILTERING=y
+CONFIG_VLAN_8021Q=m
+CONFIG_VLAN_8021Q_GVRP=y
+CONFIG_VLAN_8021Q_MVRP=y
CONFIG_BPF_JIT=y
CONFIG_CFG80211=m
CONFIG_MAC80211=m
CONFIG_MTD_M25P80=y
CONFIG_MTD_SPI_NOR=y
CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_NBD=m
CONFIG_VIRTIO_BLK=y
CONFIG_SRAM=y
# CONFIG_SCSI_PROC_FS is not set
CONFIG_PATA_PLATFORM=y
CONFIG_PATA_OF_PLATFORM=y
CONFIG_NETDEVICES=y
+CONFIG_MACVLAN=m
+CONFIG_MACVTAP=m
CONFIG_TUN=y
+CONFIG_VETH=m
CONFIG_VIRTIO_NET=y
CONFIG_AMD_XGBE=y
CONFIG_NET_XGENE=y
CONFIG_PWM_SAMSUNG=y
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_POSIX_ACL=y
CONFIG_FANOTIFY=y
CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
CONFIG_QUOTA=y
CONFIG_AUTOFS4_FS=y
-CONFIG_FUSE_FS=y
-CONFIG_CUSE=y
+CONFIG_FUSE_FS=m
+CONFIG_CUSE=m
+CONFIG_OVERLAY_FS=m
CONFIG_VFAT_FS=y
CONFIG_TMPFS=y
CONFIG_HUGETLBFS=y
#define __ARCH_WANT_KPROBES_INSN_SLOT
#define MAX_INSN_SIZE 1
-#define MAX_STACK_SIZE 128
#define flush_insn_slot(p) do { } while (0)
#define kretprobe_blacklist_size 0
struct prev_kprobe prev_kprobe;
struct kprobe_step_ctx ss_ctx;
struct pt_regs jprobe_saved_regs;
- char jprobes_stack[MAX_STACK_SIZE];
};
void arch_remove_kprobe(struct kprobe *);
lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class
cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1
b.eq el1_da
+ cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1
+ b.eq el1_ia
cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
b.eq el1_undef
cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1
b.ge el1_dbg
b el1_inv
+
+el1_ia:
+ /*
+ * Fall through to the Data abort case
+ */
el1_da:
/*
* Data abort handling
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/suspend.h>
+#include <asm/sysreg.h>
#include <asm/virt.h>
/*
set_pte(pte, __pte(virt_to_phys((void *)dst) |
pgprot_val(PAGE_KERNEL_EXEC)));
- /* Load our new page tables */
- asm volatile("msr ttbr0_el1, %0;"
- "isb;"
- "tlbi vmalle1is;"
- "dsb ish;"
- "isb" : : "r"(virt_to_phys(pgd)));
+ /*
+ * Load our new page tables. A strict BBM approach requires that we
+ * ensure that TLBs are free of any entries that may overlap with the
+ * global mappings we are about to install.
+ *
+ * For a real hibernate/resume cycle TTBR0 currently points to a zero
+ * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI
+ * runtime services), while for a userspace-driven test_resume cycle it
+ * points to userspace page tables (and we must point it at a zero page
+ * ourselves). Elsewhere we only (un)install the idmap with preemption
+ * disabled, so T0SZ should be as required regardless.
+ */
+ cpu_set_reserved_ttbr0();
+ local_flush_tlb_all();
+ write_sysreg(virt_to_phys(pgd), ttbr0_el1);
+ isb();
*phys_dst_addr = virt_to_phys((void *)dst);
void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *,
void *, phys_addr_t, phys_addr_t);
+ /*
+ * Restoring the memory image will overwrite the ttbr1 page tables.
+ * Create a second copy of just the linear map, and use this when
+ * restoring.
+ */
+ tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
+ if (!tmp_pg_dir) {
+ pr_err("Failed to allocate memory for temporary page tables.");
+ rc = -ENOMEM;
+ goto out;
+ }
+ rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0);
+ if (rc)
+ goto out;
+
+ /*
+ * Since we only copied the linear map, we need to find restore_pblist's
+ * linear map address.
+ */
+ lm_restore_pblist = LMADDR(restore_pblist);
+
+ /*
+ * We need a zero page that is zero before & after resume in order to
+ * to break before make on the ttbr1 page tables.
+ */
+ zero_page = (void *)get_safe_page(GFP_ATOMIC);
+ if (!zero_page) {
+ pr_err("Failed to allocate zero page.");
+ rc = -ENOMEM;
+ goto out;
+ }
+
/*
* Locate the exit code in the bottom-but-one page, so that *NULL
* still has disastrous affects.
*/
__flush_dcache_area(hibernate_exit, exit_size);
- /*
- * Restoring the memory image will overwrite the ttbr1 page tables.
- * Create a second copy of just the linear map, and use this when
- * restoring.
- */
- tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
- if (!tmp_pg_dir) {
- pr_err("Failed to allocate memory for temporary page tables.");
- rc = -ENOMEM;
- goto out;
- }
- rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0);
- if (rc)
- goto out;
-
- /*
- * Since we only copied the linear map, we need to find restore_pblist's
- * linear map address.
- */
- lm_restore_pblist = LMADDR(restore_pblist);
-
/*
* KASLR will cause the el2 vectors to be in a different location in
* the resumed kernel. Load hibernate's temporary copy into el2.
__hyp_set_vectors(el2_vectors);
}
- /*
- * We need a zero page that is zero before & after resume in order to
- * to break before make on the ttbr1 page tables.
- */
- zero_page = (void *)get_safe_page(GFP_ATOMIC);
-
hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1,
resume_hdr.reenter_kernel, lm_restore_pblist,
resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page));
static void __kprobes
post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *);
-static inline unsigned long min_stack_size(unsigned long addr)
-{
- unsigned long size;
-
- if (on_irq_stack(addr, raw_smp_processor_id()))
- size = IRQ_STACK_PTR(raw_smp_processor_id()) - addr;
- else
- size = (unsigned long)current_thread_info() + THREAD_START_SP - addr;
-
- return min(size, FIELD_SIZEOF(struct kprobe_ctlblk, jprobes_stack));
-}
-
static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
{
/* prepare insn slot */
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- long stack_ptr = kernel_stack_pointer(regs);
kcb->jprobe_saved_regs = *regs;
/*
- * As Linus pointed out, gcc assumes that the callee
- * owns the argument space and could overwrite it, e.g.
- * tailcall optimization. So, to be absolutely safe
- * we also save and restore enough stack bytes to cover
- * the argument area.
+ * Since we can't be sure where in the stack frame "stacked"
+ * pass-by-value arguments are stored we just don't try to
+ * duplicate any of the stack. Do not use jprobes on functions that
+ * use more than 64 bytes (after padding each to an 8 byte boundary)
+ * of arguments, or pass individual arguments larger than 16 bytes.
*/
- kasan_disable_current();
- memcpy(kcb->jprobes_stack, (void *)stack_ptr,
- min_stack_size(stack_ptr));
- kasan_enable_current();
instruction_pointer_set(regs, (unsigned long) jp->entry);
preempt_disable();
}
unpause_graph_tracing();
*regs = kcb->jprobe_saved_regs;
- kasan_disable_current();
- memcpy((void *)stack_addr, kcb->jprobes_stack,
- min_stack_size(stack_addr));
- kasan_enable_current();
preempt_enable_no_resched();
return 1;
}
acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
acpi_parse_gic_cpu_interface, 0);
- if (cpu_count > NR_CPUS)
- pr_warn("no. of cores (%d) greater than configured maximum of %d - clipping\n",
- cpu_count, NR_CPUS);
+ if (cpu_count > nr_cpu_ids)
+ pr_warn("Number of cores (%d) exceeds configured maximum of %d - clipping\n",
+ cpu_count, nr_cpu_ids);
if (!bootcpu_valid) {
pr_err("missing boot CPU MPIDR, not enabling secondaries\n");
* with entries in cpu_logical_map while initializing the cpus.
* If the cpu set-up fails, invalidate the cpu_logical_map entry.
*/
- for (i = 1; i < NR_CPUS; i++) {
+ for (i = 1; i < nr_cpu_ids; i++) {
if (cpu_logical_map(i) != INVALID_HWID) {
if (smp_cpu_setup(i))
cpu_logical_map(i) = INVALID_HWID;
}
#endif
+static bool is_el1_instruction_abort(unsigned int esr)
+{
+ return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
+}
+
/*
* The kernel tried to access some page that wasn't present.
*/
{
/*
* Are we prepared to handle this kernel fault?
+ * We are almost certainly not prepared to handle instruction faults.
*/
- if (fixup_exception(regs))
+ if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
return;
/*
unsigned int ec = ESR_ELx_EC(esr);
unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
- return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM);
+ return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) ||
+ (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM);
}
static bool is_el0_instruction_abort(unsigned int esr)
if (regs->orig_addr_limit == KERNEL_DS)
die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
+ if (is_el1_instruction_abort(esr))
+ die("Attempting to execute userspace memory", regs, esr);
+
if (!search_exception_tables(regs->pc))
die("Accessing user space memory outside uaccess.h routines", regs, esr);
}
preempt_disable();
if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
- if (kvm_mips_host_tlb_lookup(vcpu, va) < 0)
- kvm_mips_handle_kseg0_tlb_fault(va, vcpu);
+ if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 &&
+ kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) {
+ kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n",
+ __func__, va, vcpu, read_c0_entryhi());
+ er = EMULATE_FAIL;
+ preempt_enable();
+ goto done;
+ }
} else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) ||
KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
int index;
run, vcpu);
preempt_enable();
goto dont_update_pc;
- } else {
- /*
- * We fault an entry from the guest tlb to the
- * shadow host TLB
- */
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
+ }
+ /*
+ * We fault an entry from the guest tlb to the
+ * shadow host TLB
+ */
+ if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
+ kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
+ __func__, va, index, vcpu,
+ read_c0_entryhi());
+ er = EMULATE_FAIL;
+ preempt_enable();
+ goto done;
}
}
} else {
* OK we have a Guest TLB entry, now inject it into the
* shadow host TLB
*/
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
+ if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
+ kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
+ __func__, va, index, vcpu,
+ read_c0_entryhi());
+ er = EMULATE_FAIL;
+ }
}
}
}
gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
- if (gfn >= kvm->arch.guest_pmap_npages) {
+ if ((gfn | 1) >= kvm->arch.guest_pmap_npages) {
kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
gfn, badvaddr);
kvm_mips_dump_host_tlbs();
unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
struct kvm *kvm = vcpu->kvm;
kvm_pfn_t pfn0, pfn1;
+ gfn_t gfn0, gfn1;
+ long tlb_lo[2];
int ret;
- if ((tlb->tlb_hi & VPN2_MASK) == 0) {
- pfn0 = 0;
- pfn1 = 0;
- } else {
- if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0])
- >> PAGE_SHIFT) < 0)
- return -1;
-
- if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1])
- >> PAGE_SHIFT) < 0)
- return -1;
-
- pfn0 = kvm->arch.guest_pmap[
- mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT];
- pfn1 = kvm->arch.guest_pmap[
- mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT];
+ tlb_lo[0] = tlb->tlb_lo[0];
+ tlb_lo[1] = tlb->tlb_lo[1];
+
+ /*
+ * The commpage address must not be mapped to anything else if the guest
+ * TLB contains entries nearby, or commpage accesses will break.
+ */
+ if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) &
+ VPN2_MASK & (PAGE_MASK << 1)))
+ tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0;
+
+ gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT;
+ gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT;
+ if (gfn0 >= kvm->arch.guest_pmap_npages ||
+ gfn1 >= kvm->arch.guest_pmap_npages) {
+ kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n",
+ __func__, gfn0, gfn1, tlb->tlb_hi);
+ kvm_mips_dump_guest_tlbs(vcpu);
+ return -1;
}
+ if (kvm_mips_map_page(kvm, gfn0) < 0)
+ return -1;
+
+ if (kvm_mips_map_page(kvm, gfn1) < 0)
+ return -1;
+
+ pfn0 = kvm->arch.guest_pmap[gfn0];
+ pfn1 = kvm->arch.guest_pmap[gfn1];
+
/* Get attributes from the Guest TLB */
entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
- (tlb->tlb_lo[0] & ENTRYLO_D) |
- (tlb->tlb_lo[0] & ENTRYLO_V);
+ (tlb_lo[0] & ENTRYLO_D) |
+ (tlb_lo[0] & ENTRYLO_V);
entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
- (tlb->tlb_lo[1] & ENTRYLO_D) |
- (tlb->tlb_lo[1] & ENTRYLO_V);
+ (tlb_lo[1] & ENTRYLO_D) |
+ (tlb_lo[1] & ENTRYLO_V);
kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
tlb->tlb_lo[0], tlb->tlb_lo[1]);
local_irq_restore(flags);
return KVM_INVALID_INST;
}
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
- &vcpu->arch.
- guest_tlb[index]);
+ if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
+ &vcpu->arch.guest_tlb[index])) {
+ kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n",
+ __func__, opc, index, vcpu,
+ read_c0_entryhi());
+ kvm_mips_dump_guest_tlbs(vcpu);
+ local_irq_restore(flags);
+ return KVM_INVALID_INST;
+ }
inst = *(opc);
}
local_irq_restore(flags);
xics->kvm = kvm;
/* Already there ? */
- mutex_lock(&kvm->lock);
if (kvm->arch.xics)
ret = -EEXIST;
else
kvm->arch.xics = xics;
- mutex_unlock(&kvm->lock);
if (ret) {
kfree(xics);
return ret;
}
- xics_debugfs_init(xics);
-
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
if (cpu_has_feature(CPU_FTR_ARCH_206)) {
/* Enable real mode support */
return 0;
}
+static void kvmppc_xics_init(struct kvm_device *dev)
+{
+ struct kvmppc_xics *xics = (struct kvmppc_xics *)dev->private;
+
+ xics_debugfs_init(xics);
+}
+
struct kvm_device_ops kvm_xics_ops = {
.name = "kvm-xics",
.create = kvmppc_xics_create,
+ .init = kvmppc_xics_init,
.destroy = kvmppc_xics_free,
.set_attr = xics_set_attr,
.get_attr = xics_get_attr,
KVM_SYNC_CRS |
KVM_SYNC_ARCH0 |
KVM_SYNC_PFAULT;
+ kvm_s390_set_prefix(vcpu, 0);
if (test_kvm_facility(vcpu->kvm, 64))
vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
/* fprs can be synchronized via vrs, even if the guest has no vx. With
rc = gmap_mprotect_notify(vcpu->arch.gmap,
kvm_s390_get_prefix(vcpu),
PAGE_SIZE * 2, PROT_WRITE);
- if (rc)
+ if (rc) {
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
return rc;
+ }
goto retry;
}
reinit_completion(&dev->ioq_wait);
retry:
timeout = ADMIN_TIMEOUT;
- for (; i > 0; i--) {
- struct nvme_queue *nvmeq = dev->queues[i];
-
- if (!pass)
- nvme_suspend_queue(nvmeq);
- if (nvme_delete_queue(nvmeq, opcode))
+ for (; i > 0; i--, sent++)
+ if (nvme_delete_queue(dev->queues[i], opcode))
break;
- ++sent;
- }
+
while (sent--) {
timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
if (timeout == 0)
nvme_stop_queues(&dev->ctrl);
csts = readl(dev->bar + NVME_REG_CSTS);
}
+
+ for (i = dev->queue_count - 1; i > 0; i--)
+ nvme_suspend_queue(dev->queues[i]);
+
if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
- for (i = dev->queue_count - 1; i >= 0; i--) {
- struct nvme_queue *nvmeq = dev->queues[i];
- nvme_suspend_queue(nvmeq);
- }
+ nvme_suspend_queue(dev->queues[0]);
} else {
nvme_disable_io_queues(dev);
nvme_disable_admin_queue(dev, shutdown);
* more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/delay.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/string.h>
-#include <linux/jiffies.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <linux/nvme.h>
-#include <linux/t10-pi.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
-static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl);
/* XXX: really should move to a generic header sooner or later.. */
static inline void put_unaligned_le24(u32 val, u8 *p)
list_del(&ctrl->list);
mutex_unlock(&nvme_rdma_ctrl_mutex);
- if (ctrl->ctrl.tagset) {
- blk_cleanup_queue(ctrl->ctrl.connect_q);
- blk_mq_free_tag_set(&ctrl->tag_set);
- nvme_rdma_dev_put(ctrl->device);
- }
kfree(ctrl->queues);
nvmf_free_options(nctrl->opts);
free_ctrl:
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
- if (ctrl->queue_count > 1)
+ if (ctrl->queue_count > 1) {
nvme_start_queues(&ctrl->ctrl);
+ nvme_queue_scan(&ctrl->ctrl);
+ nvme_queue_async_events(&ctrl->ctrl);
+ }
dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
{
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
struct rdma_conn_param param = { };
- struct nvme_rdma_cm_req priv;
+ struct nvme_rdma_cm_req priv = { };
int ret;
param.qp_num = queue->qp->qp_num;
* that caught the event. Since we hold the callout until the controller
* deletion is completed, we'll deadlock if the controller deletion will
* call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership
- * of destroying this queue before-hand, destroy the queue resources
- * after the controller deletion completed with the exception of destroying
- * the cm_id implicitely by returning a non-zero rc to the callout.
+ * of destroying this queue before-hand, destroy the queue resources,
+ * then queue the controller deletion which won't destroy this queue and
+ * we destroy the cm_id implicitely by returning a non-zero rc to the callout.
*/
static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue)
{
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
- int ret, ctrl_deleted = 0;
+ int ret;
- /* First disable the queue so ctrl delete won't free it */
- if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
- goto out;
+ /* Own the controller deletion */
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
+ return 0;
- /* delete the controller */
- ret = __nvme_rdma_del_ctrl(ctrl);
- if (!ret) {
- dev_warn(ctrl->ctrl.device,
- "Got rdma device removal event, deleting ctrl\n");
- flush_work(&ctrl->delete_work);
+ dev_warn(ctrl->ctrl.device,
+ "Got rdma device removal event, deleting ctrl\n");
- /* Return non-zero so the cm_id will destroy implicitly */
- ctrl_deleted = 1;
+ /* Get rid of reconnect work if its running */
+ cancel_delayed_work_sync(&ctrl->reconnect_work);
+ /* Disable the queue so ctrl delete won't free it */
+ if (test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) {
/* Free this queue ourselves */
- rdma_disconnect(queue->cm_id);
- ib_drain_qp(queue->qp);
+ nvme_rdma_stop_queue(queue);
nvme_rdma_destroy_queue_ib(queue);
+
+ /* Return non-zero so the cm_id will destroy implicitly */
+ ret = 1;
}
-out:
- return ctrl_deleted;
+ /* Queue controller deletion */
+ queue_work(nvme_rdma_wq, &ctrl->delete_work);
+ flush_work(&ctrl->delete_work);
+ return ret;
}
static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
nvme_rdma_free_io_queues(ctrl);
}
- if (ctrl->ctrl.state == NVME_CTRL_LIVE)
+ if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags))
nvme_shutdown_ctrl(&ctrl->ctrl);
blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl);
}
+static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
+{
+ nvme_uninit_ctrl(&ctrl->ctrl);
+ if (shutdown)
+ nvme_rdma_shutdown_ctrl(ctrl);
+
+ if (ctrl->ctrl.tagset) {
+ blk_cleanup_queue(ctrl->ctrl.connect_q);
+ blk_mq_free_tag_set(&ctrl->tag_set);
+ nvme_rdma_dev_put(ctrl->device);
+ }
+
+ nvme_put_ctrl(&ctrl->ctrl);
+}
+
static void nvme_rdma_del_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl = container_of(work,
struct nvme_rdma_ctrl, delete_work);
- nvme_remove_namespaces(&ctrl->ctrl);
- nvme_rdma_shutdown_ctrl(ctrl);
- nvme_uninit_ctrl(&ctrl->ctrl);
- nvme_put_ctrl(&ctrl->ctrl);
+ __nvme_rdma_remove_ctrl(ctrl, true);
}
static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
struct nvme_rdma_ctrl *ctrl = container_of(work,
struct nvme_rdma_ctrl, delete_work);
- nvme_remove_namespaces(&ctrl->ctrl);
- nvme_uninit_ctrl(&ctrl->ctrl);
- nvme_put_ctrl(&ctrl->ctrl);
+ __nvme_rdma_remove_ctrl(ctrl, false);
}
static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
if (ctrl->queue_count > 1) {
nvme_start_queues(&ctrl->ctrl);
nvme_queue_scan(&ctrl->ctrl);
+ nvme_queue_async_events(&ctrl->ctrl);
}
return;
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
-#include <linux/random.h>
#include <generated/utsrelease.h>
#include "nvmet.h"
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvme_id_ctrl *id;
- u64 serial;
u16 status = 0;
id = kzalloc(sizeof(*id), GFP_KERNEL);
id->vid = 0;
id->ssvid = 0;
- /* generate a random serial number as our controllers are ephemeral: */
- get_random_bytes(&serial, sizeof(serial));
memset(id->sn, ' ', sizeof(id->sn));
- snprintf(id->sn, sizeof(id->sn), "%llx", serial);
+ snprintf(id->sn, sizeof(id->sn), "%llx", ctrl->serial);
memset(id->mn, ' ', sizeof(id->mn));
strncpy((char *)id->mn, "Linux", sizeof(id->mn));
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
+#include <linux/random.h>
#include "nvmet.h"
static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
+ /* generate a random serial number as our controllers are ephemeral: */
+ get_random_bytes(&ctrl->serial, sizeof(ctrl->serial));
+
kref_init(&ctrl->ref);
ctrl->subsys = subsys;
struct nvme_loop_ctrl *ctrl = container_of(work,
struct nvme_loop_ctrl, delete_work);
- nvme_remove_namespaces(&ctrl->ctrl);
- nvme_loop_shutdown_ctrl(ctrl);
nvme_uninit_ctrl(&ctrl->ctrl);
+ nvme_loop_shutdown_ctrl(ctrl);
nvme_put_ctrl(&ctrl->ctrl);
}
nvme_loop_destroy_admin_queue(ctrl);
out_disable:
dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
- nvme_remove_namespaces(&ctrl->ctrl);
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
}
struct mutex lock;
u64 cap;
+ u64 serial;
u32 cc;
u32 csts;
NVMET_RDMA_Q_CONNECTING,
NVMET_RDMA_Q_LIVE,
NVMET_RDMA_Q_DISCONNECTING,
+ NVMET_RDMA_IN_DEVICE_REMOVAL,
};
struct nvmet_rdma_queue {
if (!len)
return 0;
- /* use the already allocated data buffer if possible */
- if (len <= NVMET_RDMA_INLINE_DATA_SIZE && rsp->queue->host_qid) {
- nvmet_rdma_use_inline_sg(rsp, len, 0);
- } else {
- status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
- len);
- if (status)
- return status;
- }
+ status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
+ len);
+ if (status)
+ return status;
ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
struct nvmet_rdma_device *dev = queue->dev;
nvmet_rdma_free_queue(queue);
- rdma_destroy_id(cm_id);
+
+ if (queue->state != NVMET_RDMA_IN_DEVICE_REMOVAL)
+ rdma_destroy_id(cm_id);
+
kref_put(&dev->ref, nvmet_rdma_free_dev);
}
switch (queue->state) {
case NVMET_RDMA_Q_CONNECTING:
case NVMET_RDMA_Q_LIVE:
- disconnect = true;
queue->state = NVMET_RDMA_Q_DISCONNECTING;
+ case NVMET_RDMA_IN_DEVICE_REMOVAL:
+ disconnect = true;
break;
case NVMET_RDMA_Q_DISCONNECTING:
break;
schedule_work(&queue->release_work);
}
+/**
+ * nvme_rdma_device_removal() - Handle RDMA device removal
+ * @queue: nvmet rdma queue (cm id qp_context)
+ * @addr: nvmet address (cm_id context)
+ *
+ * DEVICE_REMOVAL event notifies us that the RDMA device is about
+ * to unplug so we should take care of destroying our RDMA resources.
+ * This event will be generated for each allocated cm_id.
+ *
+ * Note that this event can be generated on a normal queue cm_id
+ * and/or a device bound listener cm_id (where in this case
+ * queue will be null).
+ *
+ * we claim ownership on destroying the cm_id. For queues we move
+ * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port
+ * we nullify the priv to prevent double cm_id destruction and destroying
+ * the cm_id implicitely by returning a non-zero rc to the callout.
+ */
+static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
+ struct nvmet_rdma_queue *queue)
+{
+ unsigned long flags;
+
+ if (!queue) {
+ struct nvmet_port *port = cm_id->context;
+
+ /*
+ * This is a listener cm_id. Make sure that
+ * future remove_port won't invoke a double
+ * cm_id destroy. use atomic xchg to make sure
+ * we don't compete with remove_port.
+ */
+ if (xchg(&port->priv, NULL) != cm_id)
+ return 0;
+ } else {
+ /*
+ * This is a queue cm_id. Make sure that
+ * release queue will not destroy the cm_id
+ * and schedule all ctrl queues removal (only
+ * if the queue is not disconnecting already).
+ */
+ spin_lock_irqsave(&queue->state_lock, flags);
+ if (queue->state != NVMET_RDMA_Q_DISCONNECTING)
+ queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL;
+ spin_unlock_irqrestore(&queue->state_lock, flags);
+ nvmet_rdma_queue_disconnect(queue);
+ flush_scheduled_work();
+ }
+
+ /*
+ * We need to return 1 so that the core will destroy
+ * it's own ID. What a great API design..
+ */
+ return 1;
+}
+
static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
break;
case RDMA_CM_EVENT_ADDR_CHANGE:
case RDMA_CM_EVENT_DISCONNECTED:
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
- /*
- * We can get the device removal callback even for a
- * CM ID that we aren't actually using. In that case
- * the context pointer is NULL, so we shouldn't try
- * to disconnect a non-existing queue. But we also
- * need to return 1 so that the core will destroy
- * it's own ID. What a great API design..
- */
- if (queue)
- nvmet_rdma_queue_disconnect(queue);
- else
- ret = 1;
+ nvmet_rdma_queue_disconnect(queue);
+ break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ ret = nvmet_rdma_device_removal(cm_id, queue);
break;
case RDMA_CM_EVENT_REJECTED:
case RDMA_CM_EVENT_UNREACHABLE:
static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
- struct rdma_cm_id *cm_id = port->priv;
+ struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
- rdma_destroy_id(cm_id);
+ if (cm_id)
+ rdma_destroy_id(cm_id);
}
static struct nvmet_fabrics_ops nvmet_rdma_ops = {
return 0;
}
-static DEFINE_MUTEX(arm_pmu_mutex);
+static DEFINE_SPINLOCK(arm_pmu_lock);
static LIST_HEAD(arm_pmu_list);
/*
{
struct arm_pmu *pmu;
- mutex_lock(&arm_pmu_mutex);
+ spin_lock(&arm_pmu_lock);
list_for_each_entry(pmu, &arm_pmu_list, entry) {
if (!cpumask_test_cpu(cpu, &pmu->supported_cpus))
if (pmu->reset)
pmu->reset(pmu);
}
- mutex_unlock(&arm_pmu_mutex);
+ spin_unlock(&arm_pmu_lock);
return 0;
}
if (!cpu_hw_events)
return -ENOMEM;
- mutex_lock(&arm_pmu_mutex);
+ spin_lock(&arm_pmu_lock);
list_add_tail(&cpu_pmu->entry, &arm_pmu_list);
- mutex_unlock(&arm_pmu_mutex);
+ spin_unlock(&arm_pmu_lock);
err = cpu_pm_pmu_register(cpu_pmu);
if (err)
return 0;
out_unregister:
- mutex_lock(&arm_pmu_mutex);
+ spin_lock(&arm_pmu_lock);
list_del(&cpu_pmu->entry);
- mutex_unlock(&arm_pmu_mutex);
+ spin_unlock(&arm_pmu_lock);
free_percpu(cpu_hw_events);
return err;
}
static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
{
cpu_pm_pmu_unregister(cpu_pmu);
- mutex_lock(&arm_pmu_mutex);
+ spin_lock(&arm_pmu_lock);
list_del(&cpu_pmu->entry);
- mutex_unlock(&arm_pmu_mutex);
+ spin_unlock(&arm_pmu_lock);
free_percpu(cpu_pmu->hw_events);
}
/* If we didn't manage to parse anything, try the interrupt affinity */
if (cpumask_weight(&pmu->supported_cpus) == 0) {
- if (!using_spi) {
+ int irq = platform_get_irq(pdev, 0);
+
+ if (irq_is_percpu(irq)) {
/* If using PPIs, check the affinity of the partition */
- int ret, irq;
+ int ret;
- irq = platform_get_irq(pdev, 0);
ret = irq_get_percpu_devid_partition(irq, &pmu->supported_cpus);
if (ret) {
kfree(irqs);
{
struct backing_dev_info *bdi;
+ /*
+ * If we are expecting writeback progress we must submit plugged IO.
+ */
+ if (blk_needs_flush_plug(current))
+ blk_schedule_flush_plug(current);
+
if (!nr_pages)
nr_pages = get_nr_dirty_pages();
"Attempted to advance past end of bvec iter\n");
while (bytes) {
- unsigned len = min(bytes, bvec_iter_len(bv, *iter));
+ unsigned iter_len = bvec_iter_len(bv, *iter);
+ unsigned len = min(bytes, iter_len);
bytes -= len;
iter->bi_size -= len;
/* create, destroy, and name are mandatory */
struct kvm_device_ops {
const char *name;
+
+ /*
+ * create is called holding kvm->lock and any operations not suitable
+ * to do while holding the lock should be deferred to init (see
+ * below).
+ */
int (*create)(struct kvm_device *dev, u32 type);
+ /*
+ * init is called after create if create is successful and is called
+ * outside of holding kvm->lock.
+ */
+ void (*init)(struct kvm_device *dev);
+
/*
* Destroy is responsible for freeing dev.
*
int i, vcpu_lock_idx = -1, ret;
struct kvm_vcpu *vcpu;
- mutex_lock(&kvm->lock);
-
- if (irqchip_in_kernel(kvm)) {
- ret = -EEXIST;
- goto out;
- }
+ if (irqchip_in_kernel(kvm))
+ return -EEXIST;
/*
* This function is also called by the KVM_CREATE_IRQCHIP handler,
* the proper checks already.
*/
if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
- !kvm_vgic_global_state.can_emulate_gicv2) {
- ret = -ENODEV;
- goto out;
- }
+ !kvm_vgic_global_state.can_emulate_gicv2)
+ return -ENODEV;
/*
* Any time a vcpu is run, vcpu_load is called which tries to grab the
vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
mutex_unlock(&vcpu->mutex);
}
-
-out:
- mutex_unlock(&kvm->lock);
return ret;
}
{
struct kvm_device *dev, *tmp;
+ /*
+ * We do not need to take the kvm->lock here, because nobody else
+ * has a reference to the struct kvm at this point and therefore
+ * cannot access the devices list anyhow.
+ */
list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
list_del(&dev->vm_node);
dev->ops->destroy(dev);
dev->ops = ops;
dev->kvm = kvm;
+ mutex_lock(&kvm->lock);
ret = ops->create(dev, cd->type);
if (ret < 0) {
+ mutex_unlock(&kvm->lock);
kfree(dev);
return ret;
}
+ list_add(&dev->vm_node, &kvm->devices);
+ mutex_unlock(&kvm->lock);
+
+ if (ops->init)
+ ops->init(dev);
ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
if (ret < 0) {
ops->destroy(dev);
+ mutex_lock(&kvm->lock);
+ list_del(&dev->vm_node);
+ mutex_unlock(&kvm->lock);
return ret;
}
- list_add(&dev->vm_node, &kvm->devices);
kvm_get_kvm(kvm);
cd->fd = ret;
return 0;