Merge branch 'akpm' (patches from Andrew)
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Aug 2016 01:08:07 +0000 (21:08 -0400)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Aug 2016 01:08:07 +0000 (21:08 -0400)
Merge yet more updates from Andrew Morton:

 - the rest of ocfs2

 - various hotfixes, mainly MM

 - quite a bit of misc stuff - drivers, fork, exec, signals, etc.

 - printk updates

 - firmware

 - checkpatch

 - nilfs2

 - more kexec stuff than usual

 - rapidio updates

 - w1 things

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (111 commits)
  ipc: delete "nr_ipc_ns"
  kcov: allow more fine-grained coverage instrumentation
  init/Kconfig: add clarification for out-of-tree modules
  config: add android config fragments
  init/Kconfig: ban CONFIG_LOCALVERSION_AUTO with allmodconfig
  relay: add global mode support for buffer-only channels
  init: allow blacklisting of module_init functions
  w1:omap_hdq: fix regression
  w1: add helper macro module_w1_family
  w1: remove need for ida and use PLATFORM_DEVID_AUTO
  rapidio/switches: add driver for IDT gen3 switches
  powerpc/fsl_rio: apply changes for RIO spec rev 3
  rapidio: modify for rev.3 specification changes
  rapidio: change inbound window size type to u64
  rapidio/idt_gen2: fix locking warning
  rapidio: fix error handling in mbox request/release functions
  rapidio/tsi721_dma: advance queue processing from transfer submit call
  rapidio/tsi721: add messaging mbox selector parameter
  rapidio/tsi721: add PCIe MRRS override parameter
  rapidio/tsi721_dma: add channel mask and queue size parameters
  ...

394 files changed:
.cocciconfig [new file with mode: 0644]
.gitignore
Documentation/PCI/MSI-HOWTO.txt
Documentation/coccinelle.txt
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
Documentation/devicetree/bindings/mtd/atmel-quadspi.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
Documentation/devicetree/bindings/mtd/cadence-quadspi.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mtd/gpmc-nand.txt
Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mtd/mtk-nand.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mtd/sunxi-nand.txt
Documentation/devicetree/bindings/pci/aardvark-pci.txt [new file with mode: 0644]
Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt [new file with mode: 0644]
Documentation/dontdiff
Documentation/filesystems/orangefs.txt
Documentation/gcc-plugins.txt [new file with mode: 0644]
Documentation/kernel-parameters.txt
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/devices/arm-vgic.txt
Documentation/virtual/kvm/devices/vm.txt
Documentation/virtual/kvm/locking.txt
MAINTAINERS
Makefile
arch/Kconfig
arch/alpha/boot/Makefile
arch/arm/Kconfig
arch/arm/include/asm/kvm_asm.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_hyp.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/include/asm/mach/pci.h
arch/arm/include/asm/pgtable.h
arch/arm/include/asm/virt.h
arch/arm/kernel/bios32.c
arch/arm/kvm/Kconfig
arch/arm/kvm/Makefile
arch/arm/kvm/arm.c
arch/arm/kvm/emulate.c
arch/arm/kvm/guest.c
arch/arm/kvm/init.S
arch/arm/kvm/mmu.c
arch/arm/kvm/reset.c
arch/arm64/Kconfig
arch/arm64/boot/dts/marvell/armada-3720-db.dts
arch/arm64/boot/dts/marvell/armada-37xx.dtsi
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/pgtable-hwdef.h
arch/arm64/include/asm/pgtable-prot.h
arch/arm64/include/asm/virt.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/pci.c
arch/arm64/kvm/Kconfig
arch/arm64/kvm/Makefile
arch/arm64/kvm/guest.c
arch/arm64/kvm/hyp-init.S
arch/arm64/kvm/hyp/entry.S
arch/arm64/kvm/hyp/hyp-entry.S
arch/arm64/kvm/hyp/switch.c
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/cris/arch-v10/drivers/axisflashmap.c
arch/cris/arch-v32/drivers/axisflashmap.c
arch/microblaze/include/asm/pci.h
arch/microblaze/pci/pci-common.c
arch/mips/Kconfig
arch/mips/include/asm/addrspace.h
arch/mips/include/asm/kvm_host.h
arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
arch/mips/include/asm/mipsregs.h
arch/mips/include/asm/pci.h
arch/mips/include/asm/setup.h
arch/mips/include/asm/uasm.h
arch/mips/include/uapi/asm/inst.h
arch/mips/kernel/asm-offsets.c
arch/mips/kernel/branch.c
arch/mips/kernel/traps.c
arch/mips/kvm/Kconfig
arch/mips/kvm/Makefile
arch/mips/kvm/commpage.c
arch/mips/kvm/dyntrans.c
arch/mips/kvm/emulate.c
arch/mips/kvm/entry.c [new file with mode: 0644]
arch/mips/kvm/fpu.S
arch/mips/kvm/interrupt.c
arch/mips/kvm/interrupt.h
arch/mips/kvm/locore.S [deleted file]
arch/mips/kvm/mips.c
arch/mips/kvm/mmu.c [new file with mode: 0644]
arch/mips/kvm/stats.c
arch/mips/kvm/tlb.c
arch/mips/kvm/trace.h
arch/mips/kvm/trap_emul.c
arch/mips/math-emu/cp1emu.c
arch/mips/mm/c-r4k.c
arch/mips/mm/uasm-micromips.c
arch/mips/mm/uasm-mips.c
arch/mips/mm/uasm.c
arch/mips/pci/pci.c
arch/powerpc/boot/Makefile
arch/powerpc/include/asm/hmi.h [new file with mode: 0644]
arch/powerpc/include/asm/paca.h
arch/powerpc/include/asm/pci.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/hmi.c [new file with mode: 0644]
arch/powerpc/kernel/idle_book3s.S
arch/powerpc/kernel/pci-common.c
arch/powerpc/kernel/traps.c
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_ras.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/mpic.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/s390/boot/compressed/Makefile
arch/s390/hypfs/hypfs_diag.c
arch/s390/include/asm/cpacf.h
arch/s390/include/asm/diag.h
arch/s390/include/asm/gmap.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/mmu.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/page.h
arch/s390/include/asm/pgalloc.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/processor.h
arch/s390/include/asm/sclp.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/include/uapi/asm/sie.h
arch/s390/kernel/diag.c
arch/s390/kvm/Makefile
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/guestdbg.c
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/kvm/sigp.c
arch/s390/kvm/sthyi.c [new file with mode: 0644]
arch/s390/kvm/trace.h
arch/s390/kvm/vsie.c [new file with mode: 0644]
arch/s390/mm/fault.c
arch/s390/mm/gmap.c
arch/s390/mm/pgalloc.c
arch/s390/mm/pgtable.c
arch/sparc/include/asm/pci_64.h
arch/sparc/kernel/pci.c
arch/um/Kconfig.common
arch/um/Makefile
arch/unicore32/kernel/pci.c
arch/x86/Kconfig
arch/x86/boot/Makefile
arch/x86/entry/vdso/Makefile
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/svm.h
arch/x86/include/asm/virtext.h
arch/x86/kvm/Kconfig
arch/x86/kvm/i8254.c
arch/x86/kvm/iommu.c
arch/x86/kvm/irq_comm.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/pmu_intel.c
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/pci/common.c
arch/x86/pci/vmd.c
arch/x86/purgatory/Makefile
arch/x86/realmode/rm/Makefile
drivers/acpi/Kconfig
drivers/acpi/Makefile
drivers/acpi/pci_mcfg.c [new file with mode: 0644]
drivers/acpi/pci_root.c
drivers/block/rbd.c
drivers/gpu/drm/drm_dp_helper.c
drivers/gpu/drm/i915/intel_drv.h
drivers/gpu/drm/i915/intel_psr.c
drivers/gpu/drm/i915/intel_sprite.c
drivers/irqchip/Kconfig
drivers/memory/Kconfig
drivers/memory/fsl_ifc.c
drivers/misc/genwqe/card_base.c
drivers/mtd/chips/cfi_cmdset_0020.c
drivers/mtd/devices/Kconfig
drivers/mtd/devices/m25p80.c
drivers/mtd/maps/physmap_of.c
drivers/mtd/maps/pmcmsp-flash.c
drivers/mtd/maps/sa1100-flash.c
drivers/mtd/nand/Kconfig
drivers/mtd/nand/Makefile
drivers/mtd/nand/brcmnand/brcmnand.c
drivers/mtd/nand/jz4780_bch.c
drivers/mtd/nand/jz4780_nand.c
drivers/mtd/nand/mtk_ecc.c [new file with mode: 0644]
drivers/mtd/nand/mtk_ecc.h [new file with mode: 0644]
drivers/mtd/nand/mtk_nand.c [new file with mode: 0644]
drivers/mtd/nand/nand_base.c
drivers/mtd/nand/nand_ids.c
drivers/mtd/nand/omap2.c
drivers/mtd/nand/sunxi_nand.c
drivers/mtd/nand/xway_nand.c
drivers/mtd/onenand/onenand_base.c
drivers/mtd/spi-nor/Kconfig
drivers/mtd/spi-nor/Makefile
drivers/mtd/spi-nor/atmel-quadspi.c [new file with mode: 0644]
drivers/mtd/spi-nor/cadence-quadspi.c [new file with mode: 0644]
drivers/mtd/spi-nor/fsl-quadspi.c
drivers/mtd/spi-nor/hisi-sfc.c [new file with mode: 0644]
drivers/mtd/spi-nor/mtk-quadspi.c
drivers/mtd/spi-nor/nxp-spifi.c
drivers/mtd/spi-nor/spi-nor.c
drivers/mtd/ssfdc.c
drivers/mtd/tests/nandbiterrs.c
drivers/net/ethernet/atheros/alx/main.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/fm10k/fm10k_pci.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/nvme/host/pci.c
drivers/pci/Kconfig
drivers/pci/bus.c
drivers/pci/ecam.c
drivers/pci/ecam.h [deleted file]
drivers/pci/host/Kconfig
drivers/pci/host/Makefile
drivers/pci/host/pci-aardvark.c [new file with mode: 0644]
drivers/pci/host/pci-dra7xx.c
drivers/pci/host/pci-host-common.c
drivers/pci/host/pci-host-generic.c
drivers/pci/host/pci-hyperv.c
drivers/pci/host/pci-keystone.c
drivers/pci/host/pci-layerscape.c
drivers/pci/host/pci-mvebu.c
drivers/pci/host/pci-rcar-gen2.c
drivers/pci/host/pci-tegra.c
drivers/pci/host/pci-thunder-ecam.c
drivers/pci/host/pci-thunder-pem.c
drivers/pci/host/pci-versatile.c
drivers/pci/host/pci-xgene.c
drivers/pci/host/pcie-altera.c
drivers/pci/host/pcie-armada8k.c
drivers/pci/host/pcie-artpec6.c [new file with mode: 0644]
drivers/pci/host/pcie-designware-plat.c
drivers/pci/host/pcie-designware.c
drivers/pci/host/pcie-hisi.c
drivers/pci/host/pcie-iproc.c
drivers/pci/host/pcie-rcar.c
drivers/pci/host/pcie-xilinx-nwl.c
drivers/pci/host/pcie-xilinx.c
drivers/pci/hotplug/acpiphp_glue.c
drivers/pci/hotplug/pciehp_hpc.c
drivers/pci/msi.c
drivers/pci/pci-driver.c
drivers/pci/pci-sysfs.c
drivers/pci/pci.c
drivers/pci/pci.h
drivers/pci/pcie/Kconfig
drivers/pci/pcie/aspm.c
drivers/pci/pcie/pcie-dpc.c
drivers/pci/pcie/portdrv_core.c
drivers/pci/pcie/portdrv_pci.c
drivers/pci/probe.c
drivers/pci/proc.c
drivers/pci/quirks.c
drivers/pci/remove.c
drivers/pci/setup-bus.c
drivers/s390/char/sclp_early.c
drivers/s390/char/sclp_ocf.c
drivers/scsi/lpfc/lpfc_init.c
drivers/usb/host/xhci-pci.c
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/caps.c
fs/ceph/dir.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/snap.c
fs/ceph/super.c
fs/ceph/super.h
fs/ceph/xattr.c
fs/orangefs/dcache.c
fs/orangefs/inode.c
fs/orangefs/namei.c
fs/orangefs/orangefs-kernel.h
fs/orangefs/orangefs-mod.c
fs/orangefs/orangefs-sysfs.c
fs/orangefs/orangefs-utils.c
fs/orangefs/protocol.h
include/asm-generic/vmlinux.lds.h
include/drm/drm_dp_helper.h
include/kvm/arm_vgic.h
include/kvm/vgic/vgic.h [deleted file]
include/linux/ceph/ceph_fs.h
include/linux/ceph/decode.h
include/linux/ceph/libceph.h
include/linux/ceph/mon_client.h
include/linux/ceph/msgpool.h
include/linux/ceph/osd_client.h
include/linux/ceph/osdmap.h
include/linux/ceph/string_table.h [new file with mode: 0644]
include/linux/context_tracking.h
include/linux/export.h
include/linux/irqchip/arm-gic-v3.h
include/linux/kconfig.h
include/linux/kvm_host.h
include/linux/mtd/nand.h
include/linux/mtd/spi-nor.h
include/linux/page_ref.h
include/linux/pci-acpi.h
include/linux/pci-ecam.h [new file with mode: 0644]
include/linux/pci.h
include/trace/events/kvm.h
include/uapi/linux/kvm.h
lib/Kconfig.debug
mm/gup.c
net/ceph/Makefile
net/ceph/ceph_common.c
net/ceph/ceph_fs.c
net/ceph/debugfs.c
net/ceph/mon_client.c
net/ceph/msgpool.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/ceph/string_table.c [new file with mode: 0644]
scripts/Kbuild.include
scripts/Makefile
scripts/Makefile.build
scripts/Makefile.clean
scripts/Makefile.gcc-plugins [new file with mode: 0644]
scripts/Makefile.host
scripts/Makefile.lib
scripts/basic/bin2c.c
scripts/coccicheck
scripts/coccinelle/free/devm_free.cocci
scripts/coccinelle/free/ifnullfree.cocci
scripts/coccinelle/free/kfree.cocci
scripts/coccinelle/free/kfreeaddr.cocci
scripts/coccinelle/iterators/device_node_continue.cocci
scripts/coccinelle/misc/noderef.cocci
scripts/gcc-plugin.sh [new file with mode: 0755]
scripts/gcc-plugins/Makefile [new file with mode: 0644]
scripts/gcc-plugins/cyc_complexity_plugin.c [new file with mode: 0644]
scripts/gcc-plugins/gcc-common.h [new file with mode: 0644]
scripts/gcc-plugins/gcc-generate-gimple-pass.h [new file with mode: 0644]
scripts/gcc-plugins/gcc-generate-ipa-pass.h [new file with mode: 0644]
scripts/gcc-plugins/gcc-generate-rtl-pass.h [new file with mode: 0644]
scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h [new file with mode: 0644]
scripts/gcc-plugins/sancov_plugin.c [new file with mode: 0644]
scripts/link-vmlinux.sh
scripts/package/builddeb
scripts/setlocalversion
virt/kvm/Kconfig
virt/kvm/arm/hyp/vgic-v2-sr.c
virt/kvm/arm/vgic-v2-emul.c [deleted file]
virt/kvm/arm/vgic-v2.c [deleted file]
virt/kvm/arm/vgic-v3-emul.c [deleted file]
virt/kvm/arm/vgic-v3.c [deleted file]
virt/kvm/arm/vgic.c [deleted file]
virt/kvm/arm/vgic.h [deleted file]
virt/kvm/arm/vgic/vgic-init.c
virt/kvm/arm/vgic/vgic-its.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-kvm-device.c
virt/kvm/arm/vgic/vgic-mmio-v2.c
virt/kvm/arm/vgic/vgic-mmio-v3.c
virt/kvm/arm/vgic/vgic-mmio.c
virt/kvm/arm/vgic/vgic-mmio.h
virt/kvm/arm/vgic/vgic-v2.c
virt/kvm/arm/vgic/vgic-v3.c
virt/kvm/arm/vgic/vgic.c
virt/kvm/arm/vgic/vgic.h
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

diff --git a/.cocciconfig b/.cocciconfig
new file mode 100644 (file)
index 0000000..43967c6
--- /dev/null
@@ -0,0 +1,3 @@
+[spatch]
+       options = --timeout 200
+       options = --use-gitgrep
index 0c320bf..c2ed4ec 100644 (file)
@@ -37,6 +37,7 @@ modules.builtin
 Module.symvers
 *.dwo
 *.su
+*.c.[012]*.*
 
 #
 # Top-level generic files
@@ -66,6 +67,7 @@ Module.symvers
 #
 !.gitignore
 !.mailmap
+!.cocciconfig
 
 #
 # Generated include files
index 1179850..c55df29 100644 (file)
@@ -78,422 +78,111 @@ CONFIG_PCI_MSI option.
 
 4.2 Using MSI
 
-Most of the hard work is done for the driver in the PCI layer.  It simply
-has to request that the PCI layer set up the MSI capability for this
+Most of the hard work is done for the driver in the PCI layer.  The driver
+simply has to request that the PCI layer set up the MSI capability for this
 device.
 
-4.2.1 pci_enable_msi
+To automatically use MSI or MSI-X interrupt vectors, use the following
+function:
 
-int pci_enable_msi(struct pci_dev *dev)
+  int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+               unsigned int max_vecs, unsigned int flags);
 
-A successful call allocates ONE interrupt to the device, regardless
-of how many MSIs the device supports.  The device is switched from
-pin-based interrupt mode to MSI mode.  The dev->irq number is changed
-to a new number which represents the message signaled interrupt;
-consequently, this function should be called before the driver calls
-request_irq(), because an MSI is delivered via a vector that is
-different from the vector of a pin-based interrupt.
+which allocates up to max_vecs interrupt vectors for a PCI device.  It
+returns the number of vectors allocated or a negative error.  If the device
+has a requirement for a minimum number of vectors, the driver can pass a
+min_vecs argument set to this limit, and the PCI core will return -ENOSPC
+if it can't meet the minimum number of vectors.
 
-4.2.2 pci_enable_msi_range
+The flags argument should normally be set to 0, but can be used to pass the
+PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flags in case a device claims to support
+MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in
+case the device does not support legacy interrupt lines.
 
-int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+By default this function will spread the interrupts around the available
+CPUs, but this feature can be disabled by passing the PCI_IRQ_NOAFFINITY
+flag.
 
-This function allows a device driver to request any number of MSI
-interrupts within specified range from 'minvec' to 'maxvec'.
+To get the Linux IRQ numbers to pass to request_irq() and free_irq() for the
+allocated vectors, use the following function:
 
-If this function returns a positive number it indicates the number of
-MSI interrupts that have been successfully allocated.  In this case
-the device is switched from pin-based interrupt mode to MSI mode and
-updates dev->irq to be the lowest of the new interrupts assigned to it.
-The other interrupts assigned to the device are in the range dev->irq
-to dev->irq + returned value - 1.  Device driver can use the returned
-number of successfully allocated MSI interrupts to further allocate
-and initialize device resources.
+  int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
 
-If this function returns a negative number, it indicates an error and
-the driver should not attempt to request any more MSI interrupts for
-this device.
+Before removing the device, any allocated resources should be freed using
+the following function:
 
-This function should be called before the driver calls request_irq(),
-because MSI interrupts are delivered via vectors that are different
-from the vector of a pin-based interrupt.
+  void pci_free_irq_vectors(struct pci_dev *dev);
 
-It is ideal if drivers can cope with a variable number of MSI interrupts;
-there are many reasons why the platform may not be able to provide the
-exact number that a driver asks for.
+If a device supports both MSI-X and MSI capabilities, this API will use the
+MSI-X facilities in preference to the MSI facilities.  MSI-X supports any
+number of interrupts between 1 and 2048.  In contrast, MSI is restricted to
+a maximum of 32 interrupts (and must be a power of two).  In addition, the
+MSI interrupt vectors must be allocated consecutively, so the system might
+not be able to allocate as many vectors for MSI as it could for MSI-X.  On
+some platforms, MSI interrupts must all be targeted at the same set of CPUs
+whereas MSI-X interrupts can all be targeted at different CPUs.
 
-There could be devices that can not operate with just any number of MSI
-interrupts within a range.  See chapter 4.3.1.3 to get the idea how to
-handle such devices for MSI-X - the same logic applies to MSI.
+If a device supports neither MSI-X nor MSI, it will fall back to a single
+legacy IRQ vector.
 
-4.2.1.1 Maximum possible number of MSI interrupts
+The typical usage of MSI or MSI-X interrupts is to allocate as many vectors
+as possible, likely up to the limit supported by the device.  If nvec is
+larger than the number supported by the device it will automatically be
+capped to the supported limit, so there is no need to query the number of
+vectors supported beforehand:
 
-The typical usage of MSI interrupts is to allocate as many vectors as
-possible, likely up to the limit returned by pci_msi_vec_count() function:
-
-static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec)
-{
-       return pci_enable_msi_range(pdev, 1, nvec);
-}
-
-Note the value of 'minvec' parameter is 1.  As 'minvec' is inclusive,
-the value of 0 would be meaningless and could result in error.
-
-Some devices have a minimal limit on number of MSI interrupts.
-In this case the function could look like this:
-
-static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec)
-{
-       return pci_enable_msi_range(pdev, FOO_DRIVER_MINIMUM_NVEC, nvec);
-}
-
-4.2.1.2 Exact number of MSI interrupts
+       nvec = pci_alloc_irq_vectors(pdev, 1, nvec, 0);
+       if (nvec < 0)
+               goto out_err;
 
 If a driver is unable or unwilling to deal with a variable number of MSI
-interrupts it could request a particular number of interrupts by passing
-that number to pci_enable_msi_range() function as both 'minvec' and 'maxvec'
-parameters:
-
-static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec)
-{
-       return pci_enable_msi_range(pdev, nvec, nvec);
-}
-
-Note, unlike pci_enable_msi_exact() function, which could be also used to
-enable a particular number of MSI-X interrupts, pci_enable_msi_range()
-returns either a negative errno or 'nvec' (not negative errno or 0 - as
-pci_enable_msi_exact() does).
-
-4.2.1.3 Single MSI mode
-
-The most notorious example of the request type described above is
-enabling the single MSI mode for a device.  It could be done by passing
-two 1s as 'minvec' and 'maxvec':
-
-static int foo_driver_enable_single_msi(struct pci_dev *pdev)
-{
-       return pci_enable_msi_range(pdev, 1, 1);
-}
-
-Note, unlike pci_enable_msi() function, which could be also used to
-enable the single MSI mode, pci_enable_msi_range() returns either a
-negative errno or 1 (not negative errno or 0 - as pci_enable_msi()
-does).
-
-4.2.3 pci_enable_msi_exact
-
-int pci_enable_msi_exact(struct pci_dev *dev, int nvec)
-
-This variation on pci_enable_msi_range() call allows a device driver to
-request exactly 'nvec' MSIs.
-
-If this function returns a negative number, it indicates an error and
-the driver should not attempt to request any more MSI interrupts for
-this device.
-
-By contrast with pci_enable_msi_range() function, pci_enable_msi_exact()
-returns zero in case of success, which indicates MSI interrupts have been
-successfully allocated.
-
-4.2.4 pci_disable_msi
-
-void pci_disable_msi(struct pci_dev *dev)
-
-This function should be used to undo the effect of pci_enable_msi_range().
-Calling it restores dev->irq to the pin-based interrupt number and frees
-the previously allocated MSIs.  The interrupts may subsequently be assigned
-to another device, so drivers should not cache the value of dev->irq.
-
-Before calling this function, a device driver must always call free_irq()
-on any interrupt for which it previously called request_irq().
-Failure to do so results in a BUG_ON(), leaving the device with
-MSI enabled and thus leaking its vector.
-
-4.2.4 pci_msi_vec_count
-
-int pci_msi_vec_count(struct pci_dev *dev)
-
-This function could be used to retrieve the number of MSI vectors the
-device requested (via the Multiple Message Capable register). The MSI
-specification only allows the returned value to be a power of two,
-up to a maximum of 2^5 (32).
-
-If this function returns a negative number, it indicates the device is
-not capable of sending MSIs.
-
-If this function returns a positive number, it indicates the maximum
-number of MSI interrupt vectors that could be allocated.
-
-4.3 Using MSI-X
-
-The MSI-X capability is much more flexible than the MSI capability.
-It supports up to 2048 interrupts, each of which can be controlled
-independently.  To support this flexibility, drivers must use an array of
-`struct msix_entry':
-
-struct msix_entry {
-       u16     vector; /* kernel uses to write alloc vector */
-       u16     entry; /* driver uses to specify entry */
-};
-
-This allows for the device to use these interrupts in a sparse fashion;
-for example, it could use interrupts 3 and 1027 and yet allocate only a
-two-element array.  The driver is expected to fill in the 'entry' value
-in each element of the array to indicate for which entries the kernel
-should assign interrupts; it is invalid to fill in two entries with the
-same number.
-
-4.3.1 pci_enable_msix_range
-
-int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
-                         int minvec, int maxvec)
-
-Calling this function asks the PCI subsystem to allocate any number of
-MSI-X interrupts within specified range from 'minvec' to 'maxvec'.
-The 'entries' argument is a pointer to an array of msix_entry structs
-which should be at least 'maxvec' entries in size.
-
-On success, the device is switched into MSI-X mode and the function
-returns the number of MSI-X interrupts that have been successfully
-allocated.  In this case the 'vector' member in entries numbered from
-0 to the returned value - 1 is populated with the interrupt number;
-the driver should then call request_irq() for each 'vector' that it
-decides to use.  The device driver is responsible for keeping track of the
-interrupts assigned to the MSI-X vectors so it can free them again later.
-Device driver can use the returned number of successfully allocated MSI-X
-interrupts to further allocate and initialize device resources.
-
-If this function returns a negative number, it indicates an error and
-the driver should not attempt to allocate any more MSI-X interrupts for
-this device.
-
-This function, in contrast with pci_enable_msi_range(), does not adjust
-dev->irq.  The device will not generate interrupts for this interrupt
-number once MSI-X is enabled.
-
-Device drivers should normally call this function once per device
-during the initialization phase.
-
-It is ideal if drivers can cope with a variable number of MSI-X interrupts;
-there are many reasons why the platform may not be able to provide the
-exact number that a driver asks for.
-
-There could be devices that can not operate with just any number of MSI-X
-interrupts within a range.  E.g., an network adapter might need let's say
-four vectors per each queue it provides.  Therefore, a number of MSI-X
-interrupts allocated should be a multiple of four.  In this case interface
-pci_enable_msix_range() can not be used alone to request MSI-X interrupts
-(since it can allocate any number within the range, without any notion of
-the multiple of four) and the device driver should master a custom logic
-to request the required number of MSI-X interrupts.
-
-4.3.1.1 Maximum possible number of MSI-X interrupts
-
-The typical usage of MSI-X interrupts is to allocate as many vectors as
-possible, likely up to the limit returned by pci_msix_vec_count() function:
-
-static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
-{
-       return pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
-                                    1, nvec);
-}
-
-Note the value of 'minvec' parameter is 1.  As 'minvec' is inclusive,
-the value of 0 would be meaningless and could result in error.
-
-Some devices have a minimal limit on number of MSI-X interrupts.
-In this case the function could look like this:
-
-static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
-{
-       return pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
-                                    FOO_DRIVER_MINIMUM_NVEC, nvec);
-}
-
-4.3.1.2 Exact number of MSI-X interrupts
-
-If a driver is unable or unwilling to deal with a variable number of MSI-X
-interrupts it could request a particular number of interrupts by passing
-that number to pci_enable_msix_range() function as both 'minvec' and 'maxvec'
-parameters:
-
-static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
-{
-       return pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
-                                    nvec, nvec);
-}
-
-Note, unlike pci_enable_msix_exact() function, which could be also used to
-enable a particular number of MSI-X interrupts, pci_enable_msix_range()
-returns either a negative errno or 'nvec' (not negative errno or 0 - as
-pci_enable_msix_exact() does).
-
-4.3.1.3 Specific requirements to the number of MSI-X interrupts
-
-As noted above, there could be devices that can not operate with just any
-number of MSI-X interrupts within a range.  E.g., let's assume a device that
-is only capable sending the number of MSI-X interrupts which is a power of
-two.  A routine that enables MSI-X mode for such device might look like this:
-
-/*
- * Assume 'minvec' and 'maxvec' are non-zero
- */
-static int foo_driver_enable_msix(struct foo_adapter *adapter,
-                                 int minvec, int maxvec)
-{
-       int rc;
-
-       minvec = roundup_pow_of_two(minvec);
-       maxvec = rounddown_pow_of_two(maxvec);
-
-       if (minvec > maxvec)
-               return -ERANGE;
-
-retry:
-       rc = pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
-                                  maxvec, maxvec);
-       /*
-        * -ENOSPC is the only error code allowed to be analyzed
-        */
-       if (rc == -ENOSPC) {
-               if (maxvec == 1)
-                       return -ENOSPC;
-
-               maxvec /= 2;
-
-               if (minvec > maxvec)
-                       return -ENOSPC;
-
-               goto retry;
-       }
-
-       return rc;
-}
-
-Note how pci_enable_msix_range() return value is analyzed for a fallback -
-any error code other than -ENOSPC indicates a fatal error and should not
-be retried.
-
-4.3.2 pci_enable_msix_exact
-
-int pci_enable_msix_exact(struct pci_dev *dev,
-                         struct msix_entry *entries, int nvec)
-
-This variation on pci_enable_msix_range() call allows a device driver to
-request exactly 'nvec' MSI-Xs.
-
-If this function returns a negative number, it indicates an error and
-the driver should not attempt to allocate any more MSI-X interrupts for
-this device.
-
-By contrast with pci_enable_msix_range() function, pci_enable_msix_exact()
-returns zero in case of success, which indicates MSI-X interrupts have been
-successfully allocated.
-
-Another version of a routine that enables MSI-X mode for a device with
-specific requirements described in chapter 4.3.1.3 might look like this:
-
-/*
- * Assume 'minvec' and 'maxvec' are non-zero
- */
-static int foo_driver_enable_msix(struct foo_adapter *adapter,
-                                 int minvec, int maxvec)
-{
-       int rc;
-
-       minvec = roundup_pow_of_two(minvec);
-       maxvec = rounddown_pow_of_two(maxvec);
-
-       if (minvec > maxvec)
-               return -ERANGE;
-
-retry:
-       rc = pci_enable_msix_exact(adapter->pdev,
-                                  adapter->msix_entries, maxvec);
-
-       /*
-        * -ENOSPC is the only error code allowed to be analyzed
-        */
-       if (rc == -ENOSPC) {
-               if (maxvec == 1)
-                       return -ENOSPC;
-
-               maxvec /= 2;
-
-               if (minvec > maxvec)
-                       return -ENOSPC;
-
-               goto retry;
-       } else if (rc < 0) {
-               return rc;
-       }
-
-       return maxvec;
-}
-
-4.3.3 pci_disable_msix
-
-void pci_disable_msix(struct pci_dev *dev)
-
-This function should be used to undo the effect of pci_enable_msix_range().
-It frees the previously allocated MSI-X interrupts. The interrupts may
-subsequently be assigned to another device, so drivers should not cache
-the value of the 'vector' elements over a call to pci_disable_msix().
-
-Before calling this function, a device driver must always call free_irq()
-on any interrupt for which it previously called request_irq().
-Failure to do so results in a BUG_ON(), leaving the device with
-MSI-X enabled and thus leaking its vector.
-
-4.3.3 The MSI-X Table
-
-The MSI-X capability specifies a BAR and offset within that BAR for the
-MSI-X Table.  This address is mapped by the PCI subsystem, and should not
-be accessed directly by the device driver.  If the driver wishes to
-mask or unmask an interrupt, it should call disable_irq() / enable_irq().
+interrupts, it can request a particular number of interrupts by passing that
+number to the pci_alloc_irq_vectors() function as both the 'min_vecs' and
+'max_vecs' parameters:
 
-4.3.4 pci_msix_vec_count
+       ret = pci_alloc_irq_vectors(pdev, nvec, nvec, 0);
+       if (ret < 0)
+               goto out_err;
 
-int pci_msix_vec_count(struct pci_dev *dev)
+The most notorious example of the request type described above is enabling
+the single MSI mode for a device.  It could be done by passing two 1s as
+'min_vecs' and 'max_vecs':
 
-This function could be used to retrieve number of entries in the device
-MSI-X table.
+       ret = pci_alloc_irq_vectors(pdev, 1, 1, 0);
+       if (ret < 0)
+               goto out_err;
 
-If this function returns a negative number, it indicates the device is
-not capable of sending MSI-Xs.
+Some devices might not support using legacy line interrupts, in which case
+the PCI_IRQ_NOLEGACY flag can be used to fail the request if the platform
+can't provide MSI or MSI-X interrupts:
 
-If this function returns a positive number, it indicates the maximum
-number of MSI-X interrupt vectors that could be allocated.
+       nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_NOLEGACY);
+       if (nvec < 0)
+               goto out_err;
 
-4.4 Handling devices implementing both MSI and MSI-X capabilities
+4.3 Legacy APIs
 
-If a device implements both MSI and MSI-X capabilities, it can
-run in either MSI mode or MSI-X mode, but not both simultaneously.
-This is a requirement of the PCI spec, and it is enforced by the
-PCI layer.  Calling pci_enable_msi_range() when MSI-X is already
-enabled or pci_enable_msix_range() when MSI is already enabled
-results in an error.  If a device driver wishes to switch between MSI
-and MSI-X at runtime, it must first quiesce the device, then switch
-it back to pin-interrupt mode, before calling pci_enable_msi_range()
-or pci_enable_msix_range() and resuming operation.  This is not expected
-to be a common operation but may be useful for debugging or testing
-during development.
+The following old APIs to enable and disable MSI or MSI-X interrupts should
+not be used in new code:
 
-4.5 Considerations when using MSIs
+  pci_enable_msi()             /* deprecated */
+  pci_enable_msi_range()       /* deprecated */
+  pci_enable_msi_exact()       /* deprecated */
+  pci_disable_msi()            /* deprecated */
+  pci_enable_msix_range()      /* deprecated */
+  pci_enable_msix_exact()      /* deprecated */
+  pci_disable_msix()           /* deprecated */
 
-4.5.1 Choosing between MSI-X and MSI
+Additionally there are APIs to provide the number of supported MSI or MSI-X
+vectors: pci_msi_vec_count() and pci_msix_vec_count().  In general these
+should be avoided in favor of letting pci_alloc_irq_vectors() cap the
+number of vectors.  If you have a legitimate special use case for the count
+of vectors we might have to revisit that decision and add a
+pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently.
 
-If your device supports both MSI-X and MSI capabilities, you should use
-the MSI-X facilities in preference to the MSI facilities.  As mentioned
-above, MSI-X supports any number of interrupts between 1 and 2048.
-In contrast, MSI is restricted to a maximum of 32 interrupts (and
-must be a power of two).  In addition, the MSI interrupt vectors must
-be allocated consecutively, so the system might not be able to allocate
-as many vectors for MSI as it could for MSI-X.  On some platforms, MSI
-interrupts must all be targeted at the same set of CPUs whereas MSI-X
-interrupts can all be targeted at different CPUs.
+4.4 Considerations when using MSIs
 
-4.5.2 Spinlocks
+4.4.1 Spinlocks
 
 Most device drivers have a per-device spinlock which is taken in the
 interrupt handler.  With pin-based interrupts or a single MSI, it is not
@@ -505,7 +194,7 @@ acquire the spinlock.  Such deadlocks can be avoided by using
 spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
 and acquire the lock (see Documentation/DocBook/kernel-locking).
 
-4.6 How to tell whether MSI/MSI-X is enabled on a device
+4.5 How to tell whether MSI/MSI-X is enabled on a device
 
 Using 'lspci -v' (as root) may show some devices with "MSI", "Message
 Signalled Interrupts" or "MSI-X" capabilities.  Each of these capabilities
index 7f773d5..01fb1da 100644 (file)
@@ -38,6 +38,15 @@ as a regular user, and install it with
 
         sudo make install
 
+ Supplemental documentation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For supplemental documentation refer to the wiki:
+
+https://bottest.wiki.kernel.org/coccicheck
+
+The wiki documentation always refers to the linux-next version of the script.
+
  Using Coccinelle on the Linux kernel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -94,11 +103,26 @@ To enable verbose messages set the V= variable, for example:
 
    make coccicheck MODE=report V=1
 
+ Coccinelle parallelization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 By default, coccicheck tries to run as parallel as possible. To change
 the parallelism, set the J= variable. For example, to run across 4 CPUs:
 
    make coccicheck MODE=report J=4
 
+As of Coccinelle 1.0.2, Coccinelle uses the OCaml parmap library for
+parallelization; if support for this is detected you will benefit from it.
+
+When parmap is enabled, coccicheck will enable dynamic load balancing by using
+the '--chunksize 1' argument; this ensures we keep feeding threads with work
+one by one, so that we avoid the situation where most of the work gets done by
+only a few threads. With dynamic load balancing, if a thread finishes early we
+keep feeding it more work.
+
+When parmap is enabled, if an error occurs in Coccinelle, this error
+value is propagated back, and the return value of 'make coccicheck'
+captures this return value.
 
  Using Coccinelle with a single semantic patch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -142,15 +166,118 @@ semantic patch as shown in the previous section.
 The "report" mode is the default. You can select another one with the
 MODE variable explained above.
 
+ Debugging Coccinelle SmPL patches
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using coccicheck is best as it provides the spatch command line with
+include options matching the options used when we compile the kernel.
+You can learn what these options are by using V=1; you can then
+manually run Coccinelle with debug options added.
+
+Alternatively you can debug running Coccinelle against SmPL patches
+by asking for stderr to be redirected to a file; by default stderr
+is redirected to /dev/null. If you'd like to capture stderr you
+can specify the DEBUG_FILE="file.txt" option to coccicheck. For
+instance:
+
+    rm -f cocci.err
+    make coccicheck COCCI=scripts/coccinelle/free/kfree.cocci MODE=report DEBUG_FILE=cocci.err
+    cat cocci.err
+
+You can use SPFLAGS to add debugging flags; for instance you may want to
+add both --profile and --show-trying to SPFLAGS when debugging, for
+example:
+
+    rm -f err.log
+    export COCCI=scripts/coccinelle/misc/irqf_oneshot.cocci
+    make coccicheck DEBUG_FILE="err.log" MODE=report SPFLAGS="--profile --show-trying" M=./drivers/mfd/arizona-irq.c
+
+err.log will now have the profiling information, while stdout will
+provide some progress information as Coccinelle moves forward with
+work.
+
+DEBUG_FILE is only supported when using coccinelle >= 1.2.
+
+ .cocciconfig support
+~~~~~~~~~~~~~~~~~~~~~~
+
+Coccinelle supports reading .cocciconfig for default Coccinelle options that
+should be used every time spatch is spawned. The order of precedence for
+variables for .cocciconfig is as follows:
+
+  o Your current user's home directory is processed first
+  o Your directory from which spatch is called is processed next
+  o The directory provided with the --dir option is processed last, if used
+
+Since coccicheck runs through make, it naturally runs from the kernel
+proper dir; as such the second rule above applies for picking up a
+.cocciconfig when using 'make coccicheck'.
+
+'make coccicheck' also supports using M= targets. If you do not supply
+any M= target, it is assumed you want to target the entire kernel.
+The kernel coccicheck script has:
+
+    if [ "$KBUILD_EXTMOD" = "" ] ; then
+        OPTIONS="--dir $srctree $COCCIINCLUDE"
+    else
+        OPTIONS="--dir $KBUILD_EXTMOD $COCCIINCLUDE"
+    fi
+
+KBUILD_EXTMOD is set when an explicit target with M= is used. For both cases
+the spatch --dir argument is used; as such the third rule applies whether M=
+is used or not, and when M= is used the target directory can have its own
+.cocciconfig file. When M= is not passed as an argument to coccicheck the
+target directory is the same as the directory from where spatch was called.
+
+If not using the kernel's coccicheck target, keep the above precedence
+order of .cocciconfig reading in mind. If using the kernel's coccicheck target,
+override any of the kernel's .cocciconfig settings using SPFLAGS.
+
+We help Coccinelle when used against Linux with a set of sensible default
+options for Linux in our own Linux .cocciconfig. This hints to Coccinelle that
+git can be used for 'git grep' queries over coccigrep. A timeout of 200
+seconds should suffice for now.
+
+The options picked up by coccinelle when reading a .cocciconfig do not appear
+as arguments to spatch processes running on your system. To confirm what
+options will be used by Coccinelle, run:
+
+      spatch --print-options-only
+
+You can override with your own preferred index option by using SPFLAGS. Take
+note that when there are conflicting options Coccinelle gives precedence to
+the last options passed. It is possible to use idutils via .cocciconfig,
+however given the order of precedence followed by Coccinelle, since the kernel
+now carries its own .cocciconfig, you will need to use SPFLAGS to use idutils
+if desired. See the "Additional flags" section below for more details on how
+to use idutils.
+
  Additional flags
 ~~~~~~~~~~~~~~~~~~
 
 Additional flags can be passed to spatch through the SPFLAGS
-variable.
+variable. This works because Coccinelle respects the last flags
+given to it when options are in conflict.
 
     make SPFLAGS=--use-glimpse coccicheck
+
+Coccinelle supports idutils as well but requires coccinelle >= 1.0.6.
+When no ID file is specified coccinelle assumes your ID database file
+is in the file .id-utils.index on the top level of the kernel; coccinelle
+carries a script, scripts/idutils_index.sh, which creates the database with
+
+    mkid -i C --output .id-utils.index
+
+If you have another database filename you can also just symlink with this
+name.
+
     make SPFLAGS=--use-idutils coccicheck
 
+Alternatively you can specify the database filename explicitly, for
+instance:
+
+    make SPFLAGS="--use-idutils /full-path/to/ID" coccicheck
+
 See spatch --help to learn more about spatch options.
 
 Note that the '--use-glimpse' and '--use-idutils' options
@@ -159,6 +286,25 @@ thus active by default. However, by indexing the code with
 one of these tools, and according to the cocci file used,
 spatch could proceed the entire code base more quickly.
 
+ SmPL patch specific options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+SmPL patches can have their own requirements for options passed
+to Coccinelle. SmPL patch specific options can be provided by
+providing them at the top of the SmPL patch, for instance:
+
+// Options: --no-includes --include-headers
+
+ SmPL patch Coccinelle requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As Coccinelle features get added some more advanced SmPL patches
+may require newer versions of Coccinelle. If an SmPL patch requires
+a minimum version of Coccinelle, this can be specified as follows,
+for example if requiring at least Coccinelle >= 1.0.5:
+
+// Requires: 1.0.5
+
  Proposing new semantic patches
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
index 21055e2..c1359f4 100644 (file)
@@ -46,6 +46,10 @@ Required properties:
                        0 maps to GPMC_WAIT0 pin.
  - gpio-cells:         Must be set to 2
 
+Required properties when using NAND prefetch dma:
+ - dmas                        GPMC NAND prefetch dma channel
+ - dma-names           Must be set to "rxtx"
+
 Timing properties for child nodes. All are optional and default to 0.
 
  - gpmc,sync-clk-ps:   Minimum clock period for synchronous mode, in picoseconds
@@ -137,7 +141,8 @@ Example for an AM33xx board:
                ti,hwmods = "gpmc";
                reg = <0x50000000 0x2000>;
                interrupts = <100>;
-
+               dmas = <&edma 52 0>;
+               dma-names = "rxtx";
                gpmc,num-cs = <8>;
                gpmc,num-waitpins = <2>;
                #address-cells = <2>;
diff --git a/Documentation/devicetree/bindings/mtd/atmel-quadspi.txt b/Documentation/devicetree/bindings/mtd/atmel-quadspi.txt
new file mode 100644 (file)
index 0000000..4898070
--- /dev/null
@@ -0,0 +1,32 @@
+* Atmel Quad Serial Peripheral Interface (QSPI)
+
+Required properties:
+- compatible:     Should be "atmel,sama5d2-qspi".
+- reg:            Should contain the locations and lengths of the base registers
+                  and the mapped memory.
+- reg-names:      Should contain the resource reg names:
+                  - qspi_base: configuration register address space
+                  - qspi_mmap: memory mapped address space
+- interrupts:     Should contain the interrupt for the device.
+- clocks:         The phandle of the clock needed by the QSPI controller.
+- #address-cells: Should be <1>.
+- #size-cells:    Should be <0>.
+
+Example:
+
+spi@f0020000 {
+       compatible = "atmel,sama5d2-qspi";
+       reg = <0xf0020000 0x100>, <0xd0000000 0x8000000>;
+       reg-names = "qspi_base", "qspi_mmap";
+       interrupts = <52 IRQ_TYPE_LEVEL_HIGH 7>;
+       clocks = <&spi0_clk>;
+       #address-cells = <1>;
+       #size-cells = <0>;
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_spi0_default>;
+       status = "okay";
+
+       m25p80@0 {
+               ...
+       };
+};
index 7066597..b40f3a4 100644 (file)
@@ -27,6 +27,7 @@ Required properties:
                          brcm,brcmnand-v6.2
                          brcm,brcmnand-v7.0
                          brcm,brcmnand-v7.1
+                         brcm,brcmnand-v7.2
                          brcm,brcmnand
 - reg              : the register start and length for NAND register region.
                      (optional) Flash DMA register range (if present)
diff --git a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
new file mode 100644 (file)
index 0000000..f248056
--- /dev/null
@@ -0,0 +1,56 @@
+* Cadence Quad SPI controller
+
+Required properties:
+- compatible : Should be "cdns,qspi-nor".
+- reg : Contains two entries, each of which is a tuple consisting of a
+       physical address and length. The first entry is the address and
+       length of the controller register set. The second entry is the
+       address and length of the QSPI Controller data area.
+- interrupts : Unit interrupt specifier for the controller interrupt.
+- clocks : phandle to the Quad SPI clock.
+- cdns,fifo-depth : Size of the data FIFO in words.
+- cdns,fifo-width : Bus width of the data FIFO in bytes.
+- cdns,trigger-address : 32-bit indirect AHB trigger address.
+
+Optional properties:
+- cdns,is-decoded-cs : Flag to indicate whether decoder is used or not.
+
+Optional subnodes:
+Subnodes of the Cadence Quad SPI controller are spi slave nodes with additional
+custom properties:
+- cdns,read-delay : Delay for read capture logic, in clock cycles
+- cdns,tshsl-ns : Delay in nanoseconds for the length that the master
+                  mode chip select outputs are de-asserted between
+                 transactions.
+- cdns,tsd2d-ns : Delay in nanoseconds between one chip select being
+                  de-activated and the activation of another.
+- cdns,tchsh-ns : Delay in nanoseconds between last bit of current
+                  transaction and deasserting the device chip select
+                 (qspi_n_ss_out).
+- cdns,tslch-ns : Delay in nanoseconds between setting qspi_n_ss_out low
+                  and first bit transfer.
+
+Example:
+
+       qspi: spi@ff705000 {
+               compatible = "cdns,qspi-nor";
+               #address-cells = <1>;
+               #size-cells = <0>;
+               reg = <0xff705000 0x1000>,
+                     <0xffa00000 0x1000>;
+               interrupts = <0 151 4>;
+               clocks = <&qspi_clk>;
+               cdns,is-decoded-cs;
+               cdns,fifo-depth = <128>;
+               cdns,fifo-width = <4>;
+               cdns,trigger-address = <0x00000000>;
+
+               flash0: n25q00@0 {
+                       ...
+                       cdns,read-delay = <4>;
+                       cdns,tshsl-ns = <50>;
+                       cdns,tsd2d-ns = <50>;
+                       cdns,tchsh-ns = <4>;
+                       cdns,tslch-ns = <4>;
+               };
+       };
index 3ee7e20..174f68c 100644 (file)
@@ -39,7 +39,7 @@ Optional properties:
 
                "prefetch-polled"       Prefetch polled mode (default)
                "polled"                Polled mode, without prefetch
-               "prefetch-dma"          Prefetch enabled sDMA mode
+               "prefetch-dma"          Prefetch enabled DMA mode
                "prefetch-irq"          Prefetch enabled irq mode
 
  - elm_id:     <deprecated> use "ti,elm-id" instead
diff --git a/Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt b/Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt
new file mode 100644 (file)
index 0000000..7498152
--- /dev/null
@@ -0,0 +1,24 @@
+HiSilicon SPI-NOR Flash Controller
+
+Required properties:
+- compatible : Should be "hisilicon,fmc-spi-nor" and one of the following strings:
+               "hisilicon,hi3519-spi-nor"
+- #address-cells : Should be 1.
+- #size-cells : Should be 0.
+- reg : Offset and length of the register set for the controller device.
+- reg-names : Must include the following two entries: "control", "memory".
+- clocks : handle to spi-nor flash controller clock.
+
+Example:
+spi-nor-controller@10000000 {
+       compatible = "hisilicon,hi3519-spi-nor", "hisilicon,fmc-spi-nor";
+       #address-cells = <1>;
+       #size-cells = <0>;
+       reg = <0x10000000 0x1000>, <0x14000000 0x1000000>;
+       reg-names = "control", "memory";
+       clocks = <&clock HI3519_FMC_CLK>;
+       spi-nor@0 {
+               compatible = "jedec,spi-nor";
+               reg = <0>;
+       };
+};
diff --git a/Documentation/devicetree/bindings/mtd/mtk-nand.txt b/Documentation/devicetree/bindings/mtd/mtk-nand.txt
new file mode 100644 (file)
index 0000000..069c192
--- /dev/null
@@ -0,0 +1,160 @@
+MTK SoCs NAND FLASH controller (NFC) DT binding
+
+This file documents the device tree bindings for MTK SoCs NAND controllers.
+The functional split of the controller requires two drivers to operate:
+the nand controller interface driver and the ECC engine driver.
+
+The hardware description for both devices must be captured as device
+tree nodes.
+
+1) NFC NAND Controller Interface (NFI):
+=======================================
+
+The first part of NFC is NAND Controller Interface (NFI) HW.
+Required NFI properties:
+- compatible:                  Should be "mediatek,mtxxxx-nfc".
+- reg:                         Base physical address and size of NFI.
+- interrupts:                  Interrupts of NFI.
+- clocks:                      NFI required clocks.
+- clock-names:                 NFI clocks internal name.
+- status:                      Disabled by default. Set to "okay" by the platform.
+- ecc-engine:                  Required ECC Engine node.
+- #address-cells:              NAND chip index, should be 1.
+- #size-cells:                 Should be 0.
+
+Example:
+
+       nandc: nfi@1100d000 {
+               compatible = "mediatek,mt2701-nfc";
+               reg = <0 0x1100d000 0 0x1000>;
+               interrupts = <GIC_SPI 56 IRQ_TYPE_LEVEL_LOW>;
+               clocks = <&pericfg CLK_PERI_NFI>,
+                        <&pericfg CLK_PERI_NFI_PAD>;
+               clock-names = "nfi_clk", "pad_clk";
+               status = "disabled";
+               ecc-engine = <&bch>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+        };
+
+Platform related properties, should be set in {platform_name}.dts:
+- children nodes:      NAND chips.
+
+Children nodes properties:
+- reg:                 Chip Select Signal, default 0.
+                       Set as reg = <0>, <1> when 2 CS are needed.
+Optional:
+- nand-on-flash-bbt:   Store BBT on NAND Flash.
+- nand-ecc-mode:       the NAND ecc mode (check driver for supported modes)
+- nand-ecc-step-size:  Number of data bytes covered by a single ECC step.
+                       valid values: 512 and 1024.
+                       1024 is recommended for large page NANDs.
+- nand-ecc-strength:   Number of bits to correct per ECC step.
+                       The valid values that the controller supports are: 4, 6,
+                       8, 10, 12, 14, 16, 18, 20, 22, 24, 28, 32, 36, 40, 44,
+                       48, 52, 56, 60.
+                       The strength should be calculated as follows:
+                       E = (S - F) * 8 / 14
+                       S = O / (P / Q)
+                               E :     nand-ecc-strength.
+                               S :     spare size per sector.
+                               F :     FDM size, should be in the range [1,8].
+                                       It is used to store free oob data.
+                               O :     oob size.
+                               P :     page size.
+                               Q :     nand-ecc-step-size.
+                       If the result does not match any one of the listed
+                       choices above, please select the smaller valid value from
+                       the list.
+                       (otherwise the driver will do the adjustment at runtime)
+- pinctrl-names:       Default NAND pin GPIO setting name.
+- pinctrl-0:           GPIO setting node.
+
+Example:
+       &pio {
+               nand_pins_default: nanddefault {
+                       pins_dat {
+                               pinmux = <MT2701_PIN_111_MSDC0_DAT7__FUNC_NLD7>,
+                                        <MT2701_PIN_112_MSDC0_DAT6__FUNC_NLD6>,
+                                        <MT2701_PIN_114_MSDC0_DAT4__FUNC_NLD4>,
+                                        <MT2701_PIN_118_MSDC0_DAT3__FUNC_NLD3>,
+                                        <MT2701_PIN_121_MSDC0_DAT0__FUNC_NLD0>,
+                                        <MT2701_PIN_120_MSDC0_DAT1__FUNC_NLD1>,
+                                        <MT2701_PIN_113_MSDC0_DAT5__FUNC_NLD5>,
+                                        <MT2701_PIN_115_MSDC0_RSTB__FUNC_NLD8>,
+                                        <MT2701_PIN_119_MSDC0_DAT2__FUNC_NLD2>;
+                               input-enable;
+                               drive-strength = <MTK_DRIVE_8mA>;
+                               bias-pull-up;
+                       };
+
+                       pins_we {
+                               pinmux = <MT2701_PIN_117_MSDC0_CLK__FUNC_NWEB>;
+                               drive-strength = <MTK_DRIVE_8mA>;
+                               bias-pull-up = <MTK_PUPD_SET_R1R0_10>;
+                       };
+
+                       pins_ale {
+                               pinmux = <MT2701_PIN_116_MSDC0_CMD__FUNC_NALE>;
+                               drive-strength = <MTK_DRIVE_8mA>;
+                               bias-pull-down = <MTK_PUPD_SET_R1R0_10>;
+                       };
+               };
+       };
+
+       &nandc {
+               status = "okay";
+               pinctrl-names = "default";
+               pinctrl-0 = <&nand_pins_default>;
+               nand@0 {
+                       reg = <0>;
+                       nand-on-flash-bbt;
+                       nand-ecc-mode = "hw";
+                       nand-ecc-strength = <24>;
+                       nand-ecc-step-size = <1024>;
+               };
+       };
+
+NAND chip optional subnodes:
+- Partitions, see Documentation/devicetree/bindings/mtd/partition.txt
+
+Example:
+       nand@0 {
+               partitions {
+                       compatible = "fixed-partitions";
+                       #address-cells = <1>;
+                       #size-cells = <1>;
+
+                       preloader@0 {
+                               label = "pl";
+                               read-only;
+                               reg = <0x00000000 0x00400000>;
+                       };
+                       android@0x00400000 {
+                               label = "android";
+                               reg = <0x00400000 0x12c00000>;
+                       };
+               };
+       };
+
+2) ECC Engine:
+==============
+
+Required BCH properties:
+- compatible:  Should be "mediatek,mtxxxx-ecc".
+- reg:         Base physical address and size of ECC.
+- interrupts:  Interrupts of ECC.
+- clocks:      ECC required clocks.
+- clock-names: Internal names of the ECC clocks.
+- status:      Disabled by default; set to "okay" by the platform.
+
+Example:
+
+       bch: ecc@1100e000 {
+               compatible = "mediatek,mt2701-ecc";
+               reg = <0 0x1100e000 0 0x1000>;
+               interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_LOW>;
+               clocks = <&pericfg CLK_PERI_NFI_ECC>;
+               clock-names = "nfiecc_clk";
+               status = "disabled";
+       };
index 086d6f4..f322f56 100644 (file)
@@ -11,10 +11,16 @@ Required properties:
     * "ahb" : AHB gating clock
     * "mod" : nand controller clock
 
+Optional properties:
+- dmas : shall reference the DMA channel associated with the NAND controller.
+- dma-names : shall be "rxtx".
+
 Optional children nodes:
 Children nodes represent the available nand chips.
 
 Optional properties:
+- reset : phandle + reset specifier pair
+- reset-names : must contain "ahb"
 - allwinner,rb : shall contain the native Ready/Busy ids.
  or
 - rb-gpios : shall contain the gpios used as R/B pins.
diff --git a/Documentation/devicetree/bindings/pci/aardvark-pci.txt b/Documentation/devicetree/bindings/pci/aardvark-pci.txt
new file mode 100644 (file)
index 0000000..bbcd9f4
--- /dev/null
@@ -0,0 +1,56 @@
+Aardvark PCIe controller
+
+This PCIe controller is used on the Marvell Armada 3700 ARM64 SoC.
+
+The Device Tree node describing an Aardvark PCIe controller must
+contain the following properties:
+
+ - compatible: Should be "marvell,armada-3700-pcie"
+ - reg: range of registers for the PCIe controller
+ - interrupts: the interrupt line of the PCIe controller
+ - #address-cells: set to <3>
+ - #size-cells: set to <2>
+ - device_type: set to "pci"
+ - ranges: ranges for the PCI memory and I/O regions
+ - #interrupt-cells: set to <1>
+ - msi-controller: indicates that the PCIe controller can itself
+   handle MSI interrupts
+ - msi-parent: pointer to the MSI controller to be used
+ - interrupt-map-mask and interrupt-map: standard PCI properties to
+   define the mapping of the PCIe interface to interrupt numbers.
+ - bus-range: PCI bus numbers covered
+
+In addition, the Device Tree describing an Aardvark PCIe controller
+must include a sub-node that describes the legacy interrupt controller
+built into the PCIe controller. This sub-node must have the following
+properties:
+
+ - interrupt-controller
+ - #interrupt-cells: set to <1>
+
+Example:
+
+       pcie0: pcie@d0070000 {
+               compatible = "marvell,armada-3700-pcie";
+               device_type = "pci";
+               status = "disabled";
+               reg = <0 0xd0070000 0 0x20000>;
+               #address-cells = <3>;
+               #size-cells = <2>;
+               bus-range = <0x00 0xff>;
+               interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>;
+               #interrupt-cells = <1>;
+               msi-controller;
+               msi-parent = <&pcie0>;
+               ranges = <0x82000000 0 0xe8000000   0 0xe8000000 0 0x1000000 /* Port 0 MEM */
+                         0x81000000 0 0xe9000000   0 0xe9000000 0 0x10000>; /* Port 0 IO*/
+               interrupt-map-mask = <0 0 0 7>;
+               interrupt-map = <0 0 0 1 &pcie_intc 0>,
+                               <0 0 0 2 &pcie_intc 1>,
+                               <0 0 0 3 &pcie_intc 2>,
+                               <0 0 0 4 &pcie_intc 3>;
+               pcie_intc: interrupt-controller {
+                       interrupt-controller;
+                       #interrupt-cells = <1>;
+               };
+       };
diff --git a/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt b/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt
new file mode 100644 (file)
index 0000000..330a45b
--- /dev/null
@@ -0,0 +1,46 @@
+* Axis ARTPEC-6 PCIe interface
+
+This PCIe host controller is based on the Synopsys DesignWare PCIe IP
+and thus inherits all the common properties defined in designware-pcie.txt.
+
+Required properties:
+- compatible: "axis,artpec6-pcie", "snps,dw-pcie"
+- reg: base addresses and lengths of the PCIe controller (DBI),
+       the phy controller, and configuration address space.
+- reg-names: Must include the following entries:
+       - "dbi"
+       - "phy"
+       - "config"
+- interrupts: A list of interrupt outputs of the controller. Must contain an
+  entry for each entry in the interrupt-names property.
+- interrupt-names: Must include the following entries:
+       - "msi": The interrupt that is asserted when an MSI is received
+- axis,syscon-pcie: A phandle pointing to the ARTPEC-6 system controller,
+       used to enable and control the Synopsys IP.
+
+Example:
+
+       pcie@f8050000 {
+               compatible = "axis,artpec6-pcie", "snps,dw-pcie";
+               reg = <0xf8050000 0x2000
+                      0xf8040000 0x1000
+                      0xc0000000 0x1000>;
+               reg-names = "dbi", "phy", "config";
+               #address-cells = <3>;
+               #size-cells = <2>;
+               device_type = "pci";
+                         /* downstream I/O */
+               ranges = <0x81000000 0 0x00010000 0xc0010000 0 0x00010000
+                         /* non-prefetchable memory */
+                         0x82000000 0 0xc0020000 0xc0020000 0 0x1ffe0000>;
+               num-lanes = <2>;
+               interrupts = <GIC_SPI 148 IRQ_TYPE_LEVEL_HIGH>;
+               interrupt-names = "msi";
+               #interrupt-cells = <1>;
+               interrupt-map-mask = <0 0 0 0x7>;
+               interrupt-map = <0 0 0 1 &intc GIC_SPI 144 IRQ_TYPE_LEVEL_HIGH>,
+                               <0 0 0 2 &intc GIC_SPI 145 IRQ_TYPE_LEVEL_HIGH>,
+                               <0 0 0 3 &intc GIC_SPI 146 IRQ_TYPE_LEVEL_HIGH>,
+                               <0 0 0 4 &intc GIC_SPI 147 IRQ_TYPE_LEVEL_HIGH>;
+               axis,syscon-pcie = <&syscon>;
+       };
index 8ea834f..5385cba 100644 (file)
@@ -3,6 +3,7 @@
 *.bc
 *.bin
 *.bz2
+*.c.[012]*.*
 *.cis
 *.cpio
 *.csp
index e1a0056..1dfdec7 100644 (file)
@@ -281,7 +281,7 @@ on the wait queue and one attempt is made to recycle them. Obviously,
 if the client-core stays dead too long, the arbitrary userspace processes
 trying to use Orangefs will be negatively affected. Waiting ops
 that can't be serviced will be removed from the request list and
-have their states set to "given up". In-progress ops that can't 
+have their states set to "given up". In-progress ops that can't
 be serviced will be removed from the in_progress hash table and
 have their states set to "given up".
 
@@ -338,7 +338,7 @@ particular response.
   PVFS2_VFS_OP_STATFS
     fill a pvfs2_statfs_response_t with useless info <g>. It is hard for
     us to know, in a timely fashion, these statistics about our
-    distributed network filesystem. 
+    distributed network filesystem.
 
   PVFS2_VFS_OP_FS_MOUNT
     fill a pvfs2_fs_mount_response_t which is just like a PVFS_object_kref
@@ -386,7 +386,7 @@ responses:
 
   io_array[1].iov_base = address of global variable "pdev_magic" (int32_t)
   io_array[1].iov_len = sizeof(int32_t)
-  
+
   io_array[2].iov_base = address of parameter "tag" (PVFS_id_gen_t)
   io_array[2].iov_len = sizeof(int64_t)
 
@@ -402,5 +402,47 @@ Readdir responses initialize the fifth element io_array like this:
   io_array[4].iov_len = contents of member trailer_size (PVFS_size)
                         from out_downcall member of global variable
                         vfs_request
-  
+
+Orangefs exploits the dcache in order to avoid sending redundant
+requests to userspace. We keep object inode attributes up-to-date with
+orangefs_inode_getattr. Orangefs_inode_getattr uses two arguments to
+help it decide whether or not to update an inode: "new" and "bypass".
+Orangefs keeps private data in an object's inode that includes a short
+timeout value, getattr_time, which allows any iteration of
+orangefs_inode_getattr to know how long it has been since the inode was
+updated. When the object is not new (new == 0) and the bypass flag is not
+set (bypass == 0) orangefs_inode_getattr returns without updating the inode
+if getattr_time has not timed out. Getattr_time is updated each time the
+inode is updated.
+
+Creation of a new object (file, dir, sym-link) includes the evaluation of
+its pathname, resulting in a negative directory entry for the object.
+A new inode is allocated and associated with the dentry, turning it from
+a negative dentry into a "productive full member of society". Orangefs
+obtains the new inode from Linux with new_inode() and associates
+the inode with the dentry by sending the pair back to Linux with
+d_instantiate().
+
+The evaluation of a pathname for an object resolves to its corresponding
+dentry. If there is no corresponding dentry, one is created for it in
+the dcache. Whenever a dentry is modified or verified Orangefs stores a
+short timeout value in the dentry's d_time, and the dentry will be trusted
+for that amount of time. Orangefs is a network filesystem, and objects
+can potentially change out-of-band with any particular Orangefs kernel module
+instance, so trusting a dentry is risky. The alternative to trusting
+dentries is to always obtain the needed information from userspace - at
+least a trip to the client-core, maybe to the servers. Obtaining information
+from a dentry is cheap; obtaining it from userspace is relatively expensive,
+hence the motivation to use the dentry when possible.
+
+The timeout values d_time and getattr_time are jiffy based, and the
+code is designed to avoid the jiffy-wrap problem:
+
+"In general, if the clock may have wrapped around more than once, there
+is no way to tell how much time has elapsed. However, if the times t1
+and t2 are known to be fairly close, we can reliably compute the
+difference in a way that takes into account the possibility that the
+clock may have wrapped between times."
+
+                      from course notes by instructor Andy Wang
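+
+As an illustration only (this is not a copy of the Orangefs code), such a
+wrap-safe timeout check can be written with the time_after() helper from
+<linux/jiffies.h>, assuming getattr_time holds "jiffies at the last update
+plus the timeout":
+
+  #include <linux/types.h>
+  #include <linux/jiffies.h>
+
+  /* Returns true once the cached attributes have aged out.  time_after()
+   * compares with signed arithmetic, so it stays correct across a wrap. */
+  static inline bool example_getattr_expired(unsigned long getattr_time)
+  {
+          return time_after(jiffies, getattr_time);
+  }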
 
diff --git a/Documentation/gcc-plugins.txt b/Documentation/gcc-plugins.txt
new file mode 100644 (file)
index 0000000..891c694
--- /dev/null
@@ -0,0 +1,87 @@
+GCC plugin infrastructure
+=========================
+
+
+1. Introduction
+===============
+
+GCC plugins are loadable modules that provide extra features to the
+compiler [1]. They are useful for runtime instrumentation and static analysis.
+We can analyse, change and add further code during compilation via
+callbacks [2], GIMPLE [3], IPA [4] and RTL passes [5].
+
+The GCC plugin infrastructure of the kernel supports all gcc versions from
+4.5 to 6.0, building out-of-tree modules, cross-compilation and building in a
+separate directory.
+Plugin source files have to be compilable by both a C and a C++ compiler
+because gcc versions 4.5 and 4.6 are compiled by a C compiler,
+gcc-4.7 can be compiled by either a C or a C++ compiler,
+and versions 4.8+ can only be compiled by a C++ compiler.
+
+Currently the GCC plugin infrastructure supports only the x86, arm and arm64
+architectures.
+
+This infrastructure was ported from grsecurity [6] and PaX [7].
+
+--
+[1] https://gcc.gnu.org/onlinedocs/gccint/Plugins.html
+[2] https://gcc.gnu.org/onlinedocs/gccint/Plugin-API.html#Plugin-API
+[3] https://gcc.gnu.org/onlinedocs/gccint/GIMPLE.html
+[4] https://gcc.gnu.org/onlinedocs/gccint/IPA.html
+[5] https://gcc.gnu.org/onlinedocs/gccint/RTL.html
+[6] https://grsecurity.net/
+[7] https://pax.grsecurity.net/
+
+
+2. Files
+========
+
+$(src)/scripts/gcc-plugins
+       This is the directory of the GCC plugins.
+
+$(src)/scripts/gcc-plugins/gcc-common.h
+       This is a compatibility header for GCC plugins.
+	It should always be included instead of individual gcc headers.
+
+$(src)/scripts/gcc-plugin.sh
+       This script checks the availability of the included headers in
+       gcc-common.h and chooses the proper host compiler to build the plugins
+       (gcc-4.7 can be built by either gcc or g++).
+
+$(src)/scripts/gcc-plugins/gcc-generate-gimple-pass.h
+$(src)/scripts/gcc-plugins/gcc-generate-ipa-pass.h
+$(src)/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h
+$(src)/scripts/gcc-plugins/gcc-generate-rtl-pass.h
+       These headers automatically generate the registration structures for
+       GIMPLE, SIMPLE_IPA, IPA and RTL passes. They support all gcc versions
+       from 4.5 to 6.0.
+       They should be preferred to creating the structures by hand.
+
+
+3. Usage
+========
+
+You must install the gcc plugin headers for your gcc version,
+e.g., on Ubuntu for gcc-4.9:
+
+       apt-get install gcc-4.9-plugin-dev
+
+Enable a GCC plugin based feature in the kernel config:
+
+	CONFIG_GCC_PLUGIN_CYC_COMPLEXITY=y
+
+To compile only the plugin(s):
+
+       make gcc-plugins
+
+or just run the kernel make and compile the whole kernel with
+the cyclomatic complexity GCC plugin.
+
+
+4. How to add a new GCC plugin
+==============================
+
+The GCC plugins are in $(src)/scripts/gcc-plugins/. You can use a file or a directory
+here. It must be added to $(src)/scripts/gcc-plugins/Makefile,
+$(src)/scripts/Makefile.gcc-plugins and $(src)/arch/Kconfig.
+See the cyc_complexity_plugin.c (CONFIG_GCC_PLUGIN_CYC_COMPLEXITY) GCC plugin.
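+
+For orientation, a minimal plugin skeleton could look like the sketch below.
+This is illustrative only: everything named example_* is made up, while
+plugin_is_GPL_compatible, plugin_init() and register_callback() come from
+the GCC plugin API [1][2].
+
+	#include "gcc-common.h"
+
+	int plugin_is_GPL_compatible;
+
+	static struct plugin_info example_plugin_info = {
+		.version	= "1",
+		.help		= "example_plugin: does nothing\n",
+	};
+
+	int plugin_init(struct plugin_name_args *plugin_info,
+			struct plugin_gcc_version *version)
+	{
+		const char * const plugin_name = plugin_info->base_name;
+
+		/* Refuse to load into a gcc we were not built against. */
+		if (!plugin_default_version_check(version, &gcc_version))
+			return 1;
+
+		/* Register the plugin info only; passes would be added here,
+		 * typically via the gcc-generate-*-pass.h headers above. */
+		register_callback(plugin_name, PLUGIN_INFO, NULL,
+				  &example_plugin_info);
+		return 0;
+	}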
index b240540..00e4c2f 100644 (file)
@@ -3021,6 +3021,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                resource_alignment=
                                Format:
                                [<order of align>@][<domain>:]<bus>:<slot>.<func>[; ...]
+                               [<order of align>@]pci:<vendor>:<device>\
+                                               [:<subvendor>:<subdevice>][; ...]
                                Specifies alignment and device to reassign
                                aligned memory resources.
                                If <order of align> is not specified,
@@ -3039,6 +3041,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                hpmemsize=nn[KMG]       The fixed amount of bus space which is
                                reserved for hotplug bridge's memory window.
                                Default size is 2 megabytes.
+               hpbussize=nn    The minimum amount of additional bus numbers
+                               reserved for buses below a hotplug bridge.
+                               Default is 1.
                realloc=        Enable/disable reallocating PCI bridge resources
                                if allocations done by BIOS are too small to
                                accommodate resources required by all child
@@ -3070,6 +3075,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                compat  Treat PCIe ports as PCI-to-PCI bridges, disable the PCIe
                        ports driver.
 
+       pcie_port_pm=   [PCIE] PCIe port power management handling:
+               off     Disable power management of all PCIe ports
+               force   Forcibly enable power management of all PCIe ports
+
        pcie_pme=       [PCIE,PM] Native PCIe PME signaling options:
                nomsi   Do not use MSI for native PCIe PME signaling (this makes
                        all PCIe root ports use INTx for all services).
index a4482cc..5237e1b 100644 (file)
@@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi {
        __u32 pad;
 };
 
+On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
+feature of KVM_CAP_X2APIC_API capability is enabled.  If it is enabled,
+address_hi bits 31-8 provide bits 31-8 of the destination id.  Bits 7-0 of
+address_hi must be zero.
+
 struct kvm_irq_routing_s390_adapter {
        __u64 ind_addr;
        __u64 summary_addr;
@@ -1583,6 +1588,17 @@ struct kvm_lapic_state {
 Reads the Local APIC registers and copies them into the input argument.  The
 data format and layout are the same as documented in the architecture manual.
 
+If the KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is
+enabled, then the format of the APIC_ID register depends on the APIC mode
+(reported by MSR_IA32_APICBASE) of its VCPU.  x2APIC stores APIC ID in
+the APIC_ID register (bytes 32-35).  xAPIC only allows an 8-bit APIC ID
+which is stored in bits 31-24 of the APIC register, or equivalently in
+byte 35 of struct kvm_lapic_state's regs field.  KVM_GET_LAPIC must then
+be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR.
+
+If the KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state
+always uses xAPIC format.
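+
+For illustration, a userspace helper that pulls the APIC ID out of the regs
+field under both formats might look like the sketch below (the helper name
+is made up; only struct kvm_lapic_state and the byte offsets come from the
+ABI described above):
+
+#include <stdbool.h>
+#include <string.h>
+#include <linux/kvm.h>
+
+static __u32 lapic_state_apic_id(const struct kvm_lapic_state *s,
+				 bool x2apic_ids)
+{
+	__u32 reg;
+
+	memcpy(&reg, &s->regs[32], sizeof(reg));   /* APIC_ID, bytes 32-35 */
+	return x2apic_ids ? reg : reg >> 24;       /* xAPIC: ID in bits 31-24 */
+}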
+
 
 4.58 KVM_SET_LAPIC
 
@@ -1600,6 +1616,10 @@ struct kvm_lapic_state {
 Copies the input argument into the Local APIC registers.  The data format
 and layout are the same as documented in the architecture manual.
 
+The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's
+regs field) depends on the state of the KVM_CAP_X2APIC_API capability.
+See the note in KVM_GET_LAPIC.
+
 
 4.59 KVM_IOEVENTFD
 
@@ -2032,6 +2052,12 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_CONFIG5      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG7      | 32
   MIPS  | KVM_REG_MIPS_CP0_ERROREPC     | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH1    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH2    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH3    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH4    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH5    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH6    | 64
   MIPS  | KVM_REG_MIPS_COUNT_CTL        | 64
   MIPS  | KVM_REG_MIPS_COUNT_RESUME     | 64
   MIPS  | KVM_REG_MIPS_COUNT_HZ         | 64
@@ -2156,7 +2182,7 @@ after pausing the vcpu, but before it is resumed.
 4.71 KVM_SIGNAL_MSI
 
 Capability: KVM_CAP_SIGNAL_MSI
-Architectures: x86
+Architectures: x86 arm64
 Type: vm ioctl
 Parameters: struct kvm_msi (in)
 Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
@@ -2169,10 +2195,22 @@ struct kvm_msi {
        __u32 address_hi;
        __u32 data;
        __u32 flags;
-       __u8  pad[16];
+       __u32 devid;
+       __u8  pad[12];
 };
 
-No flags are defined so far. The corresponding field must be 0.
+flags: KVM_MSI_VALID_DEVID: devid contains a valid value
+devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier
+       for the device that wrote the MSI message.
+       For PCI, this is usually a BDF identifier in the lower 16 bits.
+
+The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide
+the device ID. If this capability is not set, userland cannot rely on
+the kernel allowing the KVM_MSI_VALID_DEVID flag to be set.
+
+On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is
+enabled.  If it is enabled, address_hi bits 31-8 provide bits 31-8 of the
+destination id.  Bits 7-0 of address_hi must be zero.
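+
+As a hedged example (vm_fd, the doorbell address, the event ID and the
+requester ID below are all placeholders), an MSI carrying a device ID could
+be injected like this:
+
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+static int signal_msi_with_devid(int vm_fd, __u64 doorbell, __u32 event_id,
+				 __u32 requester_id)
+{
+	struct kvm_msi msi = {
+		.address_lo = (__u32)doorbell,
+		.address_hi = (__u32)(doorbell >> 32),
+		.data = event_id,
+		.flags = KVM_MSI_VALID_DEVID,
+		.devid = requester_id,	/* e.g. the PCI BDF of the device */
+	};
+
+	return ioctl(vm_fd, KVM_SIGNAL_MSI, &msi);
+}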
 
 
 4.71 KVM_CREATE_PIT2
@@ -2520,6 +2558,7 @@ Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
   ENXIO:  The group or attribute is unknown/unsupported for this device
+          or hardware support is missing.
   EPERM:  The attribute cannot (currently) be accessed this way
           (e.g. read-only attribute, or attribute that only makes
           sense when the device is in a different state)
@@ -2547,6 +2586,7 @@ Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
   ENXIO:  The group or attribute is unknown/unsupported for this device
+          or hardware support is missing.
 
 Tests whether a device supports a particular attribute.  A successful
 return indicates the attribute is implemented.  It does not necessarily
@@ -3803,6 +3843,42 @@ Allows use of runtime-instrumentation introduced with zEC12 processor.
 Will return -EINVAL if the machine does not support runtime-instrumentation.
 Will return -EBUSY if a VCPU has already been created.
 
+7.7 KVM_CAP_X2APIC_API
+
+Architectures: x86
+Parameters: args[0] - features that should be enabled
+Returns: 0 on success, -EINVAL when args[0] contains invalid features
+
+Valid feature flags in args[0] are
+
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
+
+Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
+KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
+allowing the use of 32-bit APIC IDs.  See KVM_CAP_X2APIC_API in their
+respective sections.
+
+KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work
+in logical mode or with more than 255 VCPUs.  Otherwise, KVM treats 0xff
+as a broadcast even in x2APIC mode in order to support physical x2APIC
+without interrupt remapping.  This is undesirable in logical mode,
+where 0xff represents CPUs 0-7 in cluster 0.
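+
+A minimal sketch of enabling both features from userspace (vm_fd is assumed
+to be a VM file descriptor returned by KVM_CREATE_VM; error handling
+omitted):
+
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+static int enable_x2apic_api(int vm_fd)
+{
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_X2APIC_API,
+		.args[0] = KVM_X2APIC_API_USE_32BIT_IDS |
+			   KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK,
+	};
+
+	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
+}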
+
+7.8 KVM_CAP_S390_USER_INSTR0
+
+Architectures: s390
+Parameters: none
+
+With this capability enabled, all occurrences of the illegal instruction
+0x0000 (2 bytes) will be intercepted and forwarded to user space. User space
+can use this mechanism e.g. to realize 2-byte software breakpoints. The
+kernel will not inject an operation exception for these instructions; user
+space has to take care of that.
+
+This capability can be enabled dynamically even if VCPUs were already
+created and are running.
+
 8. Other capabilities.
 ----------------------
 
index 59541d4..89182f8 100644 (file)
@@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC)
 Device types supported:
   KVM_DEV_TYPE_ARM_VGIC_V2     ARM Generic Interrupt Controller v2.0
   KVM_DEV_TYPE_ARM_VGIC_V3     ARM Generic Interrupt Controller v3.0
+  KVM_DEV_TYPE_ARM_VGIC_ITS    ARM Interrupt Translation Service Controller
 
-Only one VGIC instance may be instantiated through either this API or the
-legacy KVM_CREATE_IRQCHIP api.  The created VGIC will act as the VM interrupt
-controller, requiring emulated user-space devices to inject interrupts to the
-VGIC instead of directly to CPUs.
+Only one VGIC instance of the V2/V3 types above may be instantiated through
+either this API or the legacy KVM_CREATE_IRQCHIP api.  The created VGIC will
+act as the VM interrupt controller, requiring emulated user-space devices to
+inject interrupts to the VGIC instead of directly to CPUs.
 
 Creating a guest GICv3 device requires a host GICv3 as well.
 GICv3 implementations with hardware compatibility support allow a guest GICv2
 as well.
 
+Creating a virtual ITS controller requires a host GICv3 (but does not depend
+on having physical ITS controllers).
+There can be multiple ITS controllers per guest; each of them has to have
+a separate, non-overlapping MMIO region.
+
 Groups:
   KVM_DEV_ARM_VGIC_GRP_ADDR
   Attributes:
@@ -39,6 +45,13 @@ Groups:
       Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
       This address needs to be 64K aligned.
 
+    KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3 ITS
+      control register frame. The ITS allows MSI(-X) interrupts to be
+      injected into guests. This extension is optional. If the kernel
+      does not support the ITS, the call returns -ENODEV.
+      Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS.
+      This address needs to be 64K aligned and the region covers 128K.
 
   KVM_DEV_ARM_VGIC_GRP_DIST_REGS
   Attributes:
@@ -109,8 +122,8 @@ Groups:
   KVM_DEV_ARM_VGIC_GRP_CTRL
   Attributes:
     KVM_DEV_ARM_VGIC_CTRL_INIT
-      request the initialization of the VGIC, no additional parameter in
-      kvm_device_attr.addr.
+      request the initialization of the VGIC or ITS, no additional parameter
+      in kvm_device_attr.addr.
   Errors:
     -ENXIO: VGIC not properly configured as required prior to calling
      this attribute
index a9ea877..b6cda49 100644 (file)
@@ -20,7 +20,8 @@ Enables Collaborative Memory Management Assist (CMMA) for the virtual machine.
 
 1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA
 Parameters: none
-Returns: 0
+Returns: -EINVAL if CMMA was not enabled
+         0 otherwise
 
 Clear the CMMA status for all guest pages, so any pages the guest marked
 as unused are again used and may not be reclaimed by the host.
@@ -85,6 +86,90 @@ Returns:    -EBUSY in case 1 or more vcpus are already activated (only in write
            -ENOMEM if not enough memory is available to process the ioctl
            0 in case of success
 
+2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o)
+
+Allows user space to retrieve available cpu features. A feature is available if
+provided by the hardware and supported by kvm. In theory, cpu features could
+even be completely emulated by kvm.
+
+struct kvm_s390_vm_cpu_feat {
+        __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering
+};
+
+Parameters: address of a buffer to load the feature list from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+           0 in case of success.
+
+2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w)
+
+Allows user space to retrieve or change enabled cpu features for all VCPUs of a
+VM. Features that are not available cannot be enabled.
+
+See 2.3. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the feature list from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+           -EINVAL if a cpu feature that is not available is to be enabled.
+           -EBUSY if at least one VCPU has already been defined.
+           0 in case of success.
+
+2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o)
+
+Allows user space to retrieve available cpu subfunctions without any filtering
+done by a set IBC. These subfunctions are indicated to the guest VCPU via
+query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff.
+
+A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the
+STFL(E) bit introducing the affected instruction. If the affected instruction
+indicates subfunctions via a "query subfunction", the response block is
+contained in the returned struct. If the affected instruction
+indicates subfunctions via a "test bit" mechanism, the subfunction codes are
+contained in the returned struct in MSB 0 bit numbering.
+
+struct kvm_s390_vm_cpu_subfunc {
+       u8 plo[32];           # always valid (ESA/390 feature)
+       u8 ptff[16];          # valid with TOD-clock steering
+       u8 kmac[16];          # valid with Message-Security-Assist
+       u8 kmc[16];           # valid with Message-Security-Assist
+       u8 km[16];            # valid with Message-Security-Assist
+       u8 kimd[16];          # valid with Message-Security-Assist
+       u8 klmd[16];          # valid with Message-Security-Assist
+       u8 pckmo[16];         # valid with Message-Security-Assist-Extension 3
+       u8 kmctr[16];         # valid with Message-Security-Assist-Extension 4
+       u8 kmf[16];           # valid with Message-Security-Assist-Extension 4
+       u8 kmo[16];           # valid with Message-Security-Assist-Extension 4
+       u8 pcc[16];           # valid with Message-Security-Assist-Extension 4
+       u8 ppno[16];          # valid with Message-Security-Assist-Extension 5
+       u8 reserved[1824];    # reserved for future instructions
+};
+
+Parameters: address of a buffer to load the subfunction blocks from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+           0 in case of success.
+
+2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w)
+
+Allows user space to retrieve or change cpu subfunctions to be indicated for
+all VCPUs of a VM. This attribute will only be available if kernel and
+hardware support are in place.
+
+The kernel uses the configured subfunction blocks for indication to
+the guest. A subfunction block will only be used if the associated STFL(E) bit
+has not been disabled by user space (so the instruction to be queried is
+actually available for the guest).
+
+As long as no data has been written, a read will fail. The IBC will be used
+to determine available subfunctions in this case; this guarantees backward
+compatibility.
+
+See 2.5. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the subfunction blocks from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+           -EINVAL when reading, if there was no write yet.
+           -EBUSY if at least one VCPU has already been defined.
+           0 in case of success.
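+
+As a sketch only (the helper name and vm_fd are illustrative), the machine
+feature bitmap from 2.3 can be retrieved on the VM file descriptor with the
+KVM_GET_DEVICE_ATTR ioctl:
+
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+static int get_machine_feat(int vm_fd, struct kvm_s390_vm_cpu_feat *feat)
+{
+	struct kvm_device_attr attr = {
+		.group = KVM_S390_VM_CPU_MODEL,
+		.attr  = KVM_S390_VM_CPU_MACHINE_FEAT,
+		.addr  = (__u64)(unsigned long)feat,
+	};
+
+	return ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
+}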
+
 3. GROUP: KVM_S390_VM_TOD
 Architectures: s390
 
index 19f94a6..f2491a8 100644 (file)
@@ -89,7 +89,7 @@ In mmu_spte_clear_track_bits():
    old_spte = *spte;
 
    /* 'if' condition is satisfied. */
-   if (old_spte.Accssed == 1 &&
+   if (old_spte.Accessed == 1 &&
         old_spte.W == 0)
       spte = 0ull;
                                          on fast page fault path:
@@ -102,7 +102,7 @@ In mmu_spte_clear_track_bits():
       old_spte = xchg(spte, 0ull)
 
 
-   if (old_spte.Accssed == 1)
+   if (old_spte.Accessed == 1)
       kvm_set_pfn_accessed(spte.pfn);
    if (old_spte.Dirty == 1)
       kvm_set_pfn_dirty(spte.pfn);
index ce38536..429fc61 100644 (file)
@@ -5102,6 +5102,15 @@ L:       linux-scsi@vger.kernel.org
 S:     Odd Fixes (e.g., new signatures)
 F:     drivers/scsi/fdomain.*
 
+GCC PLUGINS
+M:     Kees Cook <keescook@chromium.org>
+R:     Emese Revfy <re.emese@gmail.com>
+L:     kernel-hardening@lists.openwall.com
+S:     Maintained
+F:     scripts/gcc-plugins/
+F:     scripts/gcc-plugin.sh
+F:     Documentation/gcc-plugins.txt
+
 GCOV BASED KERNEL PROFILING
 M:     Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
 S:     Maintained
@@ -8883,6 +8892,7 @@ L:        linux-pci@vger.kernel.org
 Q:     http://patchwork.ozlabs.org/project/linux-pci/list/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git
 S:     Supported
+F:     Documentation/devicetree/bindings/pci/
 F:     Documentation/PCI/
 F:     drivers/pci/
 F:     include/linux/pci*
@@ -8946,6 +8956,13 @@ L:       linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     drivers/pci/host/*mvebu*
 
+PCI DRIVER FOR AARDVARK (Marvell Armada 3700)
+M:     Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+L:     linux-pci@vger.kernel.org
+L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+S:     Maintained
+F:     drivers/pci/host/pci-aardvark.c
+
 PCI DRIVER FOR NVIDIA TEGRA
 M:     Thierry Reding <thierry.reding@gmail.com>
 L:     linux-tegra@vger.kernel.org
@@ -9028,6 +9045,15 @@ S:       Maintained
 F:     Documentation/devicetree/bindings/pci/xgene-pci-msi.txt
 F:     drivers/pci/host/pci-xgene-msi.c
 
+PCIE DRIVER FOR AXIS ARTPEC
+M:     Niklas Cassel <niklas.cassel@axis.com>
+M:     Jesper Nilsson <jesper.nilsson@axis.com>
+L:     linux-arm-kernel@axis.com
+L:     linux-pci@vger.kernel.org
+S:     Maintained
+F:     Documentation/devicetree/bindings/pci/axis,artpec*
+F:     drivers/pci/host/*artpec*
+
 PCIE DRIVER FOR HISILICON
 M:     Zhou Wang <wangzhou1@hisilicon.com>
 M:     Gabriele Paoloni <gabriele.paoloni@huawei.com>
index 393b615..d6d401b 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -371,26 +371,27 @@ CFLAGS_KERNEL     =
 AFLAGS_KERNEL  =
 LDFLAGS_vmlinux =
 CFLAGS_GCOV    = -fprofile-arcs -ftest-coverage -fno-tree-loop-im
-CFLAGS_KCOV    = -fsanitize-coverage=trace-pc
+CFLAGS_KCOV    := $(call cc-option,-fsanitize-coverage=trace-pc,)
 
 
 # Use USERINCLUDE when you must reference the UAPI directories only.
 USERINCLUDE    := \
                -I$(srctree)/arch/$(hdr-arch)/include/uapi \
-               -Iarch/$(hdr-arch)/include/generated/uapi \
+               -I$(objtree)/arch/$(hdr-arch)/include/generated/uapi \
                -I$(srctree)/include/uapi \
-               -Iinclude/generated/uapi \
+               -I$(objtree)/include/generated/uapi \
                 -include $(srctree)/include/linux/kconfig.h
 
 # Use LINUXINCLUDE when you must reference the include/ directory.
 # Needed to be compatible with the O= option
 LINUXINCLUDE    := \
                -I$(srctree)/arch/$(hdr-arch)/include \
-               -Iarch/$(hdr-arch)/include/generated/uapi \
-               -Iarch/$(hdr-arch)/include/generated \
+               -I$(objtree)/arch/$(hdr-arch)/include/generated/uapi \
+               -I$(objtree)/arch/$(hdr-arch)/include/generated \
                $(if $(KBUILD_SRC), -I$(srctree)/include) \
-               -Iinclude \
-               $(USERINCLUDE)
+               -I$(objtree)/include
+
+LINUXINCLUDE   += $(filter-out $(LINUXINCLUDE),$(USERINCLUDE))
 
 KBUILD_CPPFLAGS := -D__KERNEL__
 
@@ -554,7 +555,7 @@ ifeq ($(KBUILD_EXTMOD),)
 # in parallel
 PHONY += scripts
 scripts: scripts_basic include/config/auto.conf include/config/tristate.conf \
-        asm-generic
+        asm-generic gcc-plugins
        $(Q)$(MAKE) $(build)=$(@)
 
 # Objects we will link into vmlinux / subdirs we need to visit
@@ -635,6 +636,15 @@ endif
 # Tell gcc to never replace conditional load with a non-conditional one
 KBUILD_CFLAGS  += $(call cc-option,--param=allow-store-data-races=0)
 
+PHONY += gcc-plugins
+gcc-plugins: scripts_basic
+ifdef CONFIG_GCC_PLUGINS
+       $(Q)$(MAKE) $(build)=scripts/gcc-plugins
+endif
+       @:
+
+include scripts/Makefile.gcc-plugins
+
 ifdef CONFIG_READABLE_ASM
 # Disable optimizations that make assembler listings hard to read.
 # reorder blocks reorders the control in the function
@@ -666,21 +676,11 @@ endif
 endif
 # Find arch-specific stack protector compiler sanity-checking script.
 ifdef CONFIG_CC_STACKPROTECTOR
-  stackp-path := $(srctree)/scripts/gcc-$(ARCH)_$(BITS)-has-stack-protector.sh
-  ifneq ($(wildcard $(stackp-path)),)
-    stackp-check := $(stackp-path)
-  endif
+  stackp-path := $(srctree)/scripts/gcc-$(SRCARCH)_$(BITS)-has-stack-protector.sh
+  stackp-check := $(wildcard $(stackp-path))
 endif
 KBUILD_CFLAGS += $(stackp-flag)
 
-ifdef CONFIG_KCOV
-  ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
-    $(warning Cannot use CONFIG_KCOV: \
-             -fsanitize-coverage=trace-pc is not supported by compiler)
-    CFLAGS_KCOV =
-  endif
-endif
-
 ifeq ($(cc-name),clang)
 KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
 KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
@@ -1019,7 +1019,7 @@ prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
 
 archprepare: archheaders archscripts prepare1 scripts_basic
 
-prepare0: archprepare
+prepare0: archprepare gcc-plugins
        $(Q)$(MAKE) $(build)=.
 
 # All the preparing..
@@ -1531,6 +1531,7 @@ clean: $(clean-dirs)
                -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
                -o -name '*.symtypes' -o -name 'modules.order' \
                -o -name modules.builtin -o -name '.tmp_*.o.*' \
+               -o -name '*.c.[012]*.*' \
                -o -name '*.gcno' \) -type f -print | xargs rm -f
 
 # Generate tags for editors
@@ -1641,7 +1642,7 @@ endif
        $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
        $(build)=$(build-dir)
 # Make sure the latest headers are built for Documentation
-Documentation/: headers_install
+Documentation/ samples/: headers_install
 %/: prepare scripts FORCE
        $(cmd_crmodverdir)
        $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
index 1599629..bd8056b 100644 (file)
@@ -357,6 +357,43 @@ config SECCOMP_FILTER
 
          See Documentation/prctl/seccomp_filter.txt for details.
 
+config HAVE_GCC_PLUGINS
+       bool
+       help
+         An arch should select this symbol if it supports building with
+         GCC plugins.
+
+menuconfig GCC_PLUGINS
+       bool "GCC plugins"
+       depends on HAVE_GCC_PLUGINS
+       depends on !COMPILE_TEST
+       help
+         GCC plugins are loadable modules that provide extra features to the
+         compiler. They are useful for runtime instrumentation and static analysis.
+
+         See Documentation/gcc-plugins.txt for details.
+
+config GCC_PLUGIN_CYC_COMPLEXITY
+       bool "Compute the cyclomatic complexity of a function"
+       depends on GCC_PLUGINS
+       help
+         The complexity M of a function's control flow graph is defined as:
+          M = E - N + 2P
+         where
+
+         E = the number of edges
+         N = the number of nodes
+         P = the number of connected components (exit nodes).
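+
+	  For example, a function whose body is a single if/else has
+	  E = 4 edges, N = 4 nodes and P = 1 connected component, giving
+	  M = 4 - 4 + 2*1 = 2.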
+
+config GCC_PLUGIN_SANCOV
+       bool
+       depends on GCC_PLUGINS
+       help
+         This plugin inserts a __sanitizer_cov_trace_pc() call at the start of
+         basic blocks. It supports all gcc versions with plugin support (from
+         gcc-4.5 on). It is based on the commit "Add fuzzing coverage support"
+         by Dmitry Vyukov <dvyukov@google.com>.
+
 config HAVE_CC_STACKPROTECTOR
        bool
        help
index 8399bd0..0cbe4c5 100644 (file)
@@ -15,7 +15,7 @@ targets               := vmlinux.gz vmlinux \
 OBJSTRIP       := $(obj)/tools/objstrip
 
 HOSTCFLAGS     := -Wall -I$(objtree)/usr/include
-BOOTCFLAGS     += -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS     += -I$(objtree)/$(obj) -I$(srctree)/$(obj)
 
 # SRM bootable image.  Copy to offset 512 of a partition.
 $(obj)/bootimage: $(addprefix $(obj)/tools/,mkbb lxboot bootlx) $(obj)/vmlinux.nh
index 14b4cf7..2d601d7 100644 (file)
@@ -54,6 +54,7 @@ config ARM
        select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
        select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
        select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
+       select HAVE_GCC_PLUGINS
        select HAVE_GENERIC_DMA_COHERENT
        select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7))
        select HAVE_IDE if PCI || ISA || PCMCIA
@@ -699,7 +700,7 @@ config ARCH_VIRT
        depends on ARCH_MULTI_V7
        select ARM_AMBA
        select ARM_GIC
-       select ARM_GIC_V2M if PCI_MSI
+       select ARM_GIC_V2M if PCI
        select ARM_GIC_V3
        select ARM_PSCI
        select HAVE_ARM_ARCH_TIMER
index 3d5a5cd..58faff5 100644 (file)
@@ -66,6 +66,8 @@ extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern void __init_stage2_translation(void);
+
+extern void __kvm_hyp_reset(unsigned long);
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
index 96387d4..de338d9 100644 (file)
@@ -241,8 +241,7 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                int exception_index);
 
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
-                                      phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
                                       unsigned long hyp_stack_ptr,
                                       unsigned long vector_ptr)
 {
@@ -251,18 +250,13 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
         * code. The init code doesn't need to preserve these
         * registers as r0-r3 are already callee saved according to
         * the AAPCS.
-        * Note that we slightly misuse the prototype by casing the
+        * Note that we slightly misuse the prototype by casting the
         * stack pointer to a void *.
-        *
-        * We don't have enough registers to perform the full init in
-        * one go.  Install the boot PGD first, and then install the
-        * runtime PGD, stack pointer and vectors. The PGDs are always
-        * passed as the third argument, in order to be passed into
-        * r2-r3 to the init code (yes, this is compliant with the
-        * PCS!).
-        */
 
-       kvm_call_hyp(NULL, 0, boot_pgd_ptr);
+        * The PGDs are always passed as the third argument, in order
+        * to be passed into r2-r3 to the init code (yes, this is
+        * compliant with the PCS!).
+        */
 
        kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
 }
@@ -272,16 +266,13 @@ static inline void __cpu_init_stage2(void)
        kvm_call_hyp(__init_stage2_translation);
 }
 
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
                                        phys_addr_t phys_idmap_start)
 {
-       /*
-        * TODO
-        * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
-        */
+       kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr);
 }
 
-static inline int kvm_arch_dev_ioctl_check_extension(long ext)
+static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 {
        return 0;
 }
index f0e8607..6eaff28 100644 (file)
@@ -25,9 +25,6 @@
 
 #define __hyp_text __section(.hyp.text) notrace
 
-#define kern_hyp_va(v) (v)
-#define hyp_kern_va(v) (v)
-
 #define __ACCESS_CP15(CRn, Op1, CRm, Op2)      \
        "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
 #define __ACCESS_CP15_64(Op1, CRm)             \
index f9a6506..3bb803d 100644 (file)
  * We directly use the kernel VA for the HYP, as we can directly share
  * the mapping (HTTBR "covers" TTBR1).
  */
-#define HYP_PAGE_OFFSET_MASK   UL(~0)
-#define HYP_PAGE_OFFSET                PAGE_OFFSET
-#define KERN_TO_HYP(kva)       (kva)
-
-/*
- * Our virtual mapping for the boot-time MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the vectors
- * page, where no kernel data will ever be shared with HYP.
- */
-#define TRAMPOLINE_VA          UL(CONFIG_VECTORS_BASE)
+#define kern_hyp_va(kva)       (kva)
 
 /*
  * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
@@ -49,9 +40,8 @@
 #include <asm/pgalloc.h>
 #include <asm/stage2_pgtable.h>
 
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
 void stage2_unmap_vm(struct kvm *kvm);
@@ -65,7 +55,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 phys_addr_t kvm_get_idmap_start(void);
 int kvm_mmu_init(void);
index 0070e85..2d88af5 100644 (file)
@@ -22,6 +22,7 @@ struct hw_pci {
        struct msi_controller *msi_ctrl;
        struct pci_ops  *ops;
        int             nr_controllers;
+       unsigned int    io_optional:1;
        void            **private_data;
        int             (*setup)(int nr, struct pci_sys_data *);
        struct pci_bus *(*scan)(int nr, struct pci_sys_data *);
index d622040..a8d656d 100644 (file)
@@ -97,7 +97,9 @@ extern pgprot_t               pgprot_s2_device;
 #define PAGE_READONLY_EXEC     _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY)
 #define PAGE_KERNEL            _MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC       pgprot_kernel
-#define PAGE_HYP               _MOD_PROT(pgprot_kernel, L_PTE_HYP)
+#define PAGE_HYP               _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN)
+#define PAGE_HYP_EXEC          _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
+#define PAGE_HYP_RO            _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
 #define PAGE_HYP_DEVICE                _MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
 #define PAGE_S2                        _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
 #define PAGE_S2_DEVICE         _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY)
index d4ceaf5..a2e75b8 100644 (file)
@@ -80,6 +80,10 @@ static inline bool is_kernel_in_hyp_mode(void)
        return false;
 }
 
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
 /* The section containing the hypervisor text */
 extern char __hyp_text_start[];
 extern char __hyp_text_end[];
index 05e61a2..2f0e077 100644 (file)
@@ -410,7 +410,8 @@ static int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
        return irq;
 }
 
-static int pcibios_init_resources(int busnr, struct pci_sys_data *sys)
+static int pcibios_init_resource(int busnr, struct pci_sys_data *sys,
+                                int io_optional)
 {
        int ret;
        struct resource_entry *window;
@@ -420,6 +421,14 @@ static int pcibios_init_resources(int busnr, struct pci_sys_data *sys)
                         &iomem_resource, sys->mem_offset);
        }
 
+       /*
+        * If a platform says I/O port support is optional, we don't add
+        * the default I/O space.  The platform is responsible for adding
+        * any I/O space it needs.
+        */
+       if (io_optional)
+               return 0;
+
        resource_list_for_each_entry(window, &sys->resources)
                if (resource_type(window->res) == IORESOURCE_IO)
                        return 0;
@@ -466,7 +475,7 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw,
                if (ret > 0) {
                        struct pci_host_bridge *host_bridge;
 
-                       ret = pcibios_init_resources(nr, sys);
+                       ret = pcibios_init_resource(nr, sys, hw->io_optional);
                        if (ret)  {
                                kfree(sys);
                                break;
@@ -515,25 +524,23 @@ void pci_common_init_dev(struct device *parent, struct hw_pci *hw)
        list_for_each_entry(sys, &head, node) {
                struct pci_bus *bus = sys->bus;
 
-               if (!pci_has_flag(PCI_PROBE_ONLY)) {
+               /*
+                * We insert PCI resources into the iomem_resource and
+                * ioport_resource trees in either pci_bus_claim_resources()
+                * or pci_bus_assign_resources().
+                */
+               if (pci_has_flag(PCI_PROBE_ONLY)) {
+                       pci_bus_claim_resources(bus);
+               } else {
                        struct pci_bus *child;
 
-                       /*
-                        * Size the bridge windows.
-                        */
                        pci_bus_size_bridges(bus);
-
-                       /*
-                        * Assign resources.
-                        */
                        pci_bus_assign_resources(bus);
 
                        list_for_each_entry(child, &bus->children, node)
                                pcie_bus_configure_settings(child);
                }
-               /*
-                * Tell drivers about devices found.
-                */
+
                pci_bus_add_devices(bus);
        }
 }
@@ -590,18 +597,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
        return start;
 }
 
-/**
- * pcibios_enable_device - Enable I/O and memory.
- * @dev: PCI device to be enabled
- */
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-       if (pci_has_flag(PCI_PROBE_ONLY))
-               return 0;
-
-       return pci_enable_resources(dev, mask);
-}
-
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                        enum pci_mmap_state mmap_state, int write_combine)
 {
index 02abfff..95a0005 100644 (file)
@@ -46,13 +46,6 @@ config KVM_ARM_HOST
        ---help---
          Provides host support for ARM processors.
 
-config KVM_NEW_VGIC
-       bool "New VGIC implementation"
-       depends on KVM
-       default y
-       ---help---
-         uses the new VGIC implementation
-
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index a596b58..5e28df8 100644 (file)
@@ -22,7 +22,6 @@ obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
 
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
 obj-y += $(KVM)/arm/vgic/vgic.o
 obj-y += $(KVM)/arm/vgic/vgic-init.o
 obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,9 +29,4 @@ obj-y += $(KVM)/arm/vgic/vgic-v2.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
 obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-obj-y += $(KVM)/arm/vgic.o
-obj-y += $(KVM)/arm/vgic-v2.o
-obj-y += $(KVM)/arm/vgic-v2-emul.o
-endif
 obj-y += $(KVM)/arm/arch_timer.o
index f1bde7c..d94bb90 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
+#include <linux/list.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
@@ -122,7 +123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (ret)
                goto out_fail_alloc;
 
-       ret = create_hyp_mappings(kvm, kvm + 1);
+       ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
        if (ret)
                goto out_free_stage2_pgd;
 
@@ -201,7 +202,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = KVM_MAX_VCPUS;
                break;
        default:
-               r = kvm_arch_dev_ioctl_check_extension(ext);
+               r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
                break;
        }
        return r;
@@ -239,7 +240,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        if (err)
                goto free_vcpu;
 
-       err = create_hyp_mappings(vcpu, vcpu + 1);
+       err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
        if (err)
                goto vcpu_uninit;
 
@@ -377,7 +378,7 @@ void force_vm_exit(const cpumask_t *mask)
 
 /**
  * need_new_vmid_gen - check that the VMID is still valid
- * @kvm: The VM's VMID to checkt
+ * @kvm: The VM's VMID to check
  *
  * return true if there is a new generation of VMIDs being used
  *
@@ -616,7 +617,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 * Enter the guest
                 */
                trace_kvm_entry(*vcpu_pc(vcpu));
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                vcpu->mode = IN_GUEST_MODE;
 
                ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
@@ -642,14 +643,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                local_irq_enable();
 
                /*
-                * We do local_irq_enable() before calling kvm_guest_exit() so
+                * We do local_irq_enable() before calling guest_exit() so
                 * that if a timer interrupt hits while running the guest we
                 * account that tick as being spent in the guest.  We enable
-                * preemption after calling kvm_guest_exit() so that if we get
+                * preemption after calling guest_exit() so that if we get
                 * preempted we make sure ticks after that is not counted as
                 * guest time.
                 */
-               kvm_guest_exit();
+               guest_exit();
                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 
                /*
@@ -1039,7 +1040,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 static void cpu_init_hyp_mode(void *dummy)
 {
-       phys_addr_t boot_pgd_ptr;
        phys_addr_t pgd_ptr;
        unsigned long hyp_stack_ptr;
        unsigned long stack_page;
@@ -1048,13 +1048,12 @@ static void cpu_init_hyp_mode(void *dummy)
        /* Switch from the HYP stub to our own HYP init vector */
        __hyp_set_vectors(kvm_get_idmap_vector());
 
-       boot_pgd_ptr = kvm_mmu_get_boot_httbr();
        pgd_ptr = kvm_mmu_get_httbr();
        stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
        hyp_stack_ptr = stack_page + PAGE_SIZE;
        vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
 
-       __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
        __cpu_init_stage2();
 
        kvm_arm_init_debug();
@@ -1076,15 +1075,9 @@ static void cpu_hyp_reinit(void)
 
 static void cpu_hyp_reset(void)
 {
-       phys_addr_t boot_pgd_ptr;
-       phys_addr_t phys_idmap_start;
-
-       if (!is_kernel_in_hyp_mode()) {
-               boot_pgd_ptr = kvm_mmu_get_boot_httbr();
-               phys_idmap_start = kvm_get_idmap_start();
-
-               __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
-       }
+       if (!is_kernel_in_hyp_mode())
+               __cpu_reset_hyp_mode(hyp_default_vectors,
+                                    kvm_get_idmap_start());
 }
 
 static void _kvm_arch_hardware_enable(void *discard)
@@ -1294,14 +1287,14 @@ static int init_hyp_mode(void)
         * Map the Hyp-code called directly from the host
         */
        err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-                                 kvm_ksym_ref(__hyp_text_end));
+                                 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
        if (err) {
                kvm_err("Cannot map world-switch code\n");
                goto out_err;
        }
 
        err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-                                 kvm_ksym_ref(__end_rodata));
+                                 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map rodata section\n");
                goto out_err;
@@ -1312,7 +1305,8 @@ static int init_hyp_mode(void)
         */
        for_each_possible_cpu(cpu) {
                char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE);
+               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+                                         PAGE_HYP);
 
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
@@ -1324,7 +1318,7 @@ static int init_hyp_mode(void)
                kvm_cpu_context_t *cpu_ctxt;
 
                cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
-               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
+               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
 
                if (err) {
                        kvm_err("Cannot map host CPU state: %d\n", err);
@@ -1332,10 +1326,6 @@ static int init_hyp_mode(void)
                }
        }
 
-#ifndef CONFIG_HOTPLUG_CPU
-       free_boot_hyp_pgd();
-#endif
-
        /* set size of VMID supported by CPU */
        kvm_vmid_bits = kvm_get_vmid_bits();
        kvm_info("%d-bit VMID\n", kvm_vmid_bits);
index a494def..af93e3f 100644 (file)
@@ -210,7 +210,7 @@ bool kvm_condition_valid(struct kvm_vcpu *vcpu)
  * @vcpu:      The VCPU pointer
  *
  * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanved (updated), so we have
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
  * to do this little bit of work manually. The fields map like this:
  *
  * IT[7:0] -> CPSR[26:25],CPSR[15:10]
index 9093ed0..9aca920 100644 (file)
@@ -182,7 +182,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
 /**
  * kvm_arm_copy_reg_indices - get indices of all registers.
  *
- * We do core registers right here, then we apppend coproc regs.
+ * We do core registers right here, then we append coproc regs.
  */
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 {
index 1f9ae17..bf89c91 100644 (file)
  *       r2,r3 = Hypervisor pgd pointer
  *
  * The init scenario is:
- * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
- *   runtime stack, runtime vectors
- * - Enable the MMU with the boot pgd
- * - Jump to a target into the trampoline page (remember, this is the same
- *   physical page!)
- * - Now switch to the runtime pgd (same VA, and still the same physical
- *   page!)
+ * - We jump in HYP with 3 parameters: runtime HYP pgd, runtime stack,
+ *   runtime vectors
  * - Invalidate TLBs
  * - Set stack and vectors
+ * - Setup the page tables
+ * - Enable the MMU
  * - Profit! (or eret, if you only care about the code).
- *
- * As we only have four registers available to pass parameters (and we
- * need six), we split the init in two phases:
- * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD.
- *   Provides the basic HYP init, and enable the MMU.
- * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD.
- *   Switches to the runtime PGD, set stack and vectors.
  */
 
        .text
@@ -68,8 +58,11 @@ __kvm_hyp_init:
        W(b)    .
 
 __do_hyp_init:
-       cmp     r0, #0                  @ We have a SP?
-       bne     phase2                  @ Yes, second stage init
+       @ Set stack pointer
+       mov     sp, r0
+
+       @ Set HVBAR to point to the HYP vectors
+       mcr     p15, 4, r1, c12, c0, 0  @ HVBAR
 
        @ Set the HTTBR to point to the hypervisor PGD pointer passed
        mcrr    p15, 4, rr_lo_hi(r2, r3), c2
@@ -114,34 +107,25 @@ __do_hyp_init:
  THUMB(        ldr     r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE)          )
        orr     r1, r1, r2
        orr     r0, r0, r1
-       isb
        mcr     p15, 4, r0, c1, c0, 0   @ HSCTLR
+       isb
 
-       @ End of init phase-1
        eret
 
-phase2:
-       @ Set stack pointer
-       mov     sp, r0
-
-       @ Set HVBAR to point to the HYP vectors
-       mcr     p15, 4, r1, c12, c0, 0  @ HVBAR
-
-       @ Jump to the trampoline page
-       ldr     r0, =TRAMPOLINE_VA
-       adr     r1, target
-       bfi     r0, r1, #0, #PAGE_SHIFT
-       ret     r0
+       @ r0 : stub vectors address
+ENTRY(__kvm_hyp_reset)
+       /* We're now in idmap, disable MMU */
+       mrc     p15, 4, r1, c1, c0, 0   @ HSCTLR
+       ldr     r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I)
+       bic     r1, r1, r2
+       mcr     p15, 4, r1, c1, c0, 0   @ HSCTLR
 
-target:        @ We're now in the trampoline code, switch page tables
-       mcrr    p15, 4, rr_lo_hi(r2, r3), c2
+       /* Install stub vectors */
+       mcr     p15, 4, r0, c12, c0, 0  @ HVBAR
        isb
 
-       @ Invalidate the old TLBs
-       mcr     p15, 4, r0, c8, c7, 0   @ TLBIALLH
-       dsb     ish
-
        eret
+ENDPROC(__kvm_hyp_reset)
 
        .ltorg
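
With the trampoline dance removed, HYP init only needs the runtime PGD, a stack and the vector base. A minimal sketch of the caller side in the shared KVM/ARM setup code (symbol names such as kvm_arm_hyp_stack_page and __kvm_hyp_vector follow the existing convention and are assumptions here, not part of this hunk):

        phys_addr_t pgd_ptr = kvm_mmu_get_httbr();
        unsigned long stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
        unsigned long hyp_stack_ptr = stack_page + PAGE_SIZE;
        unsigned long vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);

        /* For the 32-bit code above: pgd lands in r2/r3, stack top in r0,
         * vectors in r1. */
        __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
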
 
index 45c43ae..bda27b6 100644 (file)
@@ -32,8 +32,6 @@
 
 #include "trace.h"
 
-extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
-
 static pgd_t *boot_hyp_pgd;
 static pgd_t *hyp_pgd;
 static pgd_t *merged_hyp_pgd;
@@ -483,28 +481,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
        } while (pgd++, addr = next, addr != end);
 }
 
-/**
- * free_boot_hyp_pgd - free HYP boot page tables
- *
- * Free the HYP boot page tables. The bounce page is also freed.
- */
-void free_boot_hyp_pgd(void)
-{
-       mutex_lock(&kvm_hyp_pgd_mutex);
-
-       if (boot_hyp_pgd) {
-               unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-               unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
-               free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-               boot_hyp_pgd = NULL;
-       }
-
-       if (hyp_pgd)
-               unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
-
-       mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
 /**
  * free_hyp_pgds - free Hyp-mode page tables
  *
@@ -519,15 +495,20 @@ void free_hyp_pgds(void)
 {
        unsigned long addr;
 
-       free_boot_hyp_pgd();
-
        mutex_lock(&kvm_hyp_pgd_mutex);
 
+       if (boot_hyp_pgd) {
+               unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+               free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
+               boot_hyp_pgd = NULL;
+       }
+
        if (hyp_pgd) {
+               unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
                for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-                       unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+                       unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
                for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-                       unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+                       unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
 
                free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
                hyp_pgd = NULL;
@@ -679,17 +660,18 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
  * @from:      The virtual kernel start address of the range
  * @to:                The virtual kernel end address of the range (exclusive)
+ * @prot:      The protection to be applied to this range
  *
  * The same virtual address as the kernel virtual address is also used
  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
  * physical pages.
  */
-int create_hyp_mappings(void *from, void *to)
+int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 {
        phys_addr_t phys_addr;
        unsigned long virt_addr;
-       unsigned long start = KERN_TO_HYP((unsigned long)from);
-       unsigned long end = KERN_TO_HYP((unsigned long)to);
+       unsigned long start = kern_hyp_va((unsigned long)from);
+       unsigned long end = kern_hyp_va((unsigned long)to);
 
        if (is_kernel_in_hyp_mode())
                return 0;
@@ -704,7 +686,7 @@ int create_hyp_mappings(void *from, void *to)
                err = __create_hyp_mappings(hyp_pgd, virt_addr,
                                            virt_addr + PAGE_SIZE,
                                            __phys_to_pfn(phys_addr),
-                                           PAGE_HYP);
+                                           prot);
                if (err)
                        return err;
        }
@@ -723,8 +705,8 @@ int create_hyp_mappings(void *from, void *to)
  */
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 {
-       unsigned long start = KERN_TO_HYP((unsigned long)from);
-       unsigned long end = KERN_TO_HYP((unsigned long)to);
+       unsigned long start = kern_hyp_va((unsigned long)from);
+       unsigned long end = kern_hyp_va((unsigned long)to);
 
        if (is_kernel_in_hyp_mode())
                return 0;
@@ -1687,14 +1669,6 @@ phys_addr_t kvm_mmu_get_httbr(void)
                return virt_to_phys(hyp_pgd);
 }
 
-phys_addr_t kvm_mmu_get_boot_httbr(void)
-{
-       if (__kvm_cpu_uses_extended_idmap())
-               return virt_to_phys(merged_hyp_pgd);
-       else
-               return virt_to_phys(boot_hyp_pgd);
-}
-
 phys_addr_t kvm_get_idmap_vector(void)
 {
        return hyp_idmap_vector;
@@ -1705,6 +1679,22 @@ phys_addr_t kvm_get_idmap_start(void)
        return hyp_idmap_start;
 }
 
+static int kvm_map_idmap_text(pgd_t *pgd)
+{
+       int err;
+
+       /* Create the idmap in the boot page tables */
+       err =   __create_hyp_mappings(pgd,
+                                     hyp_idmap_start, hyp_idmap_end,
+                                     __phys_to_pfn(hyp_idmap_start),
+                                     PAGE_HYP_EXEC);
+       if (err)
+               kvm_err("Failed to idmap %lx-%lx\n",
+                       hyp_idmap_start, hyp_idmap_end);
+
+       return err;
+}
+
 int kvm_mmu_init(void)
 {
        int err;
@@ -1719,28 +1709,41 @@ int kvm_mmu_init(void)
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
-       hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-       boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+       kvm_info("IDMAP page: %lx\n", hyp_idmap_start);
+       kvm_info("HYP VA range: %lx:%lx\n",
+                kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));
 
-       if (!hyp_pgd || !boot_hyp_pgd) {
-               kvm_err("Hyp mode PGD not allocated\n");
-               err = -ENOMEM;
+       if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
+           hyp_idmap_start <  kern_hyp_va(~0UL)) {
+               /*
+                * The idmap page intersects with the HYP VA space, so it is
+                * not safe to continue.
+                */
+               kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
+               err = -EINVAL;
                goto out;
        }
 
-       /* Create the idmap in the boot page tables */
-       err =   __create_hyp_mappings(boot_hyp_pgd,
-                                     hyp_idmap_start, hyp_idmap_end,
-                                     __phys_to_pfn(hyp_idmap_start),
-                                     PAGE_HYP);
-
-       if (err) {
-               kvm_err("Failed to idmap %lx-%lx\n",
-                       hyp_idmap_start, hyp_idmap_end);
+       hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+       if (!hyp_pgd) {
+               kvm_err("Hyp mode PGD not allocated\n");
+               err = -ENOMEM;
                goto out;
        }
 
        if (__kvm_cpu_uses_extended_idmap()) {
+               boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                                        hyp_pgd_order);
+               if (!boot_hyp_pgd) {
+                       kvm_err("Hyp boot PGD not allocated\n");
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = kvm_map_idmap_text(boot_hyp_pgd);
+               if (err)
+                       goto out;
+
                merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
                if (!merged_hyp_pgd) {
                        kvm_err("Failed to allocate extra HYP pgd\n");
@@ -1748,29 +1751,10 @@ int kvm_mmu_init(void)
                }
                __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
                                    hyp_idmap_start);
-               return 0;
-       }
-
-       /* Map the very same page at the trampoline VA */
-       err =   __create_hyp_mappings(boot_hyp_pgd,
-                                     TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
-                                     __phys_to_pfn(hyp_idmap_start),
-                                     PAGE_HYP);
-       if (err) {
-               kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
-                       TRAMPOLINE_VA);
-               goto out;
-       }
-
-       /* Map the same page again into the runtime page tables */
-       err =   __create_hyp_mappings(hyp_pgd,
-                                     TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
-                                     __phys_to_pfn(hyp_idmap_start),
-                                     PAGE_HYP);
-       if (err) {
-               kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
-                       TRAMPOLINE_VA);
-               goto out;
+       } else {
+               err = kvm_map_idmap_text(hyp_pgd);
+               if (err)
+                       goto out;
        }
 
        return 0;
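
kvm_mmu_init() now refuses to run when the idmap'd init text falls inside the HYP VA window, since the idmap and the runtime mappings would then compete for the same page-table range. A standalone sketch of that check, using made-up example addresses in place of kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL) and hyp_idmap_start:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t hyp_va_start = 0x0000000080000000ULL;
                uint64_t hyp_va_end   = 0x00000000ffffffffULL;
                uint64_t hyp_idmap_start = 0x00000000a0000000ULL;

                if (hyp_idmap_start >= hyp_va_start && hyp_idmap_start < hyp_va_end)
                        printf("idmap 0x%llx intersects the HYP VA window: -EINVAL\n",
                               (unsigned long long)hyp_idmap_start);
                else
                        printf("idmap 0x%llx is outside the HYP VA window: OK\n",
                               (unsigned long long)hyp_idmap_start);
                return 0;
        }
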
index 0048b5a..4b5e802 100644 (file)
@@ -52,7 +52,7 @@ static const struct kvm_irq_level cortexa_vtimer_irq = {
  * @vcpu: The VCPU pointer
  *
  * This function finds the right table above and sets the registers on the
- * virtual CPU struct to their architectually defined reset values.
+ * virtual CPU struct to their architecturally defined reset values.
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
index 9f8b99e..69c8787 100644 (file)
@@ -3,6 +3,7 @@ config ARM64
        select ACPI_CCA_REQUIRED if ACPI
        select ACPI_GENERIC_GSI if ACPI
        select ACPI_REDUCED_HARDWARE_ONLY if ACPI
+       select ACPI_MCFG if ACPI
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
        select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
@@ -22,9 +23,9 @@ config ARM64
        select ARM_ARCH_TIMER
        select ARM_GIC
        select AUDIT_ARCH_COMPAT_GENERIC
-       select ARM_GIC_V2M if PCI_MSI
+       select ARM_GIC_V2M if PCI
        select ARM_GIC_V3
-       select ARM_GIC_V3_ITS if PCI_MSI
+       select ARM_GIC_V3_ITS if PCI
        select ARM_PSCI_FW
        select BUILDTIME_EXTABLE_SORT
        select CLONE_BACKWARDS
@@ -78,6 +79,7 @@ config ARM64
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
+       select HAVE_GCC_PLUGINS
        select HAVE_GENERIC_DMA_COHERENT
        select HAVE_HW_BREAKPOINT if PERF_EVENTS
        select HAVE_IRQ_TIME_ACCOUNTING
@@ -101,6 +103,7 @@ config ARM64
        select OF_EARLY_FLATTREE
        select OF_NUMA if NUMA && OF
        select OF_RESERVED_MEM
+       select PCI_ECAM if ACPI
        select PERF_USE_VMALLOC
        select POWER_RESET
        select POWER_SUPPLY
index 86110a6..1372e9a 100644 (file)
@@ -76,3 +76,8 @@
 &usb3 {
        status = "okay";
 };
+
+/* CON17 (PCIe) / CON12 (mini-PCIe) */
+&pcie0 {
+       status = "okay";
+};
index eb29280..c476253 100644 (file)
                                      <0x1d40000 0x40000>; /* GICR */
                        };
                };
+
+               pcie0: pcie@d0070000 {
+                       compatible = "marvell,armada-3700-pcie";
+                       device_type = "pci";
+                       status = "disabled";
+                       reg = <0 0xd0070000 0 0x20000>;
+                       #address-cells = <3>;
+                       #size-cells = <2>;
+                       bus-range = <0x00 0xff>;
+                       interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>;
+                       #interrupt-cells = <1>;
+                       msi-parent = <&pcie0>;
+                       msi-controller;
+                       ranges = <0x82000000 0 0xe8000000   0 0xe8000000 0 0x1000000 /* Port 0 MEM */
+                                 0x81000000 0 0xe9000000   0 0xe9000000 0 0x10000>; /* Port 0 IO */
+                       interrupt-map-mask = <0 0 0 7>;
+                       interrupt-map = <0 0 0 1 &pcie_intc 0>,
+                                       <0 0 0 2 &pcie_intc 1>,
+                                       <0 0 0 3 &pcie_intc 2>,
+                                       <0 0 0 4 &pcie_intc 3>;
+                       pcie_intc: interrupt-controller {
+                               interrupt-controller;
+                               #interrupt-cells = <1>;
+                       };
+               };
        };
 };
index 49dd1bd..7099f26 100644 (file)
@@ -36,8 +36,9 @@
 #define ARM64_HAS_VIRT_HOST_EXTN               11
 #define ARM64_WORKAROUND_CAVIUM_27456          12
 #define ARM64_HAS_32BIT_EL0                    13
+#define ARM64_HYP_OFFSET_LOW                   14
 
-#define ARM64_NCAPS                            14
+#define ARM64_NCAPS                            15
 
 #ifndef __ASSEMBLY__
 
index 2cdb6b5..4b5c977 100644 (file)
 /* Hyp System Trap Register */
 #define HSTR_EL2_T(x)  (1 << x)
 
-/* Hyp Coproccessor Trap Register Shifts */
+/* Hyp Coprocessor Trap Register Shifts */
 #define CPTR_EL2_TFP_SHIFT 10
 
 /* Hyp Coprocessor Trap Register */
index 49095fc..3eda975 100644 (file)
@@ -47,8 +47,7 @@
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(long ext);
-unsigned long kvm_hyp_reset_entry(void);
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
 
 struct kvm_arch {
@@ -348,8 +347,7 @@ int kvm_perf_teardown(void);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
 
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
-                                      phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
                                       unsigned long hyp_stack_ptr,
                                       unsigned long vector_ptr)
 {
@@ -357,19 +355,14 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
         * Call initialization code, and switch to the full blown
         * HYP code.
         */
-       __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
-                      hyp_stack_ptr, vector_ptr);
+       __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr);
 }
 
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+void __kvm_hyp_teardown(void);
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
                                        phys_addr_t phys_idmap_start)
 {
-       /*
-        * Call reset code, and switch back to stub hyp vectors.
-        * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
-        */
-       __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
-                      boot_pgd_ptr, phys_idmap_start);
+       kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start);
 }
 
 static inline void kvm_arch_hardware_unsetup(void) {}
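
A minimal sketch of the teardown caller in the common code (the stub vector address kept by the host is assumed to be hyp_default_vectors; error handling omitted):

        /* Return to the hyp stub: __kvm_hyp_teardown runs from the idmap. */
        if (!is_kernel_in_hyp_mode())
                __cpu_reset_hyp_mode(hyp_default_vectors,
                                     kvm_get_idmap_start());
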
index 44eaff7..cff5105 100644 (file)
 
 #define __hyp_text __section(.hyp.text) notrace
 
-static inline unsigned long __kern_hyp_va(unsigned long v)
-{
-       asm volatile(ALTERNATIVE("and %0, %0, %1",
-                                "nop",
-                                ARM64_HAS_VIRT_HOST_EXTN)
-                    : "+r" (v) : "i" (HYP_PAGE_OFFSET_MASK));
-       return v;
-}
-
-#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
-
-static inline unsigned long __hyp_kern_va(unsigned long v)
-{
-       u64 offset = PAGE_OFFSET - HYP_PAGE_OFFSET;
-       asm volatile(ALTERNATIVE("add %0, %0, %1",
-                                "nop",
-                                ARM64_HAS_VIRT_HOST_EXTN)
-                    : "+r" (v) : "r" (offset));
-       return v;
-}
-
-#define hyp_kern_va(v) (typeof(v))(__hyp_kern_va((unsigned long)(v)))
-
 #define read_sysreg_elx(r,nvh,vh)                                      \
        ({                                                              \
                u64 reg;                                                \
index f05ac27..b6bb834 100644 (file)
  *
  * Instead, give the HYP mode its own VA region at a fixed offset from
  * the kernel by just masking the top bits (which are all ones for a
- * kernel address).
+ * kernel address). We need to find out how many bits to mask.
  *
- * ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these
- * macros (the entire kernel runs at EL2).
+ * We want to build a set of page tables that cover both parts of the
+ * idmap (the trampoline page used to initialize EL2), and our normal
+ * runtime VA space, at the same time.
+ *
+ * Given that the kernel uses VA_BITS for its entire address space,
+ * and that half of that space (VA_BITS - 1) is used for the linear
+ * mapping, we can also limit the EL2 space to (VA_BITS - 1).
+ *
+ * The main question is "Within the VA_BITS space, does EL2 use the
+ * top or the bottom half of that space to shadow the kernel's linear
+ * mapping?". As we need to idmap the trampoline page, this is
+ * determined by the range in which this page lives.
+ *
+ * If the page is in the bottom half, we have to use the top half. If
+ * the page is in the top half, we have to use the bottom half:
+ *
+ * T = __virt_to_phys(__hyp_idmap_text_start)
+ * if (T & BIT(VA_BITS - 1))
+ *     HYP_VA_MIN = 0  //idmap in upper half
+ * else
+ *     HYP_VA_MIN = 1 << (VA_BITS - 1)
+ * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
+ *
+ * This of course assumes that the trampoline page exists within the
+ * VA_BITS range. If it doesn't, then it means we're in the odd case
+ * where the kernel idmap (as well as HYP) uses more levels than the
+ * kernel runtime page tables (as seen when the kernel is configured
+ * for 4k pages, 39-bit VA, and yet memory lives just above that
+ * limit, forcing the idmap to use 4 levels of page tables while the
+ * kernel itself only uses 3). In this particular case, it doesn't
+ * matter which side of VA_BITS we use, as we're guaranteed not to
+ * conflict with anything.
+ *
+ * When using VHE, there are no separate hyp mappings and all KVM
+ * functionality is already mapped as part of the main kernel
+ * mappings, and none of this applies in that case.
  */
-#define HYP_PAGE_OFFSET_SHIFT  VA_BITS
-#define HYP_PAGE_OFFSET_MASK   ((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
-#define HYP_PAGE_OFFSET                (PAGE_OFFSET & HYP_PAGE_OFFSET_MASK)
 
-/*
- * Our virtual mapping for the idmap-ed MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the last
- * possible page, where no kernel mapping will ever exist.
- */
-#define TRAMPOLINE_VA          (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
+#define HYP_PAGE_OFFSET_HIGH_MASK      ((UL(1) << VA_BITS) - 1)
+#define HYP_PAGE_OFFSET_LOW_MASK       ((UL(1) << (VA_BITS - 1)) - 1)
 
 #ifdef __ASSEMBLY__
 
 /*
  * Convert a kernel VA into a HYP VA.
  * reg: VA to be converted.
+ *
+ * This generates the following sequences:
+ * - High mask:
+ *             and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ *             nop
+ * - Low mask:
+ *             and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ *             and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK
+ * - VHE:
+ *             nop
+ *             nop
+ *
+ * The "low mask" version works because the mask is a strict subset of
+ * the "high mask", hence performing the first mask for nothing.
+ * Should be completely invisible on any viable CPU.
  */
 .macro kern_hyp_va     reg
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN    
-       and     \reg, \reg, #HYP_PAGE_OFFSET_MASK
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+       and     \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK
 alternative_else
        nop
 alternative_endif
+alternative_if_not ARM64_HYP_OFFSET_LOW
+       nop
+alternative_else
+       and     \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK
+alternative_endif
 .endm
 
 #else
@@ -70,7 +117,22 @@ alternative_endif
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 
-#define KERN_TO_HYP(kva)       ((unsigned long)kva - PAGE_OFFSET + HYP_PAGE_OFFSET)
+static inline unsigned long __kern_hyp_va(unsigned long v)
+{
+       asm volatile(ALTERNATIVE("and %0, %0, %1",
+                                "nop",
+                                ARM64_HAS_VIRT_HOST_EXTN)
+                    : "+r" (v)
+                    : "i" (HYP_PAGE_OFFSET_HIGH_MASK));
+       asm volatile(ALTERNATIVE("nop",
+                                "and %0, %0, %1",
+                                ARM64_HYP_OFFSET_LOW)
+                    : "+r" (v)
+                    : "i" (HYP_PAGE_OFFSET_LOW_MASK));
+       return v;
+}
+
+#define kern_hyp_va(v)         (typeof(v))(__kern_hyp_va((unsigned long)(v)))
 
 /*
  * We currently only support a 40bit IPA.
@@ -81,9 +143,8 @@ alternative_endif
 
 #include <asm/stage2_pgtable.h>
 
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
 void stage2_unmap_vm(struct kvm *kvm);
@@ -97,7 +158,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 phys_addr_t kvm_get_idmap_start(void);
 int kvm_mmu_init(void);
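
The comment above describes the two masks in prose; the arithmetic is easy to check in isolation. A standalone sketch with made-up addresses (VA_BITS fixed at 48 for the example):

        #include <stdio.h>
        #include <stdint.h>

        #define VA_BITS 48
        #define HYP_PAGE_OFFSET_HIGH_MASK ((1ULL << VA_BITS) - 1)
        #define HYP_PAGE_OFFSET_LOW_MASK  ((1ULL << (VA_BITS - 1)) - 1)

        int main(void)
        {
                /* A linear-map kernel VA and an idmap'd physical address whose
                 * bit (VA_BITS - 1) happens to be set. */
                uint64_t kva = 0xffff800012345000ULL;
                uint64_t idmap_pa = 0x0000800080000000ULL;
                uint64_t hyp_va;

                /* High mask: shadow the kernel VA somewhere in the VA_BITS space. */
                hyp_va = kva & HYP_PAGE_OFFSET_HIGH_MASK;
                printf("high-mask HYP VA: 0x%016llx\n", (unsigned long long)hyp_va);

                /* Idmap in the top half -> HYP shadows the kernel in the bottom
                 * half, which is what the extra low mask achieves. */
                if (idmap_pa & (1ULL << (VA_BITS - 1)))
                        hyp_va &= HYP_PAGE_OFFSET_LOW_MASK;
                printf("final HYP VA:     0x%016llx\n", (unsigned long long)hyp_va);
                return 0;
        }
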
index 2813748..c3ae239 100644 (file)
 #define PTE_CONT               (_AT(pteval_t, 1) << 52)        /* Contiguous range */
 #define PTE_PXN                        (_AT(pteval_t, 1) << 53)        /* Privileged XN */
 #define PTE_UXN                        (_AT(pteval_t, 1) << 54)        /* User XN */
+#define PTE_HYP_XN             (_AT(pteval_t, 1) << 54)        /* HYP XN */
 
 /*
  * AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers).
index 29fcb33..39f5252 100644 (file)
@@ -55,7 +55,9 @@
 #define PAGE_KERNEL_EXEC       __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 #define PAGE_KERNEL_EXEC_CONT  __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
 
-#define PAGE_HYP               __pgprot(_PAGE_DEFAULT | PTE_HYP)
+#define PAGE_HYP               __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
+#define PAGE_HYP_EXEC          __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
+#define PAGE_HYP_RO            __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
 #define PAGE_HYP_DEVICE                __pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
 
 #define PAGE_S2                        __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY)
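
The split makes the HYP protections explicit: PAGE_HYP_EXEC is read-only executable, PAGE_HYP_RO is read-only non-executable, and plain PAGE_HYP is non-executable. A minimal sketch of how callers are expected to pair them with the three-argument create_hyp_mappings() (the rodata mapping is an assumption for illustration; error handling omitted):

        err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
                                  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
        err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
                                  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
        err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, PAGE_HYP);
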
index bbc6a8c..1788545 100644 (file)
@@ -87,6 +87,10 @@ extern void verify_cpu_run_el(void);
 static inline void verify_cpu_run_el(void) {}
 #endif
 
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
 /* The section containing the hypervisor text */
 extern char __hyp_text_start[];
 extern char __hyp_text_end[];
index f209ea1..3051f86 100644 (file)
@@ -87,9 +87,11 @@ struct kvm_regs {
 /* Supported VGICv3 address types  */
 #define KVM_VGIC_V3_ADDR_TYPE_DIST     2
 #define KVM_VGIC_V3_ADDR_TYPE_REDIST   3
+#define KVM_VGIC_ITS_ADDR_TYPE         4
 
 #define KVM_VGIC_V3_DIST_SIZE          SZ_64K
 #define KVM_VGIC_V3_REDIST_SIZE                (2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE           (2 * SZ_64K)
 
 #define KVM_ARM_VCPU_POWER_OFF         0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_EL1_32BIT         1 /* CPU running a 32bit VM */
index 916d27a..62272ea 100644 (file)
@@ -726,6 +726,19 @@ static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused
        return is_kernel_in_hyp_mode();
 }
 
+static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
+                          int __unused)
+{
+       phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start);
+
+       /*
+        * Activate the lower HYP offset only if:
+        * - the idmap doesn't clash with it,
+        * - the kernel is not running at EL2.
+        */
+       return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
        {
                .desc = "GIC system register CPU interface",
@@ -803,6 +816,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .field_pos = ID_AA64PFR0_EL0_SHIFT,
                .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
        },
+       {
+               .desc = "Reduced HYP mapping offset",
+               .capability = ARM64_HYP_OFFSET_LOW,
+               .def_scope = SCOPE_SYSTEM,
+               .matches = hyp_offset_low,
+       },
        {},
 };
 
index 3c4e308..acf3872 100644 (file)
@@ -17,6 +17,9 @@
 #include <linux/mm.h>
 #include <linux/of_pci.h>
 #include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/pci-acpi.h>
+#include <linux/pci-ecam.h>
 #include <linux/slab.h>
 
 /*
@@ -36,25 +39,17 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
        return res->start;
 }
 
-/**
- * pcibios_enable_device - Enable I/O and memory.
- * @dev: PCI device to be enabled
- * @mask: bitmask of BARs to enable
- */
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-       if (pci_has_flag(PCI_PROBE_ONLY))
-               return 0;
-
-       return pci_enable_resources(dev, mask);
-}
-
 /*
- * Try to assign the IRQ number from DT when adding a new device
+ * Try to assign the IRQ number when probing a new device
  */
-int pcibios_add_device(struct pci_dev *dev)
+int pcibios_alloc_irq(struct pci_dev *dev)
 {
-       dev->irq = of_irq_parse_and_map_pci(dev, 0, 0);
+       if (acpi_disabled)
+               dev->irq = of_irq_parse_and_map_pci(dev, 0, 0);
+#ifdef CONFIG_ACPI
+       else
+               return acpi_pci_irq_enable(dev);
+#endif
 
        return 0;
 }
@@ -65,13 +60,21 @@ int pcibios_add_device(struct pci_dev *dev)
 int raw_pci_read(unsigned int domain, unsigned int bus,
                  unsigned int devfn, int reg, int len, u32 *val)
 {
-       return -ENXIO;
+       struct pci_bus *b = pci_find_bus(domain, bus);
+
+       if (!b)
+               return PCIBIOS_DEVICE_NOT_FOUND;
+       return b->ops->read(b, devfn, reg, len, val);
 }
 
 int raw_pci_write(unsigned int domain, unsigned int bus,
                unsigned int devfn, int reg, int len, u32 val)
 {
-       return -ENXIO;
+       struct pci_bus *b = pci_find_bus(domain, bus);
+
+       if (!b)
+               return PCIBIOS_DEVICE_NOT_FOUND;
+       return b->ops->write(b, devfn, reg, len, val);
 }
 
 #ifdef CONFIG_NUMA
@@ -85,10 +88,124 @@ EXPORT_SYMBOL(pcibus_to_node);
 #endif
 
 #ifdef CONFIG_ACPI
-/* Root bridge scanning */
+
+struct acpi_pci_generic_root_info {
+       struct acpi_pci_root_info       common;
+       struct pci_config_window        *cfg;   /* config space mapping */
+};
+
+int acpi_pci_bus_find_domain_nr(struct pci_bus *bus)
+{
+       struct pci_config_window *cfg = bus->sysdata;
+       struct acpi_device *adev = to_acpi_device(cfg->parent);
+       struct acpi_pci_root *root = acpi_driver_data(adev);
+
+       return root->segment;
+}
+
+int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+       if (!acpi_disabled) {
+               struct pci_config_window *cfg = bridge->bus->sysdata;
+               struct acpi_device *adev = to_acpi_device(cfg->parent);
+               ACPI_COMPANION_SET(&bridge->dev, adev);
+       }
+
+       return 0;
+}
+
+/*
+ * Lookup the bus range for the domain in MCFG, and set up config space
+ * mapping.
+ */
+static struct pci_config_window *
+pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root)
+{
+       struct resource *bus_res = &root->secondary;
+       u16 seg = root->segment;
+       struct pci_config_window *cfg;
+       struct resource cfgres;
+       unsigned int bsz;
+
+       /* Use address from _CBA if present, otherwise lookup MCFG */
+       if (!root->mcfg_addr)
+               root->mcfg_addr = pci_mcfg_lookup(seg, bus_res);
+
+       if (!root->mcfg_addr) {
+               dev_err(&root->device->dev, "%04x:%pR ECAM region not found\n",
+                       seg, bus_res);
+               return NULL;
+       }
+
+       bsz = 1 << pci_generic_ecam_ops.bus_shift;
+       cfgres.start = root->mcfg_addr + bus_res->start * bsz;
+       cfgres.end = cfgres.start + resource_size(bus_res) * bsz - 1;
+       cfgres.flags = IORESOURCE_MEM;
+       cfg = pci_ecam_create(&root->device->dev, &cfgres, bus_res,
+                             &pci_generic_ecam_ops);
+       if (IS_ERR(cfg)) {
+               dev_err(&root->device->dev, "%04x:%pR error %ld mapping ECAM\n",
+                       seg, bus_res, PTR_ERR(cfg));
+               return NULL;
+       }
+
+       return cfg;
+}
+
+/* release_info: free resources allocated by init_info */
+static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci)
+{
+       struct acpi_pci_generic_root_info *ri;
+
+       ri = container_of(ci, struct acpi_pci_generic_root_info, common);
+       pci_ecam_free(ri->cfg);
+       kfree(ri);
+}
+
+static struct acpi_pci_root_ops acpi_pci_root_ops = {
+       .release_info = pci_acpi_generic_release_info,
+};
+
+/* Interface called from ACPI code to setup PCI host controller */
 struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 {
-       /* TODO: Should be revisited when implementing PCI on ACPI */
-       return NULL;
+       int node = acpi_get_node(root->device->handle);
+       struct acpi_pci_generic_root_info *ri;
+       struct pci_bus *bus, *child;
+
+       ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node);
+       if (!ri)
+               return NULL;
+
+       ri->cfg = pci_acpi_setup_ecam_mapping(root);
+       if (!ri->cfg) {
+               kfree(ri);
+               return NULL;
+       }
+
+       acpi_pci_root_ops.pci_ops = &ri->cfg->ops->pci_ops;
+       bus = acpi_pci_root_create(root, &acpi_pci_root_ops, &ri->common,
+                                  ri->cfg);
+       if (!bus)
+               return NULL;
+
+       pci_bus_size_bridges(bus);
+       pci_bus_assign_resources(bus);
+
+       list_for_each_entry(child, &bus->children, node)
+               pcie_bus_configure_settings(child);
+
+       return bus;
 }
+
+void pcibios_add_bus(struct pci_bus *bus)
+{
+       acpi_pci_add_bus(bus);
+}
+
+void pcibios_remove_bus(struct pci_bus *bus)
+{
+       acpi_pci_remove_bus(bus);
+}
+
 #endif
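
pci_acpi_setup_ecam_mapping() sizes the config window as one bus_shift-sized slot per bus in the root's bus range, offset from the MCFG/_CBA base. A standalone sketch of that arithmetic with made-up values (generic ECAM uses a bus_shift of 20, i.e. 1 MiB of config space per bus):

        #include <stdio.h>
        #include <stdint.h>

        #define ECAM_BUS_SHIFT  20      /* 1 MiB of config space per bus */

        int main(void)
        {
                uint64_t mcfg_addr = 0x40000000ULL;     /* example ECAM base */
                unsigned int bus_start = 0x00, bus_end = 0xff;
                uint64_t bsz = 1ULL << ECAM_BUS_SHIFT;

                uint64_t cfg_start = mcfg_addr + bus_start * bsz;
                uint64_t cfg_end = cfg_start +
                                   (uint64_t)(bus_end - bus_start + 1) * bsz - 1;

                printf("ECAM window: [0x%llx - 0x%llx], %llu MiB\n",
                       (unsigned long long)cfg_start, (unsigned long long)cfg_end,
                       (unsigned long long)((cfg_end - cfg_start + 1) >> 20));
                return 0;
        }
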
index c4f26ef..9d2eff0 100644 (file)
@@ -36,6 +36,7 @@ config KVM
        select HAVE_KVM_IRQFD
        select KVM_ARM_VGIC_V3
        select KVM_ARM_PMU if HW_PERF_EVENTS
+       select HAVE_KVM_MSI
        ---help---
          Support hosting virtualized guest machines.
          We don't support KVM with 16K page tables yet, due to the multiple
@@ -54,13 +55,6 @@ config KVM_ARM_PMU
          Adds support for a virtual Performance Monitoring Unit (PMU) in
          virtual machines.
 
-config KVM_NEW_VGIC
-       bool "New VGIC implementation"
-       depends on KVM
-       default y
-        ---help---
-          uses the new VGIC implementation
-
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index a7a958c..a5b9664 100644 (file)
@@ -20,7 +20,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,12 +29,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
-endif
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
 kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
index 32fad75..3f9e157 100644 (file)
@@ -211,7 +211,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
 /**
  * kvm_arm_copy_reg_indices - get indices of all registers.
  *
- * We do core registers right here, then we apppend system regs.
+ * We do core registers right here, then we append system regs.
  */
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 {
index a873a6d..6b29d3d 100644 (file)
@@ -53,10 +53,9 @@ __invalid:
        b       .
 
        /*
-        * x0: HYP boot pgd
-        * x1: HYP pgd
-        * x2: HYP stack
-        * x3: HYP vectors
+        * x0: HYP pgd
+        * x1: HYP stack
+        * x2: HYP vectors
         */
 __do_hyp_init:
 
@@ -110,71 +109,27 @@ __do_hyp_init:
        msr     sctlr_el2, x4
        isb
 
-       /* Skip the trampoline dance if we merged the boot and runtime PGDs */
-       cmp     x0, x1
-       b.eq    merged
-
-       /* MMU is now enabled. Get ready for the trampoline dance */
-       ldr     x4, =TRAMPOLINE_VA
-       adr     x5, target
-       bfi     x4, x5, #0, #PAGE_SHIFT
-       br      x4
-
-target: /* We're now in the trampoline code, switch page tables */
-       msr     ttbr0_el2, x1
-       isb
-
-       /* Invalidate the old TLBs */
-       tlbi    alle2
-       dsb     sy
-
-merged:
        /* Set the stack and new vectors */
+       kern_hyp_va     x1
+       mov     sp, x1
        kern_hyp_va     x2
-       mov     sp, x2
-       kern_hyp_va     x3
-       msr     vbar_el2, x3
+       msr     vbar_el2, x2
 
        /* Hello, World! */
        eret
 ENDPROC(__kvm_hyp_init)
 
        /*
-        * Reset kvm back to the hyp stub. This is the trampoline dance in
-        * reverse. If kvm used an extended idmap, __extended_idmap_trampoline
-        * calls this code directly in the idmap. In this case switching to the
-        * boot tables is a no-op.
-        *
-        * x0: HYP boot pgd
-        * x1: HYP phys_idmap_start
+        * Reset kvm back to the hyp stub.
         */
 ENTRY(__kvm_hyp_reset)
-       /* We're in trampoline code in VA, switch back to boot page tables */
-       msr     ttbr0_el2, x0
-       isb
-
-       /* Ensure the PA branch doesn't find a stale tlb entry or stale code. */
-       ic      iallu
-       tlbi    alle2
-       dsb     sy
-       isb
-
-       /* Branch into PA space */
-       adr     x0, 1f
-       bfi     x1, x0, #0, #PAGE_SHIFT
-       br      x1
-
        /* We're now in idmap, disable MMU */
-1:     mrs     x0, sctlr_el2
+       mrs     x0, sctlr_el2
        ldr     x1, =SCTLR_ELx_FLAGS
        bic     x0, x0, x1              // Clear SCTLR_ELx.M etc.
        msr     sctlr_el2, x0
        isb
 
-       /* Invalidate the old TLBs */
-       tlbi    alle2
-       dsb     sy
-
        /* Install stub vectors */
        adr_l   x0, __hyp_stub_vectors
        msr     vbar_el2, x0
index 70254a6..ce9e5e5 100644 (file)
@@ -164,22 +164,3 @@ alternative_endif
 
        eret
 ENDPROC(__fpsimd_guest_restore)
-
-/*
- * When using the extended idmap, we don't have a trampoline page we can use
- * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap
- * directly would be ideal, but if we're using the extended idmap then the
- * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by
- * kvm_call_hyp using kern_hyp_va.
- *
- * x0: HYP boot pgd
- * x1: HYP phys_idmap_start
- */
-ENTRY(__extended_idmap_trampoline)
-       mov     x4, x1
-       adr_l   x3, __kvm_hyp_reset
-
-       /* insert __kvm_hyp_reset()s offset into phys_idmap_start */
-       bfi     x4, x3, #0, #PAGE_SHIFT
-       br      x4
-ENDPROC(__extended_idmap_trampoline)
index 2d87f36..f6d9694 100644 (file)
@@ -62,6 +62,21 @@ ENTRY(__vhe_hyp_call)
        isb
        ret
 ENDPROC(__vhe_hyp_call)
+
+/*
+ * Compute the idmap address of __kvm_hyp_reset based on the idmap
+ * start passed as a parameter, and jump there.
+ *
+ * x0: HYP phys_idmap_start
+ */
+ENTRY(__kvm_hyp_teardown)
+       mov     x4, x0
+       adr_l   x3, __kvm_hyp_reset
+
+       /* insert __kvm_hyp_reset()'s offset into phys_idmap_start */
+       bfi     x4, x3, #0, #PAGE_SHIFT
+       br      x4
+ENDPROC(__kvm_hyp_teardown)
        
 el1_sync:                              // Guest trapped into EL2
        save_x0_to_x3
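
The bfi in __kvm_hyp_teardown above builds the idmap address of __kvm_hyp_reset. The same computation rendered in C (a sketch, assuming the HYP idmap text fits in a single page, which kvm_mmu_init() checks):

        /* Keep the page of the idmap'd init text, splice in the routine's
         * offset within that page. */
        unsigned long target = (phys_idmap_start & PAGE_MASK) |
                               ((unsigned long)__kvm_hyp_reset & ~PAGE_MASK);
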
index 4373997..ae7855f 100644 (file)
@@ -299,9 +299,16 @@ static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%
 
 static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
 {
-       unsigned long str_va = (unsigned long)__hyp_panic_string;
+       unsigned long str_va;
 
-       __hyp_do_panic(hyp_kern_va(str_va),
+       /*
+        * Force the panic string to be loaded from the literal pool,
+        * making sure it is a kernel address and not a PC-relative
+        * reference.
+        */
+       asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va));
+
+       __hyp_do_panic(str_va,
                       spsr,  elr,
                       read_sysreg(esr_el2),   read_sysreg_el2(far),
                       read_sysreg(hpfar_el2), par,
index b1ad730..5bc4608 100644 (file)
@@ -65,7 +65,7 @@ static bool cpu_has_32bit_el1(void)
  * We currently assume that the number of HW registers is uniform
  * across all CPUs (see cpuinfo_sanity_check).
  */
-int kvm_arch_dev_ioctl_check_extension(long ext)
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 {
        int r;
 
@@ -86,6 +86,12 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
        case KVM_CAP_VCPU_ATTRIBUTES:
                r = 1;
                break;
+       case KVM_CAP_MSI_DEVID:
+               if (!kvm)
+                       r = -EINVAL;
+               else
+                       r = kvm->arch.vgic.msis_require_devid;
+               break;
        default:
                r = 0;
        }
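
The capability is answered per-VM (it depends on whether the in-kernel vGIC has an ITS), so userspace asks on the VM file descriptor rather than on /dev/kvm. A minimal VMM-side sketch (vm_fd is assumed to be an existing KVM VM descriptor):

        /* Non-zero means MSI routing entries for this VM must carry a device ID. */
        int needs_devid = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MSI_DEVID);
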
@@ -98,7 +104,7 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
  * @vcpu: The VCPU pointer
  *
  * This function finds the right table above and sets the registers on
- * the virtual CPU struct to their architectually defined reset
+ * the virtual CPU struct to their architecturally defined reset
  * values.
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
@@ -132,31 +138,3 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        /* Reset timer */
        return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
-
-extern char __hyp_idmap_text_start[];
-
-unsigned long kvm_hyp_reset_entry(void)
-{
-       if (!__kvm_cpu_uses_extended_idmap()) {
-               unsigned long offset;
-
-               /*
-                * Find the address of __kvm_hyp_reset() in the trampoline page.
-                * This is present in the running page tables, and the boot page
-                * tables, so we call the code here to start the trampoline
-                * dance in reverse.
-                */
-               offset = (unsigned long)__kvm_hyp_reset
-                        - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK);
-
-               return TRAMPOLINE_VA + offset;
-       } else {
-               /*
-                * KVM is running with merged page tables, which don't have the
-                * trampoline page mapped. We know the idmap is still mapped,
-                * but can't be called into directly. Use
-                * __extended_idmap_trampoline to do the call.
-                */
-               return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline);
-       }
-}
index a57d650..b0b225c 100644 (file)
@@ -1546,7 +1546,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
                                struct sys_reg_params *params)
 {
        u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
-       int cp;
+       int cp = -1;
 
        switch(hsr_ec) {
        case ESR_ELx_EC_CP15_32:
@@ -1558,7 +1558,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
                cp = 14;
                break;
        default:
-               WARN_ON((cp = -1));
+               WARN_ON(1);
        }
 
        kvm_err("Unsupported guest CP%d access at: %08lx\n",
index 60d57c5..bdc25aa 100644 (file)
@@ -397,7 +397,7 @@ static int __init init_axis_flash(void)
        if (!romfs_in_flash) {
                /* Create an RAM device for the root partition (romfs). */
 
-#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0) || (CONFIG_MTDRAM_ABS_POS != 0)
+#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0)
                /* No use trying to boot this kernel from RAM. Panic! */
                printk(KERN_EMERG "axisflashmap: Cannot create an MTD RAM "
                       "device due to kernel (mis)configuration!\n");
index bd10d3b..87656c4 100644 (file)
@@ -320,7 +320,7 @@ static int __init init_axis_flash(void)
         * but its size must be configured as 0 so as not to conflict
         * with our usage.
         */
-#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0) || (CONFIG_MTDRAM_ABS_POS != 0)
+#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0)
        if (!romfs_in_flash && !nand_boot) {
                printk(KERN_EMERG "axisflashmap: Cannot create an MTD RAM "
                       "device; configure CONFIG_MTD_MTDRAM with size = 0!\n");
index fc3ecb5..2a120bb 100644 (file)
@@ -82,9 +82,6 @@ extern pgprot_t       pci_phys_mem_access_prot(struct file *file,
                                         pgprot_t prot);
 
 #define HAVE_ARCH_PCI_RESOURCE_TO_USER
-extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
-                                const struct resource *rsrc,
-                                resource_size_t *start, resource_size_t *end);
 
 extern void pcibios_setup_bus_devices(struct pci_bus *bus);
 extern void pcibios_setup_bus_self(struct pci_bus *bus);
index 14cba60..81556b8 100644 (file)
@@ -218,33 +218,6 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
        return NULL;
 }
 
-/*
- * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
- * device mapping.
- */
-static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
-                                     pgprot_t protection,
-                                     enum pci_mmap_state mmap_state,
-                                     int write_combine)
-{
-       pgprot_t prot = protection;
-
-       /* Write combine is always 0 on non-memory space mappings. On
-        * memory space, if the user didn't pass 1, we check for a
-        * "prefetchable" resource. This is a bit hackish, but we use
-        * this to workaround the inability of /sysfs to provide a write
-        * combine bit
-        */
-       if (mmap_state != pci_mmap_mem)
-               write_combine = 0;
-       else if (write_combine == 0) {
-               if (rp->flags & IORESOURCE_PREFETCH)
-                       write_combine = 1;
-       }
-
-       return pgprot_noncached(prot);
-}
-
 /*
  * This one is used by /dev/mem and fbdev who have no clue about the
  * PCI device, it tries to find the PCI device first and calls the
@@ -317,9 +290,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                return -EINVAL;
 
        vma->vm_pgoff = offset >> PAGE_SHIFT;
-       vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
-                                                 vma->vm_page_prot,
-                                                 mmap_state, write_combine);
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
        ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                               vma->vm_end - vma->vm_start, vma->vm_page_prot);
@@ -473,39 +444,25 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar,
                          const struct resource *rsrc,
                          resource_size_t *start, resource_size_t *end)
 {
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       resource_size_t offset = 0;
+       struct pci_bus_region region;
 
-       if (hose == NULL)
+       if (rsrc->flags & IORESOURCE_IO) {
+               pcibios_resource_to_bus(dev->bus, &region,
+                                       (struct resource *) rsrc);
+               *start = region.start;
+               *end = region.end;
                return;
+       }
 
-       if (rsrc->flags & IORESOURCE_IO)
-               offset = (unsigned long)hose->io_base_virt - _IO_BASE;
-
-       /* We pass a fully fixed up address to userland for MMIO instead of
-        * a BAR value because X is lame and expects to be able to use that
-        * to pass to /dev/mem !
+       /* We pass a CPU physical address to userland for MMIO instead of a
+        * BAR value because X is lame and expects to be able to use that
+        * to pass to /dev/mem!
         *
-        * That means that we'll have potentially 64 bits values where some
-        * userland apps only expect 32 (like X itself since it thinks only
-        * Sparc has 64 bits MMIO) but if we don't do that, we break it on
-        * 32 bits CHRPs :-(
-        *
-        * Hopefully, the sysfs insterface is immune to that gunk. Once X
-        * has been fixed (and the fix spread enough), we can re-enable the
-        * 2 lines below and pass down a BAR value to userland. In that case
-        * we'll also have to re-enable the matching code in
-        * __pci_mmap_make_offset().
-        *
-        * BenH.
+        * That means we may have 64-bit values where some apps only expect
+        * 32 (like X itself since it thinks only Sparc has 64-bit MMIO).
         */
-#if 0
-       else if (rsrc->flags & IORESOURCE_MEM)
-               offset = hose->pci_mem_offset;
-#endif
-
-       *start = rsrc->start - offset;
-       *end = rsrc->end - offset;
+       *start = rsrc->start;
+       *end = rsrc->end;
 }
 
 /**
index ac91939..2986713 100644 (file)
@@ -1488,6 +1488,7 @@ config CPU_MIPS64_R2
        select CPU_SUPPORTS_HIGHMEM
        select CPU_SUPPORTS_HUGEPAGES
        select CPU_SUPPORTS_MSA
+       select HAVE_KVM
        help
          Choose this option to build a kernel for release 2 or later of the
          MIPS64 architecture.  Many modern embedded systems with a 64-bit
@@ -1505,6 +1506,7 @@ config CPU_MIPS64_R6
        select CPU_SUPPORTS_MSA
        select GENERIC_CSUM
        select MIPS_O32_FP64_SUPPORT if MIPS32_O32
+       select HAVE_KVM
        help
          Choose this option to build a kernel for release 6 or later of the
          MIPS64 architecture.  New MIPS processors, starting with the Warrior
index 3b0e51d..c5b04e7 100644 (file)
@@ -45,7 +45,7 @@
 /*
  * Returns the kernel segment base of a given address
  */
-#define KSEGX(a)               ((_ACAST32_ (a)) & 0xe0000000)
+#define KSEGX(a)               ((_ACAST32_(a)) & _ACAST32_(0xe0000000))
 
 /*
  * Returns the physical address of a CKSEGx / XKPHYS address
index 36a391d..b54bcad 100644 (file)
@@ -19,6 +19,9 @@
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 
+#include <asm/inst.h>
+#include <asm/mipsregs.h>
+
 /* MIPS KVM register ids */
 #define MIPS_CP0_32(_R, _S)                                    \
        (KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S)))
 #define KVM_REG_MIPS_CP0_CONFIG7       MIPS_CP0_32(16, 7)
 #define KVM_REG_MIPS_CP0_XCONTEXT      MIPS_CP0_64(20, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC      MIPS_CP0_64(30, 0)
+#define KVM_REG_MIPS_CP0_KSCRATCH1     MIPS_CP0_64(31, 2)
+#define KVM_REG_MIPS_CP0_KSCRATCH2     MIPS_CP0_64(31, 3)
+#define KVM_REG_MIPS_CP0_KSCRATCH3     MIPS_CP0_64(31, 4)
+#define KVM_REG_MIPS_CP0_KSCRATCH4     MIPS_CP0_64(31, 5)
+#define KVM_REG_MIPS_CP0_KSCRATCH5     MIPS_CP0_64(31, 6)
+#define KVM_REG_MIPS_CP0_KSCRATCH6     MIPS_CP0_64(31, 7)
 
 
 #define KVM_MAX_VCPUS          1
 
 
 
-/* Special address that contains the comm page, used for reducing # of traps */
-#define KVM_GUEST_COMMPAGE_ADDR                0x0
+/*
+ * Special address that contains the comm page, used for reducing # of traps
+ * This needs to be within 32Kb of 0x0 (so the zero register can be used), but
+ * preferably not at 0x0 so that most kernel NULL pointer dereferences can be
+ * caught.
+ */
+#define KVM_GUEST_COMMPAGE_ADDR                ((PAGE_SIZE > 0x8000) ? 0 : \
+                                        (0x8000 - PAGE_SIZE))
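
The new placement rule is easy to sanity-check: the comm page must stay within 32 KB of 0 so the zero register can serve as the base, while avoiding address 0 when the page size allows it. A standalone sketch evaluating the macro for common MIPS page sizes:

        #include <stdio.h>

        #define COMMPAGE_ADDR(page_size) \
                (((page_size) > 0x8000UL) ? 0UL : (0x8000UL - (page_size)))

        int main(void)
        {
                unsigned long sizes[] = { 0x1000, 0x4000, 0x10000 };    /* 4K, 16K, 64K */

                for (int i = 0; i < 3; i++)
                        printf("PAGE_SIZE=0x%lx -> KVM_GUEST_COMMPAGE_ADDR=0x%lx\n",
                               sizes[i], COMMPAGE_ADDR(sizes[i]));
                return 0;
        }
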
 
 #define KVM_GUEST_KERNEL_MODE(vcpu)    ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
                                        ((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
 #define KVM_INVALID_ADDR               0xdeadbeef
 
 extern atomic_t kvm_mips_instance;
-extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 
 struct kvm_vm_stat {
        u32 remote_tlb_flush;
@@ -126,28 +138,6 @@ struct kvm_vcpu_stat {
        u32 halt_wakeup;
 };
 
-enum kvm_mips_exit_types {
-       WAIT_EXITS,
-       CACHE_EXITS,
-       SIGNAL_EXITS,
-       INT_EXITS,
-       COP_UNUSABLE_EXITS,
-       TLBMOD_EXITS,
-       TLBMISS_LD_EXITS,
-       TLBMISS_ST_EXITS,
-       ADDRERR_ST_EXITS,
-       ADDRERR_LD_EXITS,
-       SYSCALL_EXITS,
-       RESVD_INST_EXITS,
-       BREAK_INST_EXITS,
-       TRAP_INST_EXITS,
-       MSA_FPE_EXITS,
-       FPE_EXITS,
-       MSA_DISABLED_EXITS,
-       FLUSH_DCACHE_EXITS,
-       MAX_KVM_MIPS_EXIT_TYPES
-};
-
 struct kvm_arch_memory_slot {
 };
 
@@ -215,73 +205,6 @@ struct mips_coproc {
 #define MIPS_CP0_CONFIG4_SEL   4
 #define MIPS_CP0_CONFIG5_SEL   5
 
-/* Config0 register bits */
-#define CP0C0_M                        31
-#define CP0C0_K23              28
-#define CP0C0_KU               25
-#define CP0C0_MDU              20
-#define CP0C0_MM               17
-#define CP0C0_BM               16
-#define CP0C0_BE               15
-#define CP0C0_AT               13
-#define CP0C0_AR               10
-#define CP0C0_MT               7
-#define CP0C0_VI               3
-#define CP0C0_K0               0
-
-/* Config1 register bits */
-#define CP0C1_M                        31
-#define CP0C1_MMU              25
-#define CP0C1_IS               22
-#define CP0C1_IL               19
-#define CP0C1_IA               16
-#define CP0C1_DS               13
-#define CP0C1_DL               10
-#define CP0C1_DA               7
-#define CP0C1_C2               6
-#define CP0C1_MD               5
-#define CP0C1_PC               4
-#define CP0C1_WR               3
-#define CP0C1_CA               2
-#define CP0C1_EP               1
-#define CP0C1_FP               0
-
-/* Config2 Register bits */
-#define CP0C2_M                        31
-#define CP0C2_TU               28
-#define CP0C2_TS               24
-#define CP0C2_TL               20
-#define CP0C2_TA               16
-#define CP0C2_SU               12
-#define CP0C2_SS               8
-#define CP0C2_SL               4
-#define CP0C2_SA               0
-
-/* Config3 Register bits */
-#define CP0C3_M                        31
-#define CP0C3_ISA_ON_EXC       16
-#define CP0C3_ULRI             13
-#define CP0C3_DSPP             10
-#define CP0C3_LPA              7
-#define CP0C3_VEIC             6
-#define CP0C3_VInt             5
-#define CP0C3_SP               4
-#define CP0C3_MT               2
-#define CP0C3_SM               1
-#define CP0C3_TL               0
-
-/* MMU types, the first four entries have the same layout as the
-   CP0C0_MT field.  */
-enum mips_mmu_types {
-       MMU_TYPE_NONE,
-       MMU_TYPE_R4000,
-       MMU_TYPE_RESERVED,
-       MMU_TYPE_FMT,
-       MMU_TYPE_R3000,
-       MMU_TYPE_R6000,
-       MMU_TYPE_R8000
-};
-
 /* Resume Flags */
 #define RESUME_FLAG_DR         (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST       (1<<1)  /* Resume host? */
@@ -298,11 +221,6 @@ enum emulation_result {
        EMULATE_PRIV_FAIL,
 };
 
-#define MIPS3_PG_G     0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
-#define MIPS3_PG_V     0x00000002 /* Valid */
-#define MIPS3_PG_NV    0x00000000
-#define MIPS3_PG_D     0x00000004 /* Dirty */
-
 #define mips3_paddr_to_tlbpfn(x) \
        (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
 #define mips3_tlbpfn_to_paddr(x) \
@@ -313,13 +231,11 @@ enum emulation_result {
 
 #define VPN2_MASK              0xffffe000
 #define KVM_ENTRYHI_ASID       MIPS_ENTRYHI_ASID
-#define TLB_IS_GLOBAL(x)       (((x).tlb_lo0 & MIPS3_PG_G) &&          \
-                                ((x).tlb_lo1 & MIPS3_PG_G))
+#define TLB_IS_GLOBAL(x)       ((x).tlb_lo[0] & (x).tlb_lo[1] & ENTRYLO_G)
 #define TLB_VPN2(x)            ((x).tlb_hi & VPN2_MASK)
 #define TLB_ASID(x)            ((x).tlb_hi & KVM_ENTRYHI_ASID)
-#define TLB_IS_VALID(x, va)    (((va) & (1 << PAGE_SHIFT))             \
-                                ? ((x).tlb_lo1 & MIPS3_PG_V)           \
-                                : ((x).tlb_lo0 & MIPS3_PG_V))
+#define TLB_LO_IDX(x, va)      (((va) >> PAGE_SHIFT) & 1)
+#define TLB_IS_VALID(x, va)    ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V)
 #define TLB_HI_VPN2_HIT(x, y)  ((TLB_VPN2(x) & ~(x).tlb_mask) ==       \
                                 ((y) & VPN2_MASK & ~(x).tlb_mask))
 #define TLB_HI_ASID_HIT(x, y)  (TLB_IS_GLOBAL(x) ||                    \
@@ -328,26 +244,23 @@ enum emulation_result {
 struct kvm_mips_tlb {
        long tlb_mask;
        long tlb_hi;
-       long tlb_lo0;
-       long tlb_lo1;
+       long tlb_lo[2];
 };
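
With tlb_lo[] as an array, the even/odd page of a TLB pair is selected by a single address bit rather than by duplicated lo0/lo1 code paths. A standalone sketch of the TLB_LO_IDX() selection (4 KB pages assumed for the example):

        #include <stdio.h>

        #define PAGE_SHIFT 12
        #define TLB_LO_IDX(va) (((va) >> PAGE_SHIFT) & 1)

        int main(void)
        {
                unsigned long vas[] = { 0x00400000, 0x00401000, 0x00402000 };

                for (int i = 0; i < 3; i++)
                        printf("va=0x%08lx -> tlb_lo[%lu]\n",
                               vas[i], TLB_LO_IDX(vas[i]));
                return 0;
        }
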
 
-#define KVM_MIPS_FPU_FPU       0x1
-#define KVM_MIPS_FPU_MSA       0x2
+#define KVM_MIPS_AUX_FPU       0x1
+#define KVM_MIPS_AUX_MSA       0x2
 
 #define KVM_MIPS_GUEST_TLB_SIZE        64
 struct kvm_vcpu_arch {
-       void *host_ebase, *guest_ebase;
+       void *guest_ebase;
        int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
        unsigned long host_stack;
        unsigned long host_gp;
 
        /* Host CP0 registers used when handling exits from guest */
        unsigned long host_cp0_badvaddr;
-       unsigned long host_cp0_cause;
        unsigned long host_cp0_epc;
-       unsigned long host_cp0_entryhi;
-       uint32_t guest_inst;
+       u32 host_cp0_cause;
 
        /* GPRS */
        unsigned long gprs[32];
@@ -357,8 +270,8 @@ struct kvm_vcpu_arch {
 
        /* FPU State */
        struct mips_fpu_struct fpu;
-       /* Which FPU state is loaded (KVM_MIPS_FPU_*) */
-       unsigned int fpu_inuse;
+       /* Which auxiliary state is loaded (KVM_MIPS_AUX_*) */
+       unsigned int aux_inuse;
 
        /* COP0 State */
        struct mips_coproc *cop0;
@@ -370,11 +283,11 @@ struct kvm_vcpu_arch {
 
        struct hrtimer comparecount_timer;
        /* Count timer control KVM register */
-       uint32_t count_ctl;
+       u32 count_ctl;
        /* Count bias from the raw time */
-       uint32_t count_bias;
+       u32 count_bias;
        /* Frequency of timer in Hz */
-       uint32_t count_hz;
+       u32 count_hz;
        /* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */
        s64 count_dyn_bias;
        /* Resume time */
@@ -388,7 +301,7 @@ struct kvm_vcpu_arch {
        /* Bitmask of pending exceptions to be cleared */
        unsigned long pending_exceptions_clr;
 
-       unsigned long pending_load_cause;
+       u32 pending_load_cause;
 
        /* Save/Restore the entryhi register when we are preempted/scheduled back in */
        unsigned long preempt_entryhi;
@@ -397,8 +310,8 @@ struct kvm_vcpu_arch {
        struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE];
 
        /* Cached guest kernel/user ASIDs */
-       uint32_t guest_user_asid[NR_CPUS];
-       uint32_t guest_kernel_asid[NR_CPUS];
+       u32 guest_user_asid[NR_CPUS];
+       u32 guest_kernel_asid[NR_CPUS];
        struct mm_struct guest_kernel_mm, guest_user_mm;
 
        int last_sched_cpu;
@@ -408,6 +321,7 @@ struct kvm_vcpu_arch {
 
        u8 fpu_enabled;
        u8 msa_enabled;
+       u8 kscratch_enabled;
 };
 
 
@@ -461,6 +375,18 @@ struct kvm_vcpu_arch {
 #define kvm_write_c0_guest_config7(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
 #define kvm_read_c0_guest_errorepc(cop0)       (cop0->reg[MIPS_CP0_ERROR_PC][0])
 #define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
+#define kvm_read_c0_guest_kscratch1(cop0)      (cop0->reg[MIPS_CP0_DESAVE][2])
+#define kvm_read_c0_guest_kscratch2(cop0)      (cop0->reg[MIPS_CP0_DESAVE][3])
+#define kvm_read_c0_guest_kscratch3(cop0)      (cop0->reg[MIPS_CP0_DESAVE][4])
+#define kvm_read_c0_guest_kscratch4(cop0)      (cop0->reg[MIPS_CP0_DESAVE][5])
+#define kvm_read_c0_guest_kscratch5(cop0)      (cop0->reg[MIPS_CP0_DESAVE][6])
+#define kvm_read_c0_guest_kscratch6(cop0)      (cop0->reg[MIPS_CP0_DESAVE][7])
+#define kvm_write_c0_guest_kscratch1(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][2] = (val))
+#define kvm_write_c0_guest_kscratch2(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][3] = (val))
+#define kvm_write_c0_guest_kscratch3(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][4] = (val))
+#define kvm_write_c0_guest_kscratch4(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][5] = (val))
+#define kvm_write_c0_guest_kscratch5(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][6] = (val))
+#define kvm_write_c0_guest_kscratch6(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][7] = (val))
 
 /*
  * Some of the guest registers may be modified asynchronously (e.g. from a
@@ -474,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg,
        unsigned long temp;
        do {
                __asm__ __volatile__(
-               "       .set    mips3                           \n"
+               "       .set    "MIPS_ISA_ARCH_LEVEL"           \n"
                "       " __LL "%0, %1                          \n"
                "       or      %0, %2                          \n"
                "       " __SC  "%0, %1                         \n"
@@ -490,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg,
        unsigned long temp;
        do {
                __asm__ __volatile__(
-               "       .set    mips3                           \n"
+               "       .set    "MIPS_ISA_ARCH_LEVEL"           \n"
                "       " __LL "%0, %1                          \n"
                "       and     %0, %2                          \n"
                "       " __SC  "%0, %1                         \n"
@@ -507,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
        unsigned long temp;
        do {
                __asm__ __volatile__(
-               "       .set    mips3                           \n"
+               "       .set    "MIPS_ISA_ARCH_LEVEL"           \n"
                "       " __LL "%0, %1                          \n"
                "       and     %0, %2                          \n"
                "       or      %0, %3                          \n"
@@ -542,7 +468,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
 
 static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
 {
-       return (!__builtin_constant_p(cpu_has_fpu) || cpu_has_fpu) &&
+       return (!__builtin_constant_p(raw_cpu_has_fpu) || raw_cpu_has_fpu) &&
                vcpu->fpu_enabled;
 }
 
@@ -589,9 +515,11 @@ struct kvm_mips_callbacks {
        void (*dequeue_io_int)(struct kvm_vcpu *vcpu,
                               struct kvm_mips_interrupt *irq);
        int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority,
-                          uint32_t cause);
+                          u32 cause);
        int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority,
-                        uint32_t cause);
+                        u32 cause);
+       unsigned long (*num_regs)(struct kvm_vcpu *vcpu);
+       int (*copy_reg_indices)(struct kvm_vcpu *vcpu, u64 __user *indices);
        int (*get_one_reg)(struct kvm_vcpu *vcpu,
                           const struct kvm_one_reg *reg, s64 *v);
        int (*set_one_reg)(struct kvm_vcpu *vcpu,
@@ -605,8 +533,13 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
 /* Debug: dump vcpu state */
 int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
 
-/* Trampoline ASM routine to start running in "Guest" context */
-extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
+
+/* Building of entry/exception code */
+int kvm_mips_entry_setup(void);
+void *kvm_mips_build_vcpu_run(void *addr);
+void *kvm_mips_build_exception(void *addr, void *handler);
+void *kvm_mips_build_exit(void *addr);
 
 /* FPU/MSA context management */
 void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu);
@@ -622,11 +555,11 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu);
 void kvm_lose_fpu(struct kvm_vcpu *vcpu);
 
 /* TLB handling */
-uint32_t kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
 
-uint32_t kvm_get_user_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
 
-uint32_t kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
+u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
 
 extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
                                           struct kvm_vcpu *vcpu);
@@ -635,22 +568,24 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
                                              struct kvm_vcpu *vcpu);
 
 extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-                                               struct kvm_mips_tlb *tlb,
-                                               unsigned long *hpa0,
-                                               unsigned long *hpa1);
+                                               struct kvm_mips_tlb *tlb);
 
-extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
-                                                    uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
+                                                    u32 *opc,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause,
-                                                   uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
+                                                   u32 *opc,
                                                    struct kvm_run *run,
                                                    struct kvm_vcpu *vcpu);
 
 extern void kvm_mips_dump_host_tlbs(void);
 extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu);
+extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
+                                  unsigned long entrylo0,
+                                  unsigned long entrylo1,
+                                  int flush_dcache_mask);
 extern void kvm_mips_flush_host_tlb(int skip_kseg0);
 extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
 
@@ -667,90 +602,90 @@ extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu);
 
 /* Emulation */
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu);
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause);
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu);
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
 
-extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause,
-                                                  uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_inst(u32 cause,
+                                                  u32 *opc,
                                                   struct kvm_run *run,
                                                   struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
-                                                     uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_syscall(u32 cause,
+                                                     u32 *opc,
                                                      struct kvm_run *run,
                                                      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
-                                                        uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
+                                                        u32 *opc,
                                                         struct kvm_run *run,
                                                         struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
-                                                       uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
+                                                       u32 *opc,
                                                        struct kvm_run *run,
                                                        struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
-                                                        uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
+                                                        u32 *opc,
                                                         struct kvm_run *run,
                                                         struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
-                                                       uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
+                                                       u32 *opc,
                                                        struct kvm_run *run,
                                                        struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
-                                                    uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
+                                                    u32 *opc,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
-                                                     uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
+                                                     u32 *opc,
                                                      struct kvm_run *run,
                                                      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_handle_ri(unsigned long cause,
-                                               uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_ri(u32 cause,
+                                               u32 *opc,
                                                struct kvm_run *run,
                                                struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
-                                                    uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
+                                                    u32 *opc,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
-                                                    uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
+                                                    u32 *opc,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
-                                                      uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
+                                                      u32 *opc,
                                                       struct kvm_run *run,
                                                       struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
-                                                        uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
+                                                        u32 *opc,
                                                         struct kvm_run *run,
                                                         struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
-                                                     uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
+                                                     u32 *opc,
                                                      struct kvm_run *run,
                                                      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
-                                                        uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
+                                                        u32 *opc,
                                                         struct kvm_run *run,
                                                         struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
                                                         struct kvm_run *run);
 
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack);
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu);
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count);
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack);
 void kvm_mips_init_count(struct kvm_vcpu *vcpu);
 int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
 int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
@@ -759,27 +694,27 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu);
 void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
 
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu);
 
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst,
-                                            uint32_t *opc,
-                                            uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+                                            u32 *opc,
+                                            u32 cause,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst,
-                                          uint32_t *opc,
-                                          uint32_t cause,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+                                          u32 *opc,
+                                          u32 cause,
                                           struct kvm_run *run,
                                           struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_store(uint32_t inst,
-                                            uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+                                            u32 cause,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_load(uint32_t inst,
-                                           uint32_t cause,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+                                           u32 cause,
                                            struct kvm_run *run,
                                            struct kvm_vcpu *vcpu);
 
@@ -789,13 +724,13 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
 
 /* Dynamic binary translation */
-extern int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
-                                     struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_cache_index(union mips_instruction inst,
+                                     u32 *opc, struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
                                   struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
                               struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
                               struct kvm_vcpu *vcpu);
 
 /* Misc */
index d68e685..bd8b9bb 100644 (file)
@@ -55,7 +55,7 @@
 #define cpu_has_mipsmt         0
 #define cpu_has_vint           0
 #define cpu_has_veic           0
-#define cpu_hwrena_impl_bits   0xc0000000
+#define cpu_hwrena_impl_bits   (MIPS_HWRENA_IMPL1 | MIPS_HWRENA_IMPL2)
 #define cpu_has_wsbh            1
 
 #define cpu_has_rixi           (cpu_data[0].cputype != CPU_CAVIUM_OCTEON)
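A quick standalone check (illustrative only) that the symbolic form written above equals the old magic constant: MIPS_HWR_IMPL1/IMPL2 are RDHWR registers 30 and 31, so their HWREna enable bits are bits 30 and 31.

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t impl1 = UINT32_C(1) << 30;	/* MIPS_HWRENA_IMPL1 */
	uint32_t impl2 = UINT32_C(1) << 31;	/* MIPS_HWRENA_IMPL2 */

	assert((impl1 | impl2) == UINT32_C(0xc0000000));
	return 0;
}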
index e1ca65c..def9d8d 100644 (file)
@@ -53,7 +53,7 @@
 #define CP0_SEGCTL2 $5, 4
 #define CP0_WIRED $6
 #define CP0_INFO $7
-#define CP0_HWRENA $7, 0
+#define CP0_HWRENA $7
 #define CP0_BADVADDR $8
 #define CP0_BADINSTR $8, 1
 #define CP0_COUNT $9
 #define TX49_CONF_CWFON                (_ULCAST_(1) << 27)
 
 /* Bits specific to the MIPS32/64 PRA. */
+#define MIPS_CONF_VI           (_ULCAST_(1) <<  3)
 #define MIPS_CONF_MT           (_ULCAST_(7) <<  7)
 #define MIPS_CONF_MT_TLB       (_ULCAST_(1) <<  7)
 #define MIPS_CONF_MT_FTLB      (_ULCAST_(4) <<  7)
 #define MIPS_CDMMBASE_ADDR_SHIFT 11
 #define MIPS_CDMMBASE_ADDR_START 15
 
+/* RDHWR register numbers */
+#define MIPS_HWR_CPUNUM                0       /* CPU number */
+#define MIPS_HWR_SYNCISTEP     1       /* SYNCI step size */
+#define MIPS_HWR_CC            2       /* Cycle counter */
+#define MIPS_HWR_CCRES         3       /* Cycle counter resolution */
+#define MIPS_HWR_ULR           29      /* UserLocal */
+#define MIPS_HWR_IMPL1         30      /* Implementation dependent */
+#define MIPS_HWR_IMPL2         31      /* Implementation dependent */
+
+/* Bits in HWREna register */
+#define MIPS_HWRENA_CPUNUM     (_ULCAST_(1) << MIPS_HWR_CPUNUM)
+#define MIPS_HWRENA_SYNCISTEP  (_ULCAST_(1) << MIPS_HWR_SYNCISTEP)
+#define MIPS_HWRENA_CC         (_ULCAST_(1) << MIPS_HWR_CC)
+#define MIPS_HWRENA_CCRES      (_ULCAST_(1) << MIPS_HWR_CCRES)
+#define MIPS_HWRENA_ULR                (_ULCAST_(1) << MIPS_HWR_ULR)
+#define MIPS_HWRENA_IMPL1      (_ULCAST_(1) << MIPS_HWR_IMPL1)
+#define MIPS_HWRENA_IMPL2      (_ULCAST_(1) << MIPS_HWR_IMPL2)
+
 /*
  * Bitfields in the TX39 family CP0 Configuration Register 3
  */
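To illustrate how these definitions relate: a user-mode RDHWR of hardware register rd only succeeds directly when HWREna bit rd is set; otherwise it traps and is emulated by simulate_rdhwr() further down in this series. A minimal user-space sketch of that gating, with the register numbers copied from the MIPS_HWR_* values above:

#include <stdbool.h>
#include <stdio.h>

#define MIPS_HWR_CPUNUM		0	/* CPU number */
#define MIPS_HWR_CC		2	/* Cycle counter */
#define MIPS_HWR_ULR		29	/* UserLocal */

static bool rdhwr_allowed(unsigned int hwrena, unsigned int rd)
{
	return hwrena & (1u << rd);
}

int main(void)
{
	unsigned int hwrena = (1u << MIPS_HWR_CPUNUM) | (1u << MIPS_HWR_ULR);

	printf("CPUNum read from user mode: %d\n",
	       rdhwr_allowed(hwrena, MIPS_HWR_CPUNUM));
	printf("CC read from user mode:     %d\n",
	       rdhwr_allowed(hwrena, MIPS_HWR_CC));	/* 0: trap + emulate */
	return 0;
}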
index 86b239d..9b63cd4 100644 (file)
@@ -80,16 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 
 #define HAVE_ARCH_PCI_RESOURCE_TO_USER
 
-static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
-               const struct resource *rsrc, resource_size_t *start,
-               resource_size_t *end)
-{
-       phys_addr_t size = resource_size(rsrc);
-
-       *start = fixup_bigphys_addr(rsrc->start, size);
-       *end = rsrc->start + size;
-}
-
 /*
  * Dynamic DMA mapping stuff.
  * MIPS has everything mapped statically.
index d7bfdeb..4f5279a 100644 (file)
@@ -21,6 +21,7 @@ extern void *set_vi_handler(int n, vi_handler_t addr);
 
 extern void *set_except_vector(int n, void *addr);
 extern unsigned long ebase;
+extern unsigned int hwrena;
 extern void per_cpu_trap_init(bool);
 extern void cpu_cache_init(void);
 
index b6ecfee..f7929f6 100644 (file)
@@ -104,8 +104,13 @@ Ip_u1s2(_bltz);
 Ip_u1s2(_bltzl);
 Ip_u1u2s3(_bne);
 Ip_u2s3u1(_cache);
+Ip_u1u2(_cfc1);
+Ip_u2u1(_cfcmsa);
+Ip_u1u2(_ctc1);
+Ip_u2u1(_ctcmsa);
 Ip_u2u1s3(_daddiu);
 Ip_u3u1u2(_daddu);
+Ip_u1(_di);
 Ip_u2u1msbu3(_dins);
 Ip_u2u1msbu3(_dinsm);
 Ip_u1u2(_divu);
@@ -141,6 +146,8 @@ Ip_u1(_mfhi);
 Ip_u1(_mflo);
 Ip_u1u2u3(_mtc0);
 Ip_u1u2u3(_mthc0);
+Ip_u1(_mthi);
+Ip_u1(_mtlo);
 Ip_u3u1u2(_mul);
 Ip_u3u1u2(_or);
 Ip_u2u1u3(_ori);
index 8051f9a..77429d1 100644 (file)
 enum major_op {
        spec_op, bcond_op, j_op, jal_op,
        beq_op, bne_op, blez_op, bgtz_op,
-       addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op,
+       addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op,
        andi_op, ori_op, xori_op, lui_op,
        cop0_op, cop1_op, cop2_op, cop1x_op,
        beql_op, bnel_op, blezl_op, bgtzl_op,
-       daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op,
+       daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op,
        spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op,
        lb_op, lh_op, lwl_op, lw_op,
        lbu_op, lhu_op, lwr_op, lwu_op,
        sb_op, sh_op, swl_op, sw_op,
        sdl_op, sdr_op, swr_op, cache_op,
        ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op,
-       lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op,
+       lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op,
        sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op,
-       scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op
+       scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op
 };
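For reference, the renamed enumerators keep their original positions in the 6-bit major-opcode table (pop10 = 0x08, pop30 = 0x18, pop66 = 0x36, pop76 = 0x3e); only the names change to the R6 "POP" spelling. A small illustrative decode, assuming a hand-constructed BEQZC word:

#include <stdio.h>
#include <stdint.h>

#define POP66	0x36	/* pre-R6: ldc2_op; R6: BEQZC/JIC family */

int main(void)
{
	/* hypothetical "beqzc $2, 0x10" word: opcode 0x36, rs = 2 */
	uint32_t insn = (UINT32_C(0x36) << 26) | (2u << 21) | 0x10;
	unsigned int op = insn >> 26;	/* major opcode is bits 31..26 */

	printf("major opcode %#x -> %s\n", op,
	       op == POP66 ? "pop66 (compact branch family)" : "other");
	return 0;
}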
 
 /*
@@ -92,6 +92,50 @@ enum spec3_op {
        rdhwr_op  = 0x3b
 };
 
+/*
+ * Bits 10-6 minor opcode for r6 spec mult/div encodings
+ */
+enum mult_op {
+       mult_mult_op = 0x0,
+       mult_mul_op = 0x2,
+       mult_muh_op = 0x3,
+};
+enum multu_op {
+       multu_multu_op = 0x0,
+       multu_mulu_op = 0x2,
+       multu_muhu_op = 0x3,
+};
+enum div_op {
+       div_div_op = 0x0,
+       div_div6_op = 0x2,
+       div_mod_op = 0x3,
+};
+enum divu_op {
+       divu_divu_op = 0x0,
+       divu_divu6_op = 0x2,
+       divu_modu_op = 0x3,
+};
+enum dmult_op {
+       dmult_dmult_op = 0x0,
+       dmult_dmul_op = 0x2,
+       dmult_dmuh_op = 0x3,
+};
+enum dmultu_op {
+       dmultu_dmultu_op = 0x0,
+       dmultu_dmulu_op = 0x2,
+       dmultu_dmuhu_op = 0x3,
+};
+enum ddiv_op {
+       ddiv_ddiv_op = 0x0,
+       ddiv_ddiv6_op = 0x2,
+       ddiv_dmod_op = 0x3,
+};
+enum ddivu_op {
+       ddivu_ddivu_op = 0x0,
+       ddivu_ddivu6_op = 0x2,
+       ddivu_dmodu_op = 0x3,
+};
+
 /*
  * rt field of bcond opcodes.
  */
@@ -103,7 +147,7 @@ enum rt_op {
        bltzal_op, bgezal_op, bltzall_op, bgezall_op,
        rt_op_0x14, rt_op_0x15, rt_op_0x16, rt_op_0x17,
        rt_op_0x18, rt_op_0x19, rt_op_0x1a, rt_op_0x1b,
-       bposge32_op, rt_op_0x1d, rt_op_0x1e, rt_op_0x1f
+       bposge32_op, rt_op_0x1d, rt_op_0x1e, synci_op
 };
 
 /*
@@ -237,6 +281,21 @@ enum bshfl_func {
        seh_op  = 0x18,
 };
 
+/*
+ * MSA minor opcodes.
+ */
+enum msa_func {
+       msa_elm_op = 0x19,
+};
+
+/*
+ * MSA ELM opcodes.
+ */
+enum msa_elm {
+       msa_ctc_op = 0x3e,
+       msa_cfc_op = 0x7e,
+};
+
 /*
  * func field for MSA MI10 format.
  */
@@ -264,7 +323,7 @@ enum mm_major_op {
        mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op,
        mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op,
        mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op,
-       mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op,
+       mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op,
        mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op,
        mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op,
        mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op,
@@ -360,7 +419,10 @@ enum mm_32axf_minor_op {
        mm_mflo32_op = 0x075,
        mm_jalrhb_op = 0x07c,
        mm_tlbwi_op = 0x08d,
+       mm_mthi32_op = 0x0b5,
        mm_tlbwr_op = 0x0cd,
+       mm_mtlo32_op = 0x0f5,
+       mm_di_op = 0x11d,
        mm_jalrs_op = 0x13c,
        mm_jalrshb_op = 0x17c,
        mm_sync_op = 0x1ad,
@@ -478,6 +540,13 @@ enum mm_32f_73_minor_op {
        mm_fcvts1_op = 0xed,
 };
 
+/*
+ * (microMIPS) POOL32S minor opcodes.
+ */
+enum mm_32s_minor_op {
+       mm_32s_elm_op = 0x16,
+};
+
 /*
  * (microMIPS) POOL16C minor opcodes.
  */
@@ -586,6 +655,36 @@ struct r_format {                  /* Register format */
        ;))))))
 };
 
+struct c0r_format {                    /* C0 register format */
+       __BITFIELD_FIELD(unsigned int opcode : 6,
+       __BITFIELD_FIELD(unsigned int rs : 5,
+       __BITFIELD_FIELD(unsigned int rt : 5,
+       __BITFIELD_FIELD(unsigned int rd : 5,
+       __BITFIELD_FIELD(unsigned int z: 8,
+       __BITFIELD_FIELD(unsigned int sel : 3,
+       ;))))))
+};
+
+struct mfmc0_format {                  /* MFMC0 register format */
+       __BITFIELD_FIELD(unsigned int opcode : 6,
+       __BITFIELD_FIELD(unsigned int rs : 5,
+       __BITFIELD_FIELD(unsigned int rt : 5,
+       __BITFIELD_FIELD(unsigned int rd : 5,
+       __BITFIELD_FIELD(unsigned int re : 5,
+       __BITFIELD_FIELD(unsigned int sc : 1,
+       __BITFIELD_FIELD(unsigned int : 2,
+       __BITFIELD_FIELD(unsigned int sel : 3,
+       ;))))))))
+};
+
+struct co_format {                     /* C0 CO format */
+       __BITFIELD_FIELD(unsigned int opcode : 6,
+       __BITFIELD_FIELD(unsigned int co : 1,
+       __BITFIELD_FIELD(unsigned int code : 19,
+       __BITFIELD_FIELD(unsigned int func : 6,
+       ;))))
+};
+
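The new c0r_format bitfields replace the open-coded shift/mask extraction used by the dyntrans and CP0-emulation code later in this diff. A user-space equivalent of the field layout (the kernel's __BITFIELD_FIELD macro handles endian ordering of the bitfields; here the fields are simply extracted with shifts):

#include <stdio.h>
#include <stdint.h>

/* MFC0/MTC0 layout: opcode[31:26] rs[25:21] rt[20:16] rd[15:11] 0[10:3] sel[2:0] */
static void decode_c0r(uint32_t word)
{
	unsigned int rs  = (word >> 21) & 0x1f;
	unsigned int rt  = (word >> 16) & 0x1f;
	unsigned int rd  = (word >> 11) & 0x1f;
	unsigned int sel = word & 0x7;

	printf("rs=%u rt=%u rd=%u sel=%u\n", rs, rt, rd, sel);
}

int main(void)
{
	decode_c0r(0x40026000);		/* mfc0 $2, $12, 0 (CP0 Status) */
	return 0;
}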
 struct p_format {              /* Performance counter format (R10000) */
        __BITFIELD_FIELD(unsigned int opcode : 6,
        __BITFIELD_FIELD(unsigned int rs : 5,
@@ -937,6 +1036,9 @@ union mips_instruction {
        struct u_format u_format;
        struct c_format c_format;
        struct r_format r_format;
+       struct c0r_format c0r_format;
+       struct mfmc0_format mfmc0_format;
+       struct co_format co_format;
        struct p_format p_format;
        struct f_format f_format;
        struct ma_format ma_format;
index 1ea973b..fae2f94 100644 (file)
@@ -339,71 +339,9 @@ void output_pm_defines(void)
 }
 #endif
 
-void output_cpuinfo_defines(void)
-{
-       COMMENT(" MIPS cpuinfo offsets. ");
-       DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips));
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-       OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask);
-#endif
-}
-
 void output_kvm_defines(void)
 {
        COMMENT(" KVM/MIPS Specfic offsets. ");
-       DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch));
-       OFFSET(VCPU_RUN, kvm_vcpu, run);
-       OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch);
-
-       OFFSET(VCPU_HOST_EBASE, kvm_vcpu_arch, host_ebase);
-       OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase);
-
-       OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack);
-       OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp);
-
-       OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr);
-       OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause);
-       OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc);
-       OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi);
-
-       OFFSET(VCPU_GUEST_INST, kvm_vcpu_arch, guest_inst);
-
-       OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]);
-       OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]);
-       OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]);
-       OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]);
-       OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]);
-       OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]);
-       OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]);
-       OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]);
-       OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]);
-       OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]);
-       OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]);
-       OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]);
-       OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]);
-       OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]);
-       OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]);
-       OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]);
-       OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]);
-       OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]);
-       OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]);
-       OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]);
-       OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]);
-       OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]);
-       OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]);
-       OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]);
-       OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]);
-       OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]);
-       OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]);
-       OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]);
-       OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]);
-       OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]);
-       OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]);
-       OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]);
-       OFFSET(VCPU_LO, kvm_vcpu_arch, lo);
-       OFFSET(VCPU_HI, kvm_vcpu_arch, hi);
-       OFFSET(VCPU_PC, kvm_vcpu_arch, pc);
-       BLANK();
 
        OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]);
        OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]);
@@ -441,14 +379,6 @@ void output_kvm_defines(void)
        OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31);
        OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr);
        BLANK();
-
-       OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0);
-       OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid);
-       OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid);
-
-       OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]);
-       OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]);
-       BLANK();
 }
 
 #ifdef CONFIG_MIPS_CPS
index 6dc3f1f..46c227f 100644 (file)
@@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
                epc += 4 + (insn.i_format.simmediate << 2);
                regs->cp0_epc = epc;
                break;
-       case beqzcjic_op:
+       case pop66_op:
                if (!cpu_has_mips_r6) {
                        ret = -SIGILL;
                        break;
@@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
                /* Compact branch: BEQZC || JIC */
                regs->cp0_epc += 8;
                break;
-       case bnezcjialc_op:
+       case pop76_op:
                if (!cpu_has_mips_r6) {
                        ret = -SIGILL;
                        break;
@@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
                regs->cp0_epc += 8;
                break;
 #endif
-       case cbcond0_op:
-       case cbcond1_op:
+       case pop10_op:
+       case pop30_op:
                /* Only valid for MIPS R6 */
                if (!cpu_has_mips_r6) {
                        ret = -SIGILL;
index 4a1712b..6fb4704 100644 (file)
@@ -619,17 +619,17 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
        perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
                        1, regs, 0);
        switch (rd) {
-       case 0:         /* CPU number */
+       case MIPS_HWR_CPUNUM:           /* CPU number */
                regs->regs[rt] = smp_processor_id();
                return 0;
-       case 1:         /* SYNCI length */
+       case MIPS_HWR_SYNCISTEP:        /* SYNCI length */
                regs->regs[rt] = min(current_cpu_data.dcache.linesz,
                                     current_cpu_data.icache.linesz);
                return 0;
-       case 2:         /* Read count register */
+       case MIPS_HWR_CC:               /* Read count register */
                regs->regs[rt] = read_c0_count();
                return 0;
-       case 3:         /* Count register resolution */
+       case MIPS_HWR_CCRES:            /* Count register resolution */
                switch (current_cpu_type()) {
                case CPU_20KC:
                case CPU_25KF:
@@ -639,7 +639,7 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
                        regs->regs[rt] = 2;
                }
                return 0;
-       case 29:
+       case MIPS_HWR_ULR:              /* Read UserLocal register */
                regs->regs[rt] = ti->tp_value;
                return 0;
        default:
@@ -1859,6 +1859,7 @@ void __noreturn nmi_exception_handler(struct pt_regs *regs)
 #define VECTORSPACING 0x100    /* for EI/VI mode */
 
 unsigned long ebase;
+EXPORT_SYMBOL_GPL(ebase);
 unsigned long exception_handlers[32];
 unsigned long vi_handlers[64];
 
@@ -2063,16 +2064,22 @@ static void configure_status(void)
                         status_set);
 }
 
+unsigned int hwrena;
+EXPORT_SYMBOL_GPL(hwrena);
+
 /* configure HWRENA register */
 static void configure_hwrena(void)
 {
-       unsigned int hwrena = cpu_hwrena_impl_bits;
+       hwrena = cpu_hwrena_impl_bits;
 
        if (cpu_has_mips_r2_r6)
-               hwrena |= 0x0000000f;
+               hwrena |= MIPS_HWRENA_CPUNUM |
+                         MIPS_HWRENA_SYNCISTEP |
+                         MIPS_HWRENA_CC |
+                         MIPS_HWRENA_CCRES;
 
        if (!noulri && cpu_has_userlocal)
-               hwrena |= (1 << 29);
+               hwrena |= MIPS_HWRENA_ULR;
 
        if (hwrena)
                write_c0_hwrena(hwrena);
index 2ae1282..7c56d6b 100644 (file)
@@ -17,6 +17,7 @@ if VIRTUALIZATION
 config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        depends on HAVE_KVM
+       select EXPORT_UASM
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select KVM_MMIO
index 637ebbe..847429d 100644 (file)
@@ -7,9 +7,10 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
 
 common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
 
-kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \
+kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
            interrupt.o stats.o commpage.o \
            dyntrans.o trap_emul.o fpu.o
+kvm-objs += mmu.o
 
 obj-$(CONFIG_KVM)      += kvm.o
 obj-y                  += callback.o tlb.o
index 2d6e976..a36b77e 100644 (file)
@@ -4,7 +4,7 @@
  * for more details.
  *
  * commpage, currently used for Virtual COP0 registers.
- * Mapped into the guest kernel @ 0x0.
+ * Mapped into the guest kernel @ KVM_GUEST_COMMPAGE_ADDR.
  *
  * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
  * Authors: Sanjay Lal <sanjayl@kymasys.com>
index f1527a4..d280894 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/highmem.h>
 #include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 
 #include "commpage.h"
 
-#define SYNCI_TEMPLATE  0x041f0000
-#define SYNCI_BASE(x)   (((x) >> 21) & 0x1f)
-#define SYNCI_OFFSET    ((x) & 0xffff)
+/**
+ * kvm_mips_trans_replace() - Replace trapping instruction in guest memory.
+ * @vcpu:      Virtual CPU.
+ * @opc:       PC of instruction to replace.
+ * @replace:   Instruction to write
+ */
+static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
+                                 union mips_instruction replace)
+{
+       unsigned long paddr, flags;
+       void *vaddr;
+
+       if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) {
+               paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
+                                                           (unsigned long)opc);
+               vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+               vaddr += paddr & ~PAGE_MASK;
+               memcpy(vaddr, (void *)&replace, sizeof(u32));
+               local_flush_icache_range((unsigned long)vaddr,
+                                        (unsigned long)vaddr + 32);
+               kunmap_atomic(vaddr);
+       } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
+               local_irq_save(flags);
+               memcpy((void *)opc, (void *)&replace, sizeof(u32));
+               local_flush_icache_range((unsigned long)opc,
+                                        (unsigned long)opc + 32);
+               local_irq_restore(flags);
+       } else {
+               kvm_err("%s: Invalid address: %p\n", __func__, opc);
+               return -EFAULT;
+       }
 
-#define LW_TEMPLATE     0x8c000000
-#define CLEAR_TEMPLATE  0x00000020
-#define SW_TEMPLATE     0xac000000
+       return 0;
+}
 
-int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc,
                               struct kvm_vcpu *vcpu)
 {
-       int result = 0;
-       unsigned long kseg0_opc;
-       uint32_t synci_inst = 0x0;
+       union mips_instruction nop_inst = { 0 };
 
        /* Replace the CACHE instruction, with a NOP */
-       kseg0_opc =
-           CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-                      (vcpu, (unsigned long) opc));
-       memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
-       local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
-       return result;
+       return kvm_mips_trans_replace(vcpu, opc, nop_inst);
 }
 
 /*
  * Address based CACHE instructions are transformed into synci(s). A little
  * heavy for just D-cache invalidates, but avoids an expensive trap
  */
-int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
                            struct kvm_vcpu *vcpu)
 {
-       int result = 0;
-       unsigned long kseg0_opc;
-       uint32_t synci_inst = SYNCI_TEMPLATE, base, offset;
-
-       base = (inst >> 21) & 0x1f;
-       offset = inst & 0xffff;
-       synci_inst |= (base << 21);
-       synci_inst |= offset;
-
-       kseg0_opc =
-           CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-                      (vcpu, (unsigned long) opc));
-       memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
-       local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
-       return result;
+       union mips_instruction synci_inst = { 0 };
+
+       synci_inst.i_format.opcode = bcond_op;
+       synci_inst.i_format.rs = inst.i_format.rs;
+       synci_inst.i_format.rt = synci_op;
+       if (cpu_has_mips_r6)
+               synci_inst.i_format.simmediate = inst.spec3_format.simmediate;
+       else
+               synci_inst.i_format.simmediate = inst.i_format.simmediate;
+
+       return kvm_mips_trans_replace(vcpu, opc, synci_inst);
 }
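The SYNCI word built above has the REGIMM major opcode (bcond_op, 0x01) with rt = synci_op (0x1f), the base register copied from the original CACHE instruction, and the offset taken from its i_format or spec3_format immediate depending on R6. A standalone sketch of the encoding; the base register and offset below are arbitrary examples:

#include <stdio.h>
#include <stdint.h>

static uint32_t make_synci(unsigned int base, uint16_t offset)
{
	return (UINT32_C(0x01) << 26) |		/* bcond_op (REGIMM) */
	       ((base & 0x1f) << 21) |		/* rs: base register */
	       (UINT32_C(0x1f) << 16) |		/* rt: synci_op */
	       offset;
}

int main(void)
{
	printf("synci 16($4) = 0x%08x\n", make_synci(4, 16));
	return 0;
}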
 
-int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
+                       struct kvm_vcpu *vcpu)
 {
-       int32_t rt, rd, sel;
-       uint32_t mfc0_inst;
-       unsigned long kseg0_opc, flags;
-
-       rt = (inst >> 16) & 0x1f;
-       rd = (inst >> 11) & 0x1f;
-       sel = inst & 0x7;
+       union mips_instruction mfc0_inst = { 0 };
+       u32 rd, sel;
 
-       if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
-               mfc0_inst = CLEAR_TEMPLATE;
-               mfc0_inst |= ((rt & 0x1f) << 16);
-       } else {
-               mfc0_inst = LW_TEMPLATE;
-               mfc0_inst |= ((rt & 0x1f) << 16);
-               mfc0_inst |= offsetof(struct kvm_mips_commpage,
-                                     cop0.reg[rd][sel]);
-       }
+       rd = inst.c0r_format.rd;
+       sel = inst.c0r_format.sel;
 
-       if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-               kseg0_opc =
-                   CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-                              (vcpu, (unsigned long) opc));
-               memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t));
-               local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-       } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-               local_irq_save(flags);
-               memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t));
-               local_flush_icache_range((unsigned long)opc,
-                                        (unsigned long)opc + 32);
-               local_irq_restore(flags);
+       if (rd == MIPS_CP0_ERRCTL && sel == 0) {
+               mfc0_inst.r_format.opcode = spec_op;
+               mfc0_inst.r_format.rd = inst.c0r_format.rt;
+               mfc0_inst.r_format.func = add_op;
        } else {
-               kvm_err("%s: Invalid address: %p\n", __func__, opc);
-               return -EFAULT;
+               mfc0_inst.i_format.opcode = lw_op;
+               mfc0_inst.i_format.rt = inst.c0r_format.rt;
+               mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
+                       offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+               if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8)
+                       mfc0_inst.i_format.simmediate |= 4;
+#endif
        }
 
-       return 0;
+       return kvm_mips_trans_replace(vcpu, opc, mfc0_inst);
 }
 
-int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
+                       struct kvm_vcpu *vcpu)
 {
-       int32_t rt, rd, sel;
-       uint32_t mtc0_inst = SW_TEMPLATE;
-       unsigned long kseg0_opc, flags;
-
-       rt = (inst >> 16) & 0x1f;
-       rd = (inst >> 11) & 0x1f;
-       sel = inst & 0x7;
-
-       mtc0_inst |= ((rt & 0x1f) << 16);
-       mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
-
-       if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-               kseg0_opc =
-                   CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-                              (vcpu, (unsigned long) opc));
-               memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t));
-               local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-       } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-               local_irq_save(flags);
-               memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t));
-               local_flush_icache_range((unsigned long)opc,
-                                        (unsigned long)opc + 32);
-               local_irq_restore(flags);
-       } else {
-               kvm_err("%s: Invalid address: %p\n", __func__, opc);
-               return -EFAULT;
-       }
-
-       return 0;
+       union mips_instruction mtc0_inst = { 0 };
+       u32 rd, sel;
+
+       rd = inst.c0r_format.rd;
+       sel = inst.c0r_format.sel;
+
+       mtc0_inst.i_format.opcode = sw_op;
+       mtc0_inst.i_format.rt = inst.c0r_format.rt;
+       mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
+               offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+       if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8)
+               mtc0_inst.i_format.simmediate |= 4;
+#endif
+
+       return kvm_mips_trans_replace(vcpu, opc, mtc0_inst);
 }
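In both translations the load/store leaves i_format.rs at 0 ($zero), so the signed immediate alone encodes the commpage address or'd with the offset of cop0.reg[rd][sel]; on a 64-bit big-endian kernel each saved register occupies 8 bytes with its low 32 bits in the second word, hence the "+ 4" adjustment. A reduced user-space model of that offset calculation (the layout values are illustrative stand-ins, not the real commpage definition):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	/* Stand-in for the commpage CP0 save area: 32 registers x 8 selects,
	 * each stored as a 'long'.
	 */
	unsigned int rd = 12, sel = 0;		/* CP0 Status, for example */
	size_t off = ((size_t)rd * 8 + sel) * sizeof(long);

	if (sizeof(long) == 8)			/* stand-in for the 64-bit big-endian case */
		off += 4;			/* address the low 32 bits of the slot */

	printf("lw/sw immediate for reg[%u][%u]: %zu\n", rd, sel, off);
	return 0;
}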
index 645c8a1..6eb52b9 100644 (file)
@@ -52,7 +52,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
                goto unaligned;
 
        /* Read the instruction */
-       insn.word = kvm_get_inst((uint32_t *) epc, vcpu);
+       insn.word = kvm_get_inst((u32 *) epc, vcpu);
 
        if (insn.word == KVM_INVALID_INST)
                return KVM_INVALID_INST;
@@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
                nextpc = epc;
                break;
 
-       case blez_op:           /* not really i_format */
-       case blezl_op:
-               /* rt field assumed to be zero */
+       case blez_op:   /* POP06 */
+#ifndef CONFIG_CPU_MIPSR6
+       case blezl_op:  /* removed in R6 */
+#endif
+               if (insn.i_format.rt != 0)
+                       goto compact_branch;
                if ((long)arch->gprs[insn.i_format.rs] <= 0)
                        epc = epc + 4 + (insn.i_format.simmediate << 2);
                else
@@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
                nextpc = epc;
                break;
 
-       case bgtz_op:
-       case bgtzl_op:
-               /* rt field assumed to be zero */
+       case bgtz_op:   /* POP07 */
+#ifndef CONFIG_CPU_MIPSR6
+       case bgtzl_op:  /* removed in R6 */
+#endif
+               if (insn.i_format.rt != 0)
+                       goto compact_branch;
                if ((long)arch->gprs[insn.i_format.rs] > 0)
                        epc = epc + 4 + (insn.i_format.simmediate << 2);
                else
@@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
        case cop1_op:
                kvm_err("%s: unsupported cop1_op\n", __func__);
                break;
+
+#ifdef CONFIG_CPU_MIPSR6
+       /* R6 added the following compact branches with forbidden slots */
+       case blezl_op:  /* POP26 */
+       case bgtzl_op:  /* POP27 */
+               /* only rt == 0 isn't compact branch */
+               if (insn.i_format.rt != 0)
+                       goto compact_branch;
+               break;
+       case pop10_op:
+       case pop30_op:
+               /* only rs == rt == 0 is reserved, rest are compact branches */
+               if (insn.i_format.rs != 0 || insn.i_format.rt != 0)
+                       goto compact_branch;
+               break;
+       case pop66_op:
+       case pop76_op:
+               /* only rs == 0 isn't compact branch */
+               if (insn.i_format.rs != 0)
+                       goto compact_branch;
+               break;
+compact_branch:
+               /*
+                * If we've hit an exception on the forbidden slot, then
+                * the branch must not have been taken.
+                */
+               epc += 8;
+               nextpc = epc;
+               break;
+#else
+compact_branch:
+               /* Compact branches not supported before R6 */
+               break;
+#endif
        }
 
        return nextpc;
@@ -198,7 +238,7 @@ sigill:
        return nextpc;
 }
 
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause)
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause)
 {
        unsigned long branch_pc;
        enum emulation_result er = EMULATE_DONE;
@@ -243,7 +283,7 @@ static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
  *
  * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
  */
-static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
 {
        s64 now_ns, periods;
        u64 delta;
@@ -300,11 +340,11 @@ static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu)
  *
  * Returns:    The current value of the guest CP0_Count register.
  */
-static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        ktime_t expires, threshold;
-       uint32_t count, compare;
+       u32 count, compare;
        int running;
 
        /* Calculate the biased and scaled guest CP0_Count */
@@ -315,7 +355,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
         * Find whether CP0_Count has reached the closest timer interrupt. If
         * not, we shouldn't inject it.
         */
-       if ((int32_t)(count - compare) < 0)
+       if ((s32)(count - compare) < 0)
                return count;
 
        /*
@@ -360,7 +400,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
  *
  * Returns:    The current guest CP0_Count value.
  */
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
 
@@ -387,8 +427,7 @@ uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
  *
  * Returns:    The ktime at the point of freeze.
  */
-static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
-                                      uint32_t *count)
+static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
 {
        ktime_t now;
 
@@ -419,16 +458,16 @@ static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
  * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
  */
 static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
-                                   ktime_t now, uint32_t count)
+                                   ktime_t now, u32 count)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t compare;
+       u32 compare;
        u64 delta;
        ktime_t expire;
 
        /* Calculate timeout (wrap 0 to 2^32) */
        compare = kvm_read_c0_guest_compare(cop0);
-       delta = (u64)(uint32_t)(compare - count - 1) + 1;
+       delta = (u64)(u32)(compare - count - 1) + 1;
        delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz);
        expire = ktime_add_ns(now, delta);
 
@@ -444,7 +483,7 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
  *
  * Sets the CP0_Count value and updates the timer accordingly.
  */
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count)
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        ktime_t now;
@@ -538,13 +577,13 @@ int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz)
  * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure
  * any pending timer interrupt is preserved.
  */
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        int dc;
        u32 old_compare = kvm_read_c0_guest_compare(cop0);
        ktime_t now;
-       uint32_t count;
+       u32 count;
 
        /* if unchanged, must just be an ack */
        if (old_compare == compare) {
@@ -585,7 +624,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
 static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t count;
+       u32 count;
        ktime_t now;
 
        /* Stop hrtimer */
@@ -632,7 +671,7 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu)
 void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t count;
+       u32 count;
 
        kvm_clear_c0_guest_cause(cop0, CAUSEF_DC);
 
@@ -661,7 +700,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
        s64 changed = count_ctl ^ vcpu->arch.count_ctl;
        s64 delta;
        ktime_t expire, now;
-       uint32_t count, compare;
+       u32 count, compare;
 
        /* Only allow defined bits to be changed */
        if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC))
@@ -687,7 +726,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
                         */
                        count = kvm_read_c0_guest_count(cop0);
                        compare = kvm_read_c0_guest_compare(cop0);
-                       delta = (u64)(uint32_t)(compare - count - 1) + 1;
+                       delta = (u64)(u32)(compare - count - 1) + 1;
                        delta = div_u64(delta * NSEC_PER_SEC,
                                        vcpu->arch.count_hz);
                        expire = ktime_add_ns(vcpu->arch.count_resume, delta);
@@ -776,7 +815,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
                  vcpu->arch.pending_exceptions);
 
        ++vcpu->stat.wait_exits;
-       trace_kvm_exit(vcpu, WAIT_EXITS);
+       trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT);
        if (!vcpu->arch.pending_exceptions) {
                vcpu->arch.wait = 1;
                kvm_vcpu_block(vcpu);
@@ -801,9 +840,9 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
 enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t pc = vcpu->arch.pc;
+       unsigned long pc = vcpu->arch.pc;
 
-       kvm_err("[%#x] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
+       kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
        return EMULATE_FAIL;
 }
 
@@ -813,11 +852,11 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        int index = kvm_read_c0_guest_index(cop0);
        struct kvm_mips_tlb *tlb = NULL;
-       uint32_t pc = vcpu->arch.pc;
+       unsigned long pc = vcpu->arch.pc;
 
        if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) {
                kvm_debug("%s: illegal index: %d\n", __func__, index);
-               kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+               kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
                          pc, index, kvm_read_c0_guest_entryhi(cop0),
                          kvm_read_c0_guest_entrylo0(cop0),
                          kvm_read_c0_guest_entrylo1(cop0),
@@ -834,10 +873,10 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
 
        tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
        tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
-       tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
-       tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+       tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+       tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
 
-       kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+       kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
                  pc, index, kvm_read_c0_guest_entryhi(cop0),
                  kvm_read_c0_guest_entrylo0(cop0),
                  kvm_read_c0_guest_entrylo1(cop0),
@@ -851,7 +890,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_mips_tlb *tlb = NULL;
-       uint32_t pc = vcpu->arch.pc;
+       unsigned long pc = vcpu->arch.pc;
        int index;
 
        get_random_bytes(&index, sizeof(index));
@@ -867,10 +906,10 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 
        tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
        tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
-       tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
-       tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+       tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+       tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
 
-       kvm_debug("[%#x] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
+       kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
                  pc, index, kvm_read_c0_guest_entryhi(cop0),
                  kvm_read_c0_guest_entrylo0(cop0),
                  kvm_read_c0_guest_entrylo1(cop0));
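
Both TLB write paths above now fill the EntryLo pair through a two-element array instead of the old tlb_lo0/tlb_lo1 fields, so later code can index the even/odd page entry directly. A minimal sketch of the assumed structure shape (the authoritative definition lives in asm/kvm_host.h):

	struct kvm_mips_tlb {
		long tlb_mask;
		long tlb_hi;
		long tlb_lo[2];		/* previously: tlb_lo0, tlb_lo1 */
	};
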
@@ -882,14 +921,14 @@ enum emulation_result kvm_mips_emul_tlbp(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        long entryhi = kvm_read_c0_guest_entryhi(cop0);
-       uint32_t pc = vcpu->arch.pc;
+       unsigned long pc = vcpu->arch.pc;
        int index = -1;
 
        index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
 
        kvm_write_c0_guest_index(cop0, index);
 
-       kvm_debug("[%#x] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
+       kvm_debug("[%#lx] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
                  index);
 
        return EMULATE_DONE;
@@ -922,8 +961,8 @@ unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu)
  */
 unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
 {
-       /* Config4 is optional */
-       unsigned int mask = MIPS_CONF_M;
+       /* Config4 and ULRI are optional */
+       unsigned int mask = MIPS_CONF_M | MIPS_CONF3_ULRI;
 
        /* Permit MSA to be present if MSA is supported */
        if (kvm_mips_guest_can_have_msa(&vcpu->arch))
@@ -942,7 +981,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
 unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
 {
        /* Config5 is optional */
-       return MIPS_CONF_M;
+       unsigned int mask = MIPS_CONF_M;
+
+       /* KScrExist */
+       mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16;
+
+       return mask;
 }
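
These *_wrmask() helpers only compute which Config bits a guest may change; the caller is assumed to combine the mask with the old and new values along the lines of the hypothetical helper below (not part of the patch). The extra term added here relies on Config4.KScrExist occupying bits 23:16, so shifting the cached kscratch_enabled bitmap left by 16 exposes exactly the implemented KScratch registers as guest-writable.

	static inline unsigned int apply_config_write(unsigned int cur,
						      unsigned int val,
						      unsigned int wrmask)
	{
		/* read-only bits keep their current value, writable bits take val */
		return (cur & ~wrmask) | (val & wrmask);
	}
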
 
 /**
@@ -973,14 +1017,14 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu)
        return mask;
 }
 
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
-                                          uint32_t cause, struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+                                          u32 *opc, u32 cause,
+                                          struct kvm_run *run,
                                           struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        enum emulation_result er = EMULATE_DONE;
-       int32_t rt, rd, copz, sel, co_bit, op;
-       uint32_t pc = vcpu->arch.pc;
+       u32 rt, rd, sel;
        unsigned long curr_pc;
 
        /*
@@ -992,16 +1036,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
        if (er == EMULATE_FAIL)
                return er;
 
-       copz = (inst >> 21) & 0x1f;
-       rt = (inst >> 16) & 0x1f;
-       rd = (inst >> 11) & 0x1f;
-       sel = inst & 0x7;
-       co_bit = (inst >> 25) & 1;
-
-       if (co_bit) {
-               op = (inst) & 0xff;
-
-               switch (op) {
+       if (inst.co_format.co) {
+               switch (inst.co_format.func) {
                case tlbr_op:   /*  Read indexed TLB entry  */
                        er = kvm_mips_emul_tlbr(vcpu);
                        break;
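
For reference, the open-coded decode removed in this hunk (and the one that follows) extracted the CP0 instruction fields by hand; the union mips_instruction bitfields from asm/inst.h name the same bit ranges. A standalone sketch of the old extraction, for illustration only:

	#include <stdint.h>

	static void decode_c0r(uint32_t word, uint32_t *rs, uint32_t *rt,
			       uint32_t *rd, uint32_t *sel)
	{
		*rs  = (word >> 21) & 0x1f;	/* mfc0/dmfc0/mtc0/dmtc0/... */
		*rt  = (word >> 16) & 0x1f;	/* general purpose register */
		*rd  = (word >> 11) & 0x1f;	/* CP0 register number */
		*sel = word & 0x7;		/* CP0 register select */
	}
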
@@ -1020,47 +1056,58 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                case eret_op:
                        er = kvm_mips_emul_eret(vcpu);
                        goto dont_update_pc;
-                       break;
                case wait_op:
                        er = kvm_mips_emul_wait(vcpu);
                        break;
                }
        } else {
-               switch (copz) {
+               rt = inst.c0r_format.rt;
+               rd = inst.c0r_format.rd;
+               sel = inst.c0r_format.sel;
+
+               switch (inst.c0r_format.rs) {
                case mfc_op:
 #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
                        cop0->stat[rd][sel]++;
 #endif
                        /* Get reg */
                        if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
-                               vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu);
+                               vcpu->arch.gprs[rt] =
+                                   (s32)kvm_mips_read_count(vcpu);
                        } else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
                                vcpu->arch.gprs[rt] = 0x0;
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
                                kvm_mips_trans_mfc0(inst, opc, vcpu);
 #endif
                        } else {
-                               vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+                               vcpu->arch.gprs[rt] = (s32)cop0->reg[rd][sel];
 
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
                                kvm_mips_trans_mfc0(inst, opc, vcpu);
 #endif
                        }
 
-                       kvm_debug
-                           ("[%#x] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n",
-                            pc, rd, sel, rt, vcpu->arch.gprs[rt]);
-
+                       trace_kvm_hwr(vcpu, KVM_TRACE_MFC0,
+                                     KVM_TRACE_COP0(rd, sel),
+                                     vcpu->arch.gprs[rt]);
                        break;
 
                case dmfc_op:
                        vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+
+                       trace_kvm_hwr(vcpu, KVM_TRACE_DMFC0,
+                                     KVM_TRACE_COP0(rd, sel),
+                                     vcpu->arch.gprs[rt]);
                        break;
 
                case mtc_op:
 #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
                        cop0->stat[rd][sel]++;
 #endif
+                       trace_kvm_hwr(vcpu, KVM_TRACE_MTC0,
+                                     KVM_TRACE_COP0(rd, sel),
+                                     vcpu->arch.gprs[rt]);
+
                        if ((rd == MIPS_CP0_TLB_INDEX)
                            && (vcpu->arch.gprs[rt] >=
                                KVM_MIPS_GUEST_TLB_SIZE)) {
@@ -1078,16 +1125,15 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n",
                                        kvm_read_c0_guest_ebase(cop0));
                        } else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
-                               uint32_t nasid =
+                               u32 nasid =
                                        vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
                                if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
                                    ((kvm_read_c0_guest_entryhi(cop0) &
                                      KVM_ENTRYHI_ASID) != nasid)) {
-                                       kvm_debug("MTCz, change ASID from %#lx to %#lx\n",
+                                       trace_kvm_asid_change(vcpu,
                                                kvm_read_c0_guest_entryhi(cop0)
-                                               & KVM_ENTRYHI_ASID,
-                                               vcpu->arch.gprs[rt]
-                                               & KVM_ENTRYHI_ASID);
+                                                       & KVM_ENTRYHI_ASID,
+                                               nasid);
 
                                        /* Blow away the shadow host TLBs */
                                        kvm_mips_flush_host_tlb(1);
@@ -1100,10 +1146,6 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
                                goto done;
                        } else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) {
-                               kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n",
-                                         pc, kvm_read_c0_guest_compare(cop0),
-                                         vcpu->arch.gprs[rt]);
-
                                /* If we are writing to COMPARE */
                                /* Clear pending timer interrupt, if any */
                                kvm_mips_write_compare(vcpu,
@@ -1155,7 +1197,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                 * it first.
                                 */
                                if (change & ST0_CU1 && !(val & ST0_FR) &&
-                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+                                   vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
                                        kvm_lose_fpu(vcpu);
 
                                /*
@@ -1166,7 +1208,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                 * the near future.
                                 */
                                if (change & ST0_CU1 &&
-                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+                                   vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
                                        change_c0_status(ST0_CU1, val);
 
                                preempt_enable();
@@ -1201,7 +1243,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                 * context is already loaded.
                                 */
                                if (change & MIPS_CONF5_FRE &&
-                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+                                   vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
                                        change_c0_config5(MIPS_CONF5_FRE, val);
 
                                /*
@@ -1211,7 +1253,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                 * quickly enabled again in the near future.
                                 */
                                if (change & MIPS_CONF5_MSAEN &&
-                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+                                   vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
                                        change_c0_config5(MIPS_CONF5_MSAEN,
                                                          val);
 
@@ -1219,7 +1261,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
 
                                kvm_write_c0_guest_config5(cop0, val);
                        } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
-                               uint32_t old_cause, new_cause;
+                               u32 old_cause, new_cause;
 
                                old_cause = kvm_read_c0_guest_cause(cop0);
                                new_cause = vcpu->arch.gprs[rt];
@@ -1233,20 +1275,30 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                        else
                                                kvm_mips_count_enable_cause(vcpu);
                                }
+                       } else if ((rd == MIPS_CP0_HWRENA) && (sel == 0)) {
+                               u32 mask = MIPS_HWRENA_CPUNUM |
+                                          MIPS_HWRENA_SYNCISTEP |
+                                          MIPS_HWRENA_CC |
+                                          MIPS_HWRENA_CCRES;
+
+                               if (kvm_read_c0_guest_config3(cop0) &
+                                   MIPS_CONF3_ULRI)
+                                       mask |= MIPS_HWRENA_ULR;
+                               cop0->reg[rd][sel] = vcpu->arch.gprs[rt] & mask;
                        } else {
                                cop0->reg[rd][sel] = vcpu->arch.gprs[rt];
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
                                kvm_mips_trans_mtc0(inst, opc, vcpu);
 #endif
                        }
-
-                       kvm_debug("[%#x] MTCz, cop0->reg[%d][%d]: %#lx\n", pc,
-                                 rd, sel, cop0->reg[rd][sel]);
                        break;
 
                case dmtc_op:
                        kvm_err("!!!!!!![%#lx]dmtc_op: rt: %d, rd: %d, sel: %d!!!!!!\n",
                                vcpu->arch.pc, rt, rd, sel);
+                       trace_kvm_hwr(vcpu, KVM_TRACE_DMTC0,
+                                     KVM_TRACE_COP0(rd, sel),
+                                     vcpu->arch.gprs[rt]);
                        er = EMULATE_FAIL;
                        break;
 
@@ -1258,7 +1310,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                vcpu->arch.gprs[rt] =
                                    kvm_read_c0_guest_status(cop0);
                        /* EI */
-                       if (inst & 0x20) {
+                       if (inst.mfmc0_format.sc) {
                                kvm_debug("[%#lx] mfmc0_op: EI\n",
                                          vcpu->arch.pc);
                                kvm_set_c0_guest_status(cop0, ST0_IE);
@@ -1272,9 +1324,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
 
                case wrpgpr_op:
                        {
-                               uint32_t css =
-                                   cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
-                               uint32_t pss =
+                               u32 css = cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
+                               u32 pss =
                                    (cop0->reg[MIPS_CP0_STATUS][2] >> 6) & 0xf;
                                /*
                                 * We don't support any shadow register sets, so
@@ -1291,7 +1342,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                        break;
                default:
                        kvm_err("[%#lx]MachEmulateCP0: unsupported COP0, copz: 0x%x\n",
-                               vcpu->arch.pc, copz);
+                               vcpu->arch.pc, inst.c0r_format.rs);
                        er = EMULATE_FAIL;
                        break;
                }
@@ -1312,13 +1363,14 @@ dont_update_pc:
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+                                            u32 cause,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DO_MMIO;
-       int32_t op, base, rt, offset;
-       uint32_t bytes;
+       u32 rt;
+       u32 bytes;
        void *data = run->mmio.data;
        unsigned long curr_pc;
 
@@ -1331,12 +1383,9 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
        if (er == EMULATE_FAIL)
                return er;
 
-       rt = (inst >> 16) & 0x1f;
-       base = (inst >> 21) & 0x1f;
-       offset = inst & 0xffff;
-       op = (inst >> 26) & 0x3f;
+       rt = inst.i_format.rt;
 
-       switch (op) {
+       switch (inst.i_format.opcode) {
        case sb_op:
                bytes = 1;
                if (bytes > sizeof(run->mmio.data)) {
@@ -1357,7 +1406,7 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
                *(u8 *) data = vcpu->arch.gprs[rt];
                kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
                          vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt],
-                         *(uint8_t *) data);
+                         *(u8 *) data);
 
                break;
 
@@ -1379,11 +1428,11 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
                run->mmio.is_write = 1;
                vcpu->mmio_needed = 1;
                vcpu->mmio_is_write = 1;
-               *(uint32_t *) data = vcpu->arch.gprs[rt];
+               *(u32 *) data = vcpu->arch.gprs[rt];
 
                kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n",
                          vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-                         vcpu->arch.gprs[rt], *(uint32_t *) data);
+                         vcpu->arch.gprs[rt], *(u32 *) data);
                break;
 
        case sh_op:
@@ -1404,15 +1453,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
                run->mmio.is_write = 1;
                vcpu->mmio_needed = 1;
                vcpu->mmio_is_write = 1;
-               *(uint16_t *) data = vcpu->arch.gprs[rt];
+               *(u16 *) data = vcpu->arch.gprs[rt];
 
                kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n",
                          vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-                         vcpu->arch.gprs[rt], *(uint32_t *) data);
+                         vcpu->arch.gprs[rt], *(u32 *) data);
                break;
 
        default:
-               kvm_err("Store not yet supported");
+               kvm_err("Store not yet supported (inst=0x%08x)\n",
+                       inst.word);
                er = EMULATE_FAIL;
                break;
        }
@@ -1424,18 +1474,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
-                                           struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+                                           u32 cause, struct kvm_run *run,
                                            struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DO_MMIO;
-       int32_t op, base, rt, offset;
-       uint32_t bytes;
+       u32 op, rt;
+       u32 bytes;
 
-       rt = (inst >> 16) & 0x1f;
-       base = (inst >> 21) & 0x1f;
-       offset = inst & 0xffff;
-       op = (inst >> 26) & 0x3f;
+       rt = inst.i_format.rt;
+       op = inst.i_format.opcode;
 
        vcpu->arch.pending_load_cause = cause;
        vcpu->arch.io_gpr = rt;
@@ -1521,7 +1569,8 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
                break;
 
        default:
-               kvm_err("Load not yet supported");
+               kvm_err("Load not yet supported (inst=0x%08x)\n",
+                       inst.word);
                er = EMULATE_FAIL;
                break;
        }
@@ -1529,40 +1578,15 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
        return er;
 }
 
-int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
-{
-       unsigned long offset = (va & ~PAGE_MASK);
-       struct kvm *kvm = vcpu->kvm;
-       unsigned long pa;
-       gfn_t gfn;
-       kvm_pfn_t pfn;
-
-       gfn = va >> PAGE_SHIFT;
-
-       if (gfn >= kvm->arch.guest_pmap_npages) {
-               kvm_err("%s: Invalid gfn: %#llx\n", __func__, gfn);
-               kvm_mips_dump_host_tlbs();
-               kvm_arch_vcpu_dump_regs(vcpu);
-               return -1;
-       }
-       pfn = kvm->arch.guest_pmap[gfn];
-       pa = (pfn << PAGE_SHIFT) | offset;
-
-       kvm_debug("%s: va: %#lx, unmapped: %#x\n", __func__, va,
-                 CKSEG0ADDR(pa));
-
-       local_flush_icache_range(CKSEG0ADDR(pa), 32);
-       return 0;
-}
-
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
-                                            uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+                                            u32 *opc, u32 cause,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        enum emulation_result er = EMULATE_DONE;
-       int32_t offset, cache, op_inst, op, base;
+       u32 cache, op_inst, op, base;
+       s16 offset;
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        unsigned long va;
        unsigned long curr_pc;
@@ -1576,9 +1600,12 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
        if (er == EMULATE_FAIL)
                return er;
 
-       base = (inst >> 21) & 0x1f;
-       op_inst = (inst >> 16) & 0x1f;
-       offset = (int16_t)inst;
+       base = inst.i_format.rs;
+       op_inst = inst.i_format.rt;
+       if (cpu_has_mips_r6)
+               offset = inst.spec3_format.simmediate;
+       else
+               offset = inst.i_format.simmediate;
        cache = op_inst & CacheOp_Cache;
        op = op_inst & CacheOp_Op;
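
MIPSr6 re-encodes CACHE under SPECIAL3 with a 9-bit signed offset, while the pre-r6 I-format encoding carries a 16-bit signed immediate, hence the split above. A portable way to sign-extend the narrower field by hand would look like this (hypothetical helper; the bitfield access in the patch already yields a signed value):

	static inline int32_t sign_extend9(uint32_t imm)
	{
		imm &= 0x1ff;			/* keep the 9-bit field */
		return (imm & 0x100) ? (int32_t)imm - 0x200 : (int32_t)imm;
	}
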
 
@@ -1634,7 +1661,6 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
                                                   (cop0) & KVM_ENTRYHI_ASID));
 
                if (index < 0) {
-                       vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK);
                        vcpu->arch.host_cp0_badvaddr = va;
                        vcpu->arch.pc = curr_pc;
                        er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run,
@@ -1659,9 +1685,7 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
                                 * We fault an entry from the guest tlb to the
                                 * shadow host TLB
                                 */
-                               kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb,
-                                                                    NULL,
-                                                                    NULL);
+                               kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
                        }
                }
        } else {
@@ -1714,20 +1738,20 @@ dont_update_pc:
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
                                            struct kvm_run *run,
                                            struct kvm_vcpu *vcpu)
 {
+       union mips_instruction inst;
        enum emulation_result er = EMULATE_DONE;
-       uint32_t inst;
 
        /* Fetch the instruction. */
        if (cause & CAUSEF_BD)
                opc += 1;
 
-       inst = kvm_get_inst(opc, vcpu);
+       inst.word = kvm_get_inst(opc, vcpu);
 
-       switch (((union mips_instruction)inst).r_format.opcode) {
+       switch (inst.r_format.opcode) {
        case cop0_op:
                er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
                break;
@@ -1744,15 +1768,31 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
                er = kvm_mips_emulate_load(inst, cause, run, vcpu);
                break;
 
+#ifndef CONFIG_CPU_MIPSR6
        case cache_op:
                ++vcpu->stat.cache_exits;
-               trace_kvm_exit(vcpu, CACHE_EXITS);
+               trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
                er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu);
                break;
+#else
+       case spec3_op:
+               switch (inst.spec3_format.func) {
+               case cache6_op:
+                       ++vcpu->stat.cache_exits;
+                       trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+                       er = kvm_mips_emulate_cache(inst, opc, cause, run,
+                                                   vcpu);
+                       break;
+               default:
+                       goto unknown;
+               };
+               break;
+unknown:
+#endif
 
        default:
                kvm_err("Instruction emulation not supported (%p/%#x)\n", opc,
-                       inst);
+                       inst.word);
                kvm_arch_vcpu_dump_regs(vcpu);
                er = EMULATE_FAIL;
                break;
@@ -1761,8 +1801,8 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_emulate_syscall(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
 {
@@ -1796,8 +1836,8 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
-                                                 uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
@@ -1842,8 +1882,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
-                                                uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
+                                                u32 *opc,
                                                 struct kvm_run *run,
                                                 struct kvm_vcpu *vcpu)
 {
@@ -1888,8 +1928,8 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
-                                                 uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
@@ -1932,8 +1972,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
-                                                uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
+                                                u32 *opc,
                                                 struct kvm_run *run,
                                                 struct kvm_vcpu *vcpu)
 {
@@ -1977,7 +2017,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
 }
 
 /* TLBMOD: store into address matching TLB with Dirty bit off */
-enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
 {
@@ -2005,8 +2045,8 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
-                                             uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
+                                             u32 *opc,
                                              struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
 {
@@ -2048,8 +2088,8 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
 {
@@ -2077,8 +2117,8 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
-                                             uint32_t *opc,
+enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
+                                             u32 *opc,
                                              struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
 {
@@ -2112,8 +2152,8 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
-                                             uint32_t *opc,
+enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
+                                             u32 *opc,
                                              struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
 {
@@ -2147,8 +2187,8 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
-                                               uint32_t *opc,
+enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
+                                               u32 *opc,
                                                struct kvm_run *run,
                                                struct kvm_vcpu *vcpu)
 {
@@ -2182,8 +2222,8 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
-                                                 uint32_t *opc,
+enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
@@ -2217,8 +2257,8 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
 {
@@ -2252,8 +2292,8 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
-                                                 uint32_t *opc,
+enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
@@ -2287,22 +2327,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
        return er;
 }
 
-/* ll/sc, rdhwr, sync emulation */
-
-#define OPCODE 0xfc000000
-#define BASE   0x03e00000
-#define RT     0x001f0000
-#define OFFSET 0x0000ffff
-#define LL     0xc0000000
-#define SC     0xe0000000
-#define SPEC0  0x00000000
-#define SPEC3  0x7c000000
-#define RD     0x0000f800
-#define FUNC   0x0000003f
-#define SYNC   0x0000000f
-#define RDHWR  0x0000003b
-
-enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
                                         struct kvm_run *run,
                                         struct kvm_vcpu *vcpu)
 {
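
The OPCODE/FUNC-style #defines deleted above appear to have been used only for the RDHWR check in this function; on a raw instruction word the equivalent test, written with the removed constants, is roughly the sketch below. The new bitfield checks are stricter in that they also require rs == 0 and the upper bits of the re (shift-amount) field to be zero.

	#include <stdint.h>

	static inline int is_rdhwr(uint32_t word)
	{
		return (word & 0xfc000000) == 0x7c000000 &&	/* SPEC3 opcode */
		       (word & 0x0000003f) == 0x0000003b;	/* RDHWR function */
	}
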
@@ -2310,7 +2335,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        enum emulation_result er = EMULATE_DONE;
        unsigned long curr_pc;
-       uint32_t inst;
+       union mips_instruction inst;
 
        /*
         * Update PC and hold onto current PC in case there is
@@ -2325,17 +2350,22 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
        if (cause & CAUSEF_BD)
                opc += 1;
 
-       inst = kvm_get_inst(opc, vcpu);
+       inst.word = kvm_get_inst(opc, vcpu);
 
-       if (inst == KVM_INVALID_INST) {
+       if (inst.word == KVM_INVALID_INST) {
                kvm_err("%s: Cannot get inst @ %p\n", __func__, opc);
                return EMULATE_FAIL;
        }
 
-       if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) {
+       if (inst.r_format.opcode == spec3_op &&
+           inst.r_format.func == rdhwr_op &&
+           inst.r_format.rs == 0 &&
+           (inst.r_format.re >> 3) == 0) {
                int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
-               int rd = (inst & RD) >> 11;
-               int rt = (inst & RT) >> 16;
+               int rd = inst.r_format.rd;
+               int rt = inst.r_format.rt;
+               int sel = inst.r_format.re & 0x7;
+
                /* If usermode, check RDHWR rd is allowed by guest HWREna */
                if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
                        kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n",
@@ -2343,17 +2373,17 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                        goto emulate_ri;
                }
                switch (rd) {
-               case 0: /* CPU number */
-                       arch->gprs[rt] = 0;
+               case MIPS_HWR_CPUNUM:           /* CPU number */
+                       arch->gprs[rt] = vcpu->vcpu_id;
                        break;
-               case 1: /* SYNCI length */
+               case MIPS_HWR_SYNCISTEP:        /* SYNCI length */
                        arch->gprs[rt] = min(current_cpu_data.dcache.linesz,
                                             current_cpu_data.icache.linesz);
                        break;
-               case 2: /* Read count register */
-                       arch->gprs[rt] = kvm_mips_read_count(vcpu);
+               case MIPS_HWR_CC:               /* Read count register */
+                       arch->gprs[rt] = (s32)kvm_mips_read_count(vcpu);
                        break;
-               case 3: /* Count register resolution */
+               case MIPS_HWR_CCRES:            /* Count register resolution */
                        switch (current_cpu_data.cputype) {
                        case CPU_20KC:
                        case CPU_25KF:
@@ -2363,7 +2393,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                                arch->gprs[rt] = 2;
                        }
                        break;
-               case 29:
+               case MIPS_HWR_ULR:              /* Read UserLocal register */
                        arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0);
                        break;
 
@@ -2371,8 +2401,12 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                        kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc);
                        goto emulate_ri;
                }
+
+               trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel),
+                             vcpu->arch.gprs[rt]);
        } else {
-               kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
+               kvm_debug("Emulate RI not supported @ %p: %#x\n",
+                         opc, inst.word);
                goto emulate_ri;
        }
 
@@ -2405,19 +2439,19 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 
        switch (run->mmio.len) {
        case 4:
-               *gpr = *(int32_t *) run->mmio.data;
+               *gpr = *(s32 *) run->mmio.data;
                break;
 
        case 2:
                if (vcpu->mmio_needed == 2)
-                       *gpr = *(int16_t *) run->mmio.data;
+                       *gpr = *(s16 *) run->mmio.data;
                else
-                       *gpr = *(uint16_t *)run->mmio.data;
+                       *gpr = *(u16 *)run->mmio.data;
 
                break;
        case 1:
                if (vcpu->mmio_needed == 2)
-                       *gpr = *(int8_t *) run->mmio.data;
+                       *gpr = *(s8 *) run->mmio.data;
                else
                        *gpr = *(u8 *) run->mmio.data;
                break;
@@ -2432,12 +2466,12 @@ done:
        return er;
 }
 
-static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
-                                                 uint32_t *opc,
+static enum emulation_result kvm_mips_emulate_exc(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
-       uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+       u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        enum emulation_result er = EMULATE_DONE;
@@ -2470,13 +2504,13 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DONE;
-       uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+       u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 
        int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
@@ -2566,18 +2600,18 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause,
  * (2) TLB entry is present in the Guest TLB but not in the shadow, in this
  *     case we inject the TLB from the Guest TLB into the shadow host TLB
  */
-enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
-                                             uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
+                                             u32 *opc,
                                              struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DONE;
-       uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+       u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
        unsigned long va = vcpu->arch.host_cp0_badvaddr;
        int index;
 
-       kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx, entryhi: %#lx\n",
-                 vcpu->arch.host_cp0_badvaddr, vcpu->arch.host_cp0_entryhi);
+       kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx\n",
+                 vcpu->arch.host_cp0_badvaddr);
 
        /*
         * KVM would not have got the exception if this entry was valid in the
@@ -2620,13 +2654,12 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
                        }
                } else {
                        kvm_debug("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n",
-                                 tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1);
+                                 tlb->tlb_hi, tlb->tlb_lo[0], tlb->tlb_lo[1]);
                        /*
                         * OK we have a Guest TLB entry, now inject it into the
                         * shadow host TLB
                         */
-                       kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL,
-                                                            NULL);
+                       kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
                }
        }
 
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
new file mode 100644
index 0000000..6a02b3a
--- /dev/null
+++ b/arch/mips/kvm/entry.c

@@ -0,0 +1,701 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Generation of main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ *
+ * Copyright (C) 2016 Imagination Technologies Ltd.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/msa.h>
+#include <asm/setup.h>
+#include <asm/uasm.h>
+
+/* Register names */
+#define ZERO           0
+#define AT             1
+#define V0             2
+#define V1             3
+#define A0             4
+#define A1             5
+
+#if _MIPS_SIM == _MIPS_SIM_ABI32
+#define T0             8
+#define T1             9
+#define T2             10
+#define T3             11
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
+
+#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32
+#define T0             12
+#define T1             13
+#define T2             14
+#define T3             15
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */
+
+#define S0             16
+#define S1             17
+#define T9             25
+#define K0             26
+#define K1             27
+#define GP             28
+#define SP             29
+#define RA             31
+
+/* Some CP0 registers */
+#define C0_HWRENA      7, 0
+#define C0_BADVADDR    8, 0
+#define C0_ENTRYHI     10, 0
+#define C0_STATUS      12, 0
+#define C0_CAUSE       13, 0
+#define C0_EPC         14, 0
+#define C0_EBASE       15, 1
+#define C0_CONFIG5     16, 5
+#define C0_DDATA_LO    28, 3
+#define C0_ERROREPC    30, 0
+
+#define CALLFRAME_SIZ   32
+
+#ifdef CONFIG_64BIT
+#define ST0_KX_IF_64   ST0_KX
+#else
+#define ST0_KX_IF_64   0
+#endif
+
+static unsigned int scratch_vcpu[2] = { C0_DDATA_LO };
+static unsigned int scratch_tmp[2] = { C0_ERROREPC };
+
+enum label_id {
+       label_fpu_1 = 1,
+       label_msa_1,
+       label_return_to_host,
+       label_kernel_asid,
+       label_exit_common,
+};
+
+UASM_L_LA(_fpu_1)
+UASM_L_LA(_msa_1)
+UASM_L_LA(_return_to_host)
+UASM_L_LA(_kernel_asid)
+UASM_L_LA(_exit_common)
+
+static void *kvm_mips_build_enter_guest(void *addr);
+static void *kvm_mips_build_ret_from_exit(void *addr);
+static void *kvm_mips_build_ret_to_guest(void *addr);
+static void *kvm_mips_build_ret_to_host(void *addr);
+
+/**
+ * kvm_mips_entry_setup() - Perform global setup for entry code.
+ *
+ * Perform global setup for entry code, such as choosing a scratch register.
+ *
+ * Returns:    0 on success.
+ *             -errno on failure.
+ */
+int kvm_mips_entry_setup(void)
+{
+       /*
+        * We prefer to use KScratchN registers if they are available over the
+        * defaults above, which may not work on all cores.
+        */
+       unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc;
+
+       /* Pick a scratch register for storing VCPU */
+       if (kscratch_mask) {
+               scratch_vcpu[0] = 31;
+               scratch_vcpu[1] = ffs(kscratch_mask) - 1;
+               kscratch_mask &= ~BIT(scratch_vcpu[1]);
+       }
+
+       /* Pick a scratch register to use as a temp for saving state */
+       if (kscratch_mask) {
+               scratch_tmp[0] = 31;
+               scratch_tmp[1] = ffs(kscratch_mask) - 1;
+               kscratch_mask &= ~BIT(scratch_tmp[1]);
+       }
+
+       return 0;
+}
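
The C0_DDATA_LO and C0_ERROREPC fall-backs work because each macro expands to a "register, select" pair, so the braced initialiser fills both array elements (e.g. { 28, 3 }). When KScratch registers are implemented, the two blocks above pick the lowest two available selects of CP0 register 31. A worked example, assuming a CPU that reports kscratch_mask = 0x3c (KScratch2..KScratch5 usable):

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	int main(void)
	{
		unsigned int mask = 0x3c & 0xfc;	/* hypothetical kscratch_mask */
		int vcpu_sel = ffs(mask) - 1;		/* -> 2, VCPU pointer register  */

		mask &= ~(1u << vcpu_sel);
		int tmp_sel = ffs(mask) - 1;		/* -> 3, save/restore temporary */

		printf("scratch_vcpu = { 31, %d }, scratch_tmp = { 31, %d }\n",
		       vcpu_sel, tmp_sel);
		return 0;
	}
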
+
+static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
+                                       unsigned int frame)
+{
+       /* Save the VCPU scratch register value in cp0_epc of the stack frame */
+       UASM_i_MFC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+       UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+
+       /* Save the temp scratch register value in cp0_cause of stack frame */
+       if (scratch_tmp[0] == 31) {
+               UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+               UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+       }
+}
+
+static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
+                                          unsigned int frame)
+{
+       /*
+        * Restore host scratch register values saved by
+        * kvm_mips_build_save_scratch().
+        */
+       UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+       UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+
+       if (scratch_tmp[0] == 31) {
+               UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+               UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+       }
+}
+
+/**
+ * build_set_exc_base() - Assemble code to write exception base address.
+ * @p:         Code buffer pointer.
+ * @reg:       Source register (generated code may set WG bit in @reg).
+ *
+ * Assemble code to modify the exception base address in the EBase register,
+ * using the appropriately sized access and setting the WG bit if necessary.
+ */
+static inline void build_set_exc_base(u32 **p, unsigned int reg)
+{
+       if (cpu_has_ebase_wg) {
+               /* Set WG so that all the bits get written */
+               uasm_i_ori(p, reg, reg, MIPS_EBASE_WG);
+               UASM_i_MTC0(p, reg, C0_EBASE);
+       } else {
+               uasm_i_mtc0(p, reg, C0_EBASE);
+       }
+}
+
+/**
+ * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the start of the vcpu_run function to run a guest VCPU. The function
+ * conforms to the following prototype:
+ *
+ * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ *
+ * The exit from the guest and return to the caller are handled by the code
+ * generated by kvm_mips_build_ret_to_host().
+ *
+ * Returns:    Next address after end of written function.
+ */
+void *kvm_mips_build_vcpu_run(void *addr)
+{
+       u32 *p = addr;
+       unsigned int i;
+
+       /*
+        * A0: run
+        * A1: vcpu
+        */
+
+       /* k0/k1 not being used in host kernel context */
+       UASM_i_ADDIU(&p, K1, SP, -(int)sizeof(struct pt_regs));
+       for (i = 16; i < 32; ++i) {
+               if (i == 24)
+                       i = 28;
+               UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+       }
+
+       /* Save host status */
+       uasm_i_mfc0(&p, V0, C0_STATUS);
+       UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1);
+
+       /* Save scratch registers, will be used to store pointer to vcpu etc */
+       kvm_mips_build_save_scratch(&p, V1, K1);
+
+       /* VCPU scratch register has pointer to vcpu */
+       UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+
+       /* Offset into vcpu->arch */
+       UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+
+       /*
+        * Save the host stack to VCPU, used for exception processing
+        * when we exit from the Guest
+        */
+       UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+       /* Save the kernel gp as well */
+       UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+       /*
+        * Setup status register for running the guest in UM, interrupts
+        * are disabled
+        */
+       UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV | ST0_KX_IF_64);
+       uasm_i_mtc0(&p, K0, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       /* load up the new EBASE */
+       UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+       build_set_exc_base(&p, K0);
+
+       /*
+        * Now that the new EBASE has been loaded, unset BEV, set
+        * interrupt mask as it was but make sure that timer interrupts
+        * are enabled
+        */
+       uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE | ST0_KX_IF_64);
+       uasm_i_andi(&p, V0, V0, ST0_IM);
+       uasm_i_or(&p, K0, K0, V0);
+       uasm_i_mtc0(&p, K0, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       p = kvm_mips_build_enter_guest(p);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_enter_guest() - Assemble code to resume guest execution.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the code to resume guest execution. This code is common between the
+ * initial entry into the guest from the host, and returning from the exit
+ * handler back to the guest.
+ *
+ * Returns:    Next address after end of written function.
+ */
+static void *kvm_mips_build_enter_guest(void *addr)
+{
+       u32 *p = addr;
+       unsigned int i;
+       struct uasm_label labels[2];
+       struct uasm_reloc relocs[2];
+       struct uasm_label *l = labels;
+       struct uasm_reloc *r = relocs;
+
+       memset(labels, 0, sizeof(labels));
+       memset(relocs, 0, sizeof(relocs));
+
+       /* Set Guest EPC */
+       UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1);
+       UASM_i_MTC0(&p, T0, C0_EPC);
+
+       /* Set the ASID for the Guest Kernel */
+       UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1);
+       UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]),
+                 T0);
+       uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
+       uasm_i_xori(&p, T0, T0, KSU_USER);
+       uasm_il_bnez(&p, &r, T0, label_kernel_asid);
+        UASM_i_ADDIU(&p, T1, K1,
+                     offsetof(struct kvm_vcpu_arch, guest_kernel_asid));
+       /* else user */
+       UASM_i_ADDIU(&p, T1, K1,
+                    offsetof(struct kvm_vcpu_arch, guest_user_asid));
+       uasm_l_kernel_asid(&l, p);
+
+       /* t1: contains the base of the ASID array, need to get the cpu id  */
+       /* smp_processor_id */
+       uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP);
+       /* x4 */
+       uasm_i_sll(&p, T2, T2, 2);
+       UASM_i_ADDU(&p, T3, T1, T2);
+       uasm_i_lw(&p, K0, 0, T3);
+#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
+       /* x sizeof(struct cpuinfo_mips)/4 */
+       uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4);
+       uasm_i_mul(&p, T2, T2, T3);
+
+       UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask);
+       UASM_i_ADDU(&p, AT, AT, T2);
+       UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT);
+       uasm_i_and(&p, K0, K0, T2);
+#else
+       uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
+#endif
+       uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+       uasm_i_ehb(&p);
+
+       /* Disable RDHWR access */
+       uasm_i_mtc0(&p, ZERO, C0_HWRENA);
+
+       /* load the guest context from VCPU and return */
+       for (i = 1; i < 32; ++i) {
+               /* Guest k0/k1 loaded later */
+               if (i == K0 || i == K1)
+                       continue;
+               UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+       }
+
+#ifndef CONFIG_CPU_MIPSR6
+       /* Restore hi/lo */
+       UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1);
+       uasm_i_mthi(&p, K0);
+
+       UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1);
+       uasm_i_mtlo(&p, K0);
+#endif
+
+       /* Restore the guest's k0/k1 registers */
+       UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+       UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+       /* Jump to guest */
+       uasm_i_eret(&p);
+
+       uasm_resolve_relocs(relocs, labels);
+
+       return p;
+}
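
In plain C the ASID selection assembled above amounts to roughly the following; the guest_kernel_asid/guest_user_asid per-CPU arrays and the cpu_asid_mask() helper are assumptions based on the kvm_host.h/mmu_context.h of this era, and guest_in_kernel_mode stands in for the Status.KSU/EXL/ERL test performed in the generated code:

	/* Illustrative C equivalent only; not part of the generated sequence. */
	int cpu = smp_processor_id();
	unsigned long asid;

	if (guest_in_kernel_mode)
		asid = vcpu->arch.guest_kernel_asid[cpu];
	else
		asid = vcpu->arch.guest_user_asid[cpu];

	write_c0_entryhi(asid & cpu_asid_mask(&cpu_data[cpu]));
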
+
+/**
+ * kvm_mips_build_exception() - Assemble first level guest exception handler.
+ * @addr:      Address to start writing code.
+ * @handler:   Address of common handler (within range of @addr).
+ *
+ * Assemble exception vector code for guest execution. The generated vector will
+ * branch to the common exception handler generated by kvm_mips_build_exit().
+ *
+ * Returns:    Next address after end of written function.
+ */
+void *kvm_mips_build_exception(void *addr, void *handler)
+{
+       u32 *p = addr;
+       struct uasm_label labels[2];
+       struct uasm_reloc relocs[2];
+       struct uasm_label *l = labels;
+       struct uasm_reloc *r = relocs;
+
+       memset(labels, 0, sizeof(labels));
+       memset(relocs, 0, sizeof(relocs));
+
+       /* Save guest k1 into scratch register */
+       UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+
+       /* Get the VCPU pointer from the VCPU scratch register */
+       UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+       UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+       /* Save guest k0 into VCPU structure */
+       UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+
+       /* Branch to the common handler */
+       uasm_il_b(&p, &r, label_exit_common);
+        uasm_i_nop(&p);
+
+       uasm_l_exit_common(&l, handler);
+       uasm_resolve_relocs(relocs, labels);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_exit() - Assemble common guest exit handler.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the generic guest exit handling code. This is called by the
+ * exception vectors (generated by kvm_mips_build_exception()), and calls
+ * kvm_mips_handle_exit(), then either resumes the guest or returns to the host
+ * depending on the return value.
+ *
+ * Returns:    Next address after end of written function.
+ */
+void *kvm_mips_build_exit(void *addr)
+{
+       u32 *p = addr;
+       unsigned int i;
+       struct uasm_label labels[3];
+       struct uasm_reloc relocs[3];
+       struct uasm_label *l = labels;
+       struct uasm_reloc *r = relocs;
+
+       memset(labels, 0, sizeof(labels));
+       memset(relocs, 0, sizeof(relocs));
+
+       /*
+        * Generic Guest exception handler. We end up here when the guest
+        * does something that causes a trap to kernel mode.
+        *
+        * Both k0/k1 registers will have already been saved (k0 into the vcpu
+        * structure, and k1 into the scratch_tmp register).
+        *
+        * The k1 register will already contain the kvm_vcpu_arch pointer.
+        */
+
+       /* Start saving Guest context to VCPU */
+       for (i = 0; i < 32; ++i) {
+               /* Guest k0/k1 saved later */
+               if (i == K0 || i == K1)
+                       continue;
+               UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+       }
+
+#ifndef CONFIG_CPU_MIPSR6
+       /* We need to save hi/lo and restore them on the way out */
+       uasm_i_mfhi(&p, T0);
+       UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1);
+
+       uasm_i_mflo(&p, T0);
+       UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1);
+#endif
+
+       /* Finally save guest k1 to VCPU */
+       uasm_i_ehb(&p);
+       UASM_i_MFC0(&p, T0, scratch_tmp[0], scratch_tmp[1]);
+       UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+       /* Now that context has been saved, we can use other registers */
+
+       /* Restore vcpu */
+       UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+       uasm_i_move(&p, S1, A1);
+
+       /* Restore run (vcpu->run) */
+       UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1);
+       /* Save pointer to run in s0, will be saved by the compiler */
+       uasm_i_move(&p, S0, A0);
+
+       /*
+        * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
+        * the exception
+        */
+       UASM_i_MFC0(&p, K0, C0_EPC);
+       UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1);
+
+       UASM_i_MFC0(&p, K0, C0_BADVADDR);
+       UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr),
+                 K1);
+
+       uasm_i_mfc0(&p, K0, C0_CAUSE);
+       uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1);
+
+       /* Now restore the host state just enough to run the handlers */
+
+       /* Switch EBASE to the one used by Linux */
+       /* load up the host EBASE */
+       uasm_i_mfc0(&p, V0, C0_STATUS);
+
+       uasm_i_lui(&p, AT, ST0_BEV >> 16);
+       uasm_i_or(&p, K0, V0, AT);
+
+       uasm_i_mtc0(&p, K0, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       UASM_i_LA_mostly(&p, K0, (long)&ebase);
+       UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0);
+       build_set_exc_base(&p, K0);
+
+       if (raw_cpu_has_fpu) {
+               /*
+                * If FPU is enabled, save FCR31 and clear it so that later
+                * ctc1's don't trigger FPE for pending exceptions.
+                */
+               uasm_i_lui(&p, AT, ST0_CU1 >> 16);
+               uasm_i_and(&p, V1, V0, AT);
+               uasm_il_beqz(&p, &r, V1, label_fpu_1);
+                uasm_i_nop(&p);
+               uasm_i_cfc1(&p, T0, 31);
+               uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31),
+                         K1);
+               uasm_i_ctc1(&p, ZERO, 31);
+               uasm_l_fpu_1(&l, p);
+       }
+
+       if (cpu_has_msa) {
+               /*
+                * If MSA is enabled, save MSACSR and clear it so that later
+                * instructions don't trigger MSAFPE for pending exceptions.
+                */
+               uasm_i_mfc0(&p, T0, C0_CONFIG5);
+               uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */
+               uasm_il_beqz(&p, &r, T0, label_msa_1);
+                uasm_i_nop(&p);
+               uasm_i_cfcmsa(&p, T0, MSA_CSR);
+               uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr),
+                         K1);
+               uasm_i_ctcmsa(&p, MSA_CSR, ZERO);
+               uasm_l_msa_1(&l, p);
+       }
+
+       /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
+       uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE));
+       uasm_i_and(&p, V0, V0, AT);
+       uasm_i_lui(&p, AT, ST0_CU0 >> 16);
+       uasm_i_or(&p, V0, V0, AT);
+       uasm_i_mtc0(&p, V0, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       /* Load up host GP */
+       UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+       /* Need a stack before we can jump to "C" */
+       UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+       /* Saved host state */
+       UASM_i_ADDIU(&p, SP, SP, -(int)sizeof(struct pt_regs));
+
+       /*
+        * XXXKYMA do we need to load the host ASID, maybe not because the
+        * kernel entries are marked GLOBAL, need to verify
+        */
+
+       /* Restore host scratch registers, as we'll have clobbered them */
+       kvm_mips_build_restore_scratch(&p, K0, SP);
+
+       /* Restore RDHWR access */
+       UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+       uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+       uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+       /* Jump to handler */
+       /*
+        * XXXKYMA: not sure if this is safe, how large is the stack??
+        * Now jump to the kvm_mips_handle_exit() to see if we can deal
+        * with this in the kernel
+        */
+       UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
+       uasm_i_jalr(&p, RA, T9);
+        UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
+
+       uasm_resolve_relocs(relocs, labels);
+
+       p = kvm_mips_build_ret_from_exit(p);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the code to handle the return from kvm_mips_handle_exit(), either
+ * resuming the guest or returning to the host depending on the return value.
+ *
+ * Returns:    Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_from_exit(void *addr)
+{
+       u32 *p = addr;
+       struct uasm_label labels[2];
+       struct uasm_reloc relocs[2];
+       struct uasm_label *l = labels;
+       struct uasm_reloc *r = relocs;
+
+       memset(labels, 0, sizeof(labels));
+       memset(relocs, 0, sizeof(relocs));
+
+       /* Return from handler; make sure interrupts are disabled */
+       uasm_i_di(&p, ZERO);
+       uasm_i_ehb(&p);
+
+       /*
+        * XXXKYMA: k0/k1 could have been blown away if we processed
+        * an exception while we were handling the exception from the
+        * guest, reload k1
+        */
+
+       uasm_i_move(&p, K1, S1);
+       UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+       /*
+        * Check return value, should tell us if we are returning to the
+        * host (handle I/O etc.) or resuming the guest
+        */
+       uasm_i_andi(&p, T0, V0, RESUME_HOST);
+       uasm_il_bnez(&p, &r, T0, label_return_to_host);
+        uasm_i_nop(&p);
+
+       p = kvm_mips_build_ret_to_guest(p);
+
+       uasm_l_return_to_host(&l, p);
+       p = kvm_mips_build_ret_to_host(p);
+
+       uasm_resolve_relocs(relocs, labels);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the guest.
+ *
+ * Returns:    Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_guest(void *addr)
+{
+       u32 *p = addr;
+
+       /* Put the saved pointer to vcpu (s1) back into the scratch register */
+       UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
+
+       /* Load up the Guest EBASE to minimize the window where BEV is set */
+       UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+
+       /* Switch EBASE back to the one used by KVM */
+       uasm_i_mfc0(&p, V1, C0_STATUS);
+       uasm_i_lui(&p, AT, ST0_BEV >> 16);
+       uasm_i_or(&p, K0, V1, AT);
+       uasm_i_mtc0(&p, K0, C0_STATUS);
+       uasm_i_ehb(&p);
+       build_set_exc_base(&p, T0);
+
+       /* Setup status register for running guest in UM */
+       uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE);
+       UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX));
+       uasm_i_and(&p, V1, V1, AT);
+       uasm_i_mtc0(&p, V1, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       p = kvm_mips_build_enter_guest(p);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_host() - Assemble code to return to the host.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run
+ * function generated by kvm_mips_build_vcpu_run().
+ *
+ * Returns:    Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_host(void *addr)
+{
+       u32 *p = addr;
+       unsigned int i;
+
+       /* EBASE is already pointing to Linux */
+       UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+       UASM_i_ADDIU(&p, K1, K1, -(int)sizeof(struct pt_regs));
+
+       /*
+        * r2/v0 is the return code, shift it down by 2 (arithmetic)
+        * to recover the err code
+        */
+       uasm_i_sra(&p, K0, V0, 2);
+       uasm_i_move(&p, V0, K0);
+
+       /* Load context saved on the host stack */
+       for (i = 16; i < 31; ++i) {
+               if (i == 24)
+                       i = 28;
+               UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+       }
+
+       /* Restore RDHWR access */
+       UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+       uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+       uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+       /* Restore RA, which is the address we will return to */
+       UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1);
+       uasm_i_jr(&p, RA);
+        uasm_i_nop(&p);
+
+       return p;
+}
+
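The forward branches in the exit handler above (label_fpu_1, label_msa_1) rely on the uasm micro-assembler's two-pass label/relocation scheme: a branch is emitted against a label id and recorded in relocs[], the label is bound to an address later via labels[], and uasm_resolve_relocs() patches the branch offsets once everything has been emitted. A minimal sketch of that pattern follows; build_demo(), label_demo_skip and the T0/K1 defines are illustrative only (register numbers assume the o32 convention) and are not part of this patch.

	#include <linux/types.h>
	#include <linux/string.h>
	#include <asm/uasm.h>

	#define T0	8	/* $t0, assuming o32 register numbering */
	#define K1	27	/* $k1 */

	enum label_id {
		label_demo_skip = 1,
	};

	UASM_L_LA(_demo_skip)			/* defines uasm_l_demo_skip() */

	/* Illustrative sketch of the emit/bind/resolve pattern used above. */
	static void *build_demo(void *addr)
	{
		u32 *p = addr;
		struct uasm_label labels[2];
		struct uasm_reloc relocs[2];
		struct uasm_label *l = labels;
		struct uasm_reloc *r = relocs;

		memset(labels, 0, sizeof(labels));
		memset(relocs, 0, sizeof(relocs));

		/* beqz t0, demo_skip: forward branch, offset not yet known */
		uasm_il_beqz(&p, &r, T0, label_demo_skip);
		uasm_i_nop(&p);			/* branch delay slot */
		UASM_i_SW(&p, T0, 0, K1);	/* only reached when t0 != 0 */
		uasm_l_demo_skip(&l, p);	/* bind the label here */

		uasm_resolve_relocs(relocs, labels);	/* patch branch offsets */
		return p;			/* next free instruction slot */
	}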
index 531fbf5..16f17c6 100644 (file)
 #include <asm/mipsregs.h>
 #include <asm/regdef.h>
 
+/* the C preprocessor would otherwise replace the fp in ".set fp=64" with $30 */
+#undef fp
+
        .set    noreorder
        .set    noat
 
 LEAF(__kvm_save_fpu)
        .set    push
-       .set    mips64r2
        SET_HARDFLOAT
+       .set    fp=64
        mfc0    t0, CP0_STATUS
        sll     t0, t0, 5                       # is Status.FR set?
        bgez    t0, 1f                          # no: skip odd doubles
@@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu)
 
 LEAF(__kvm_restore_fpu)
        .set    push
-       .set    mips64r2
        SET_HARDFLOAT
+       .set    fp=64
        mfc0    t0, CP0_STATUS
        sll     t0, t0, 5                       # is Status.FR set?
        bgez    t0, 1f                          # no: skip odd doubles
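The sll/bgez pair above is the usual trick for testing CP0 Status.FR without an extra mask: shifting the status word left by 5 moves bit 26 (ST0_FR) into the sign bit, so the bgez only falls through when FR=1 and the odd-numbered double registers actually exist. A rough C restatement, using a hypothetical helper (not part of this patch):

	#include <linux/types.h>
	#include <asm/mipsregs.h>

	/*
	 * True when Status.FR is set, i.e. all 32 double registers exist and
	 * the odd-numbered ones must be saved/restored as well.
	 */
	static bool fpu_has_odd_doubles(void)
	{
		return read_c0_status() & ST0_FR;
	}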
index 95f7906..ad28dac 100644 (file)
 
 #include "interrupt.h"
 
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
 {
        set_bit(priority, &vcpu->arch.pending_exceptions);
 }
 
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
 {
        clear_bit(priority, &vcpu->arch.pending_exceptions);
 }
@@ -114,10 +114,10 @@ void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
 
 /* Deliver the interrupt of the corresponding priority, if possible. */
 int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-                           uint32_t cause)
+                           u32 cause)
 {
        int allowed = 0;
-       uint32_t exccode;
+       u32 exccode;
 
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -196,12 +196,12 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
 }
 
 int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-                         uint32_t cause)
+                         u32 cause)
 {
        return 1;
 }
 
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause)
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause)
 {
        unsigned long *pending = &vcpu->arch.pending_exceptions;
        unsigned long *pending_clr = &vcpu->arch.pending_exceptions_clr;
index 2143884..fb118a2 100644 (file)
 #define MIPS_EXC_MAX                12
 /* XXXSL More to follow */
 
-extern char __kvm_mips_vcpu_run_end[];
-extern char mips32_exception[], mips32_exceptionEnd[];
-extern char mips32_GuestException[], mips32_GuestExceptionEnd[];
-
 #define C_TI        (_ULCAST_(1) << 30)
 
 #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
 #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (0)
 
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
 int kvm_mips_pending_timer(struct kvm_vcpu *vcpu);
 
 void kvm_mips_queue_timer_int_cb(struct kvm_vcpu *vcpu);
@@ -48,7 +44,7 @@ void kvm_mips_queue_io_int_cb(struct kvm_vcpu *vcpu,
 void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
                                struct kvm_mips_interrupt *irq);
 int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-                           uint32_t cause);
+                           u32 cause);
 int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-                         uint32_t cause);
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause);
+                         u32 cause);
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause);
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
deleted file mode 100644 (file)
index 828fcfc..0000000
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Main entry point for the guest, exception handling.
- *
- * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
- * Authors: Sanjay Lal <sanjayl@kymasys.com>
- */
-
-#include <asm/asm.h>
-#include <asm/asmmacro.h>
-#include <asm/regdef.h>
-#include <asm/mipsregs.h>
-#include <asm/stackframe.h>
-#include <asm/asm-offsets.h>
-
-#define _C_LABEL(x)     x
-#define MIPSX(name)     mips32_ ## name
-#define CALLFRAME_SIZ   32
-
-/*
- * VECTOR
- *  exception vector entrypoint
- */
-#define VECTOR(x, regmask)      \
-    .ent    _C_LABEL(x),0;      \
-    EXPORT(x);
-
-#define VECTOR_END(x)      \
-    EXPORT(x);
-
-/* Overload, Danger Will Robinson!! */
-#define PT_HOST_USERLOCAL   PT_EPC
-
-#define CP0_DDATA_LO        $28,3
-
-/* Resume Flags */
-#define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
-
-#define RESUME_GUEST            0
-#define RESUME_HOST             RESUME_FLAG_HOST
-
-/*
- * __kvm_mips_vcpu_run: entry point to the guest
- * a0: run
- * a1: vcpu
- */
-       .set    noreorder
-
-FEXPORT(__kvm_mips_vcpu_run)
-       /* k0/k1 not being used in host kernel context */
-       INT_ADDIU k1, sp, -PT_SIZE
-       LONG_S  $16, PT_R16(k1)
-       LONG_S  $17, PT_R17(k1)
-       LONG_S  $18, PT_R18(k1)
-       LONG_S  $19, PT_R19(k1)
-       LONG_S  $20, PT_R20(k1)
-       LONG_S  $21, PT_R21(k1)
-       LONG_S  $22, PT_R22(k1)
-       LONG_S  $23, PT_R23(k1)
-
-       LONG_S  $28, PT_R28(k1)
-       LONG_S  $29, PT_R29(k1)
-       LONG_S  $30, PT_R30(k1)
-       LONG_S  $31, PT_R31(k1)
-
-       /* Save hi/lo */
-       mflo    v0
-       LONG_S  v0, PT_LO(k1)
-       mfhi    v1
-       LONG_S  v1, PT_HI(k1)
-
-       /* Save host status */
-       mfc0    v0, CP0_STATUS
-       LONG_S  v0, PT_STATUS(k1)
-
-       /* Save DDATA_LO, will be used to store pointer to vcpu */
-       mfc0    v1, CP0_DDATA_LO
-       LONG_S  v1, PT_HOST_USERLOCAL(k1)
-
-       /* DDATA_LO has pointer to vcpu */
-       mtc0    a1, CP0_DDATA_LO
-
-       /* Offset into vcpu->arch */
-       INT_ADDIU k1, a1, VCPU_HOST_ARCH
-
-       /*
-        * Save the host stack to VCPU, used for exception processing
-        * when we exit from the Guest
-        */
-       LONG_S  sp, VCPU_HOST_STACK(k1)
-
-       /* Save the kernel gp as well */
-       LONG_S  gp, VCPU_HOST_GP(k1)
-
-       /*
-        * Setup status register for running the guest in UM, interrupts
-        * are disabled
-        */
-       li      k0, (ST0_EXL | KSU_USER | ST0_BEV)
-       mtc0    k0, CP0_STATUS
-       ehb
-
-       /* load up the new EBASE */
-       LONG_L  k0, VCPU_GUEST_EBASE(k1)
-       mtc0    k0, CP0_EBASE
-
-       /*
-        * Now that the new EBASE has been loaded, unset BEV, set
-        * interrupt mask as it was but make sure that timer interrupts
-        * are enabled
-        */
-       li      k0, (ST0_EXL | KSU_USER | ST0_IE)
-       andi    v0, v0, ST0_IM
-       or      k0, k0, v0
-       mtc0    k0, CP0_STATUS
-       ehb
-
-       /* Set Guest EPC */
-       LONG_L  t0, VCPU_PC(k1)
-       mtc0    t0, CP0_EPC
-
-FEXPORT(__kvm_mips_load_asid)
-       /* Set the ASID for the Guest Kernel */
-       PTR_L   t0, VCPU_COP0(k1)
-       LONG_L  t0, COP0_STATUS(t0)
-       andi    t0, KSU_USER | ST0_ERL | ST0_EXL
-       xori    t0, KSU_USER
-       bnez    t0, 1f          /* If kernel */
-        INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-       INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
-1:
-       /* t1: contains the base of the ASID array, need to get the cpu id */
-       LONG_L  t2, TI_CPU($28)             /* smp_processor_id */
-       INT_SLL t2, t2, 2                   /* x4 */
-       REG_ADDU t3, t1, t2
-       LONG_L  k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-       li      t3, CPUINFO_SIZE/4
-       mul     t2, t2, t3              /* x sizeof(struct cpuinfo_mips)/4 */
-       LONG_L  t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
-       and     k0, k0, t2
-#else
-       andi    k0, k0, MIPS_ENTRYHI_ASID
-#endif
-       mtc0    k0, CP0_ENTRYHI
-       ehb
-
-       /* Disable RDHWR access */
-       mtc0    zero, CP0_HWRENA
-
-       .set    noat
-       /* Now load up the Guest Context from VCPU */
-       LONG_L  $1, VCPU_R1(k1)
-       LONG_L  $2, VCPU_R2(k1)
-       LONG_L  $3, VCPU_R3(k1)
-
-       LONG_L  $4, VCPU_R4(k1)
-       LONG_L  $5, VCPU_R5(k1)
-       LONG_L  $6, VCPU_R6(k1)
-       LONG_L  $7, VCPU_R7(k1)
-
-       LONG_L  $8, VCPU_R8(k1)
-       LONG_L  $9, VCPU_R9(k1)
-       LONG_L  $10, VCPU_R10(k1)
-       LONG_L  $11, VCPU_R11(k1)
-       LONG_L  $12, VCPU_R12(k1)
-       LONG_L  $13, VCPU_R13(k1)
-       LONG_L  $14, VCPU_R14(k1)
-       LONG_L  $15, VCPU_R15(k1)
-       LONG_L  $16, VCPU_R16(k1)
-       LONG_L  $17, VCPU_R17(k1)
-       LONG_L  $18, VCPU_R18(k1)
-       LONG_L  $19, VCPU_R19(k1)
-       LONG_L  $20, VCPU_R20(k1)
-       LONG_L  $21, VCPU_R21(k1)
-       LONG_L  $22, VCPU_R22(k1)
-       LONG_L  $23, VCPU_R23(k1)
-       LONG_L  $24, VCPU_R24(k1)
-       LONG_L  $25, VCPU_R25(k1)
-
-       /* k0/k1 loaded up later */
-
-       LONG_L  $28, VCPU_R28(k1)
-       LONG_L  $29, VCPU_R29(k1)
-       LONG_L  $30, VCPU_R30(k1)
-       LONG_L  $31, VCPU_R31(k1)
-
-       /* Restore hi/lo */
-       LONG_L  k0, VCPU_LO(k1)
-       mtlo    k0
-
-       LONG_L  k0, VCPU_HI(k1)
-       mthi    k0
-
-FEXPORT(__kvm_mips_load_k0k1)
-       /* Restore the guest's k0/k1 registers */
-       LONG_L  k0, VCPU_R26(k1)
-       LONG_L  k1, VCPU_R27(k1)
-
-       /* Jump to guest */
-       eret
-EXPORT(__kvm_mips_vcpu_run_end)
-
-VECTOR(MIPSX(exception), unknown)
-/* Find out what mode we came from and jump to the proper handler. */
-       mtc0    k0, CP0_ERROREPC        #01: Save guest k0
-       ehb                             #02:
-
-       mfc0    k0, CP0_EBASE           #02: Get EBASE
-       INT_SRL k0, k0, 10              #03: Get rid of CPUNum
-       INT_SLL k0, k0, 10              #04
-       LONG_S  k1, 0x3000(k0)          #05: Save k1 @ offset 0x3000
-       INT_ADDIU k0, k0, 0x2000        #06: Exception handler is
-                                       #    installed @ offset 0x2000
-       j       k0                      #07: jump to the function
-        nop                            #08: branch delay slot
-VECTOR_END(MIPSX(exceptionEnd))
-.end MIPSX(exception)
-
-/*
- * Generic Guest exception handler. We end up here when the guest
- * does something that causes a trap to kernel mode.
- */
-NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
-       /* Get the VCPU pointer from DDTATA_LO */
-       mfc0    k1, CP0_DDATA_LO
-       INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
-       /* Start saving Guest context to VCPU */
-       LONG_S  $0, VCPU_R0(k1)
-       LONG_S  $1, VCPU_R1(k1)
-       LONG_S  $2, VCPU_R2(k1)
-       LONG_S  $3, VCPU_R3(k1)
-       LONG_S  $4, VCPU_R4(k1)
-       LONG_S  $5, VCPU_R5(k1)
-       LONG_S  $6, VCPU_R6(k1)
-       LONG_S  $7, VCPU_R7(k1)
-       LONG_S  $8, VCPU_R8(k1)
-       LONG_S  $9, VCPU_R9(k1)
-       LONG_S  $10, VCPU_R10(k1)
-       LONG_S  $11, VCPU_R11(k1)
-       LONG_S  $12, VCPU_R12(k1)
-       LONG_S  $13, VCPU_R13(k1)
-       LONG_S  $14, VCPU_R14(k1)
-       LONG_S  $15, VCPU_R15(k1)
-       LONG_S  $16, VCPU_R16(k1)
-       LONG_S  $17, VCPU_R17(k1)
-       LONG_S  $18, VCPU_R18(k1)
-       LONG_S  $19, VCPU_R19(k1)
-       LONG_S  $20, VCPU_R20(k1)
-       LONG_S  $21, VCPU_R21(k1)
-       LONG_S  $22, VCPU_R22(k1)
-       LONG_S  $23, VCPU_R23(k1)
-       LONG_S  $24, VCPU_R24(k1)
-       LONG_S  $25, VCPU_R25(k1)
-
-       /* Guest k0/k1 saved later */
-
-       LONG_S  $28, VCPU_R28(k1)
-       LONG_S  $29, VCPU_R29(k1)
-       LONG_S  $30, VCPU_R30(k1)
-       LONG_S  $31, VCPU_R31(k1)
-
-       .set at
-
-       /* We need to save hi/lo and restore them on the way out */
-       mfhi    t0
-       LONG_S  t0, VCPU_HI(k1)
-
-       mflo    t0
-       LONG_S  t0, VCPU_LO(k1)
-
-       /* Finally save guest k0/k1 to VCPU */
-       mfc0    t0, CP0_ERROREPC
-       LONG_S  t0, VCPU_R26(k1)
-
-       /* Get GUEST k1 and save it in VCPU */
-       PTR_LI  t1, ~0x2ff
-       mfc0    t0, CP0_EBASE
-       and     t0, t0, t1
-       LONG_L  t0, 0x3000(t0)
-       LONG_S  t0, VCPU_R27(k1)
-
-       /* Now that context has been saved, we can use other registers */
-
-       /* Restore vcpu */
-       mfc0    a1, CP0_DDATA_LO
-       move    s1, a1
-
-       /* Restore run (vcpu->run) */
-       LONG_L  a0, VCPU_RUN(a1)
-       /* Save pointer to run in s0, will be saved by the compiler */
-       move    s0, a0
-
-       /*
-        * Save Host level EPC, BadVaddr and Cause to VCPU, useful to
-        * process the exception
-        */
-       mfc0    k0,CP0_EPC
-       LONG_S  k0, VCPU_PC(k1)
-
-       mfc0    k0, CP0_BADVADDR
-       LONG_S  k0, VCPU_HOST_CP0_BADVADDR(k1)
-
-       mfc0    k0, CP0_CAUSE
-       LONG_S  k0, VCPU_HOST_CP0_CAUSE(k1)
-
-       mfc0    k0, CP0_ENTRYHI
-       LONG_S  k0, VCPU_HOST_ENTRYHI(k1)
-
-       /* Now restore the host state just enough to run the handlers */
-
-       /* Switch EBASE to the one used by Linux */
-       /* load up the host EBASE */
-       mfc0    v0, CP0_STATUS
-
-       or      k0, v0, ST0_BEV
-
-       mtc0    k0, CP0_STATUS
-       ehb
-
-       LONG_L  k0, VCPU_HOST_EBASE(k1)
-       mtc0    k0,CP0_EBASE
-
-       /*
-        * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't
-        * trigger FPE for pending exceptions.
-        */
-       and     v1, v0, ST0_CU1
-       beqz    v1, 1f
-        nop
-       .set    push
-       SET_HARDFLOAT
-       cfc1    t0, fcr31
-       sw      t0, VCPU_FCR31(k1)
-       ctc1    zero,fcr31
-       .set    pop
-1:
-
-#ifdef CONFIG_CPU_HAS_MSA
-       /*
-        * If MSA is enabled, save MSACSR and clear it so that later
-        * instructions don't trigger MSAFPE for pending exceptions.
-        */
-       mfc0    t0, CP0_CONFIG3
-       ext     t0, t0, 28, 1 /* MIPS_CONF3_MSAP */
-       beqz    t0, 1f
-        nop
-       mfc0    t0, CP0_CONFIG5
-       ext     t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */
-       beqz    t0, 1f
-        nop
-       _cfcmsa t0, MSA_CSR
-       sw      t0, VCPU_MSA_CSR(k1)
-       _ctcmsa MSA_CSR, zero
-1:
-#endif
-
-       /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
-       and     v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
-       or      v0, v0, ST0_CU0
-       mtc0    v0, CP0_STATUS
-       ehb
-
-       /* Load up host GP */
-       LONG_L  gp, VCPU_HOST_GP(k1)
-
-       /* Need a stack before we can jump to "C" */
-       LONG_L  sp, VCPU_HOST_STACK(k1)
-
-       /* Saved host state */
-       INT_ADDIU sp, sp, -PT_SIZE
-
-       /*
-        * XXXKYMA do we need to load the host ASID, maybe not because the
-        * kernel entries are marked GLOBAL, need to verify
-        */
-
-       /* Restore host DDATA_LO */
-       LONG_L  k0, PT_HOST_USERLOCAL(sp)
-       mtc0    k0, CP0_DDATA_LO
-
-       /* Restore RDHWR access */
-       PTR_LI  k0, 0x2000000F
-       mtc0    k0, CP0_HWRENA
-
-       /* Jump to handler */
-FEXPORT(__kvm_mips_jump_to_handler)
-       /*
-        * XXXKYMA: not sure if this is safe, how large is the stack??
-        * Now jump to the kvm_mips_handle_exit() to see if we can deal
-        * with this in the kernel
-        */
-       PTR_LA  t9, kvm_mips_handle_exit
-       jalr.hb t9
-        INT_ADDIU sp, sp, -CALLFRAME_SIZ           /* BD Slot */
-
-       /* Return from handler Make sure interrupts are disabled */
-       di
-       ehb
-
-       /*
-        * XXXKYMA: k0/k1 could have been blown away if we processed
-        * an exception while we were handling the exception from the
-        * guest, reload k1
-        */
-
-       move    k1, s1
-       INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
-       /*
-        * Check return value, should tell us if we are returning to the
-        * host (handle I/O etc)or resuming the guest
-        */
-       andi    t0, v0, RESUME_HOST
-       bnez    t0, __kvm_mips_return_to_host
-        nop
-
-__kvm_mips_return_to_guest:
-       /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
-       mtc0    s1, CP0_DDATA_LO
-
-       /* Load up the Guest EBASE to minimize the window where BEV is set */
-       LONG_L  t0, VCPU_GUEST_EBASE(k1)
-
-       /* Switch EBASE back to the one used by KVM */
-       mfc0    v1, CP0_STATUS
-       or      k0, v1, ST0_BEV
-       mtc0    k0, CP0_STATUS
-       ehb
-       mtc0    t0, CP0_EBASE
-
-       /* Setup status register for running guest in UM */
-       or      v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-       and     v1, v1, ~(ST0_CU0 | ST0_MX)
-       mtc0    v1, CP0_STATUS
-       ehb
-
-       /* Set Guest EPC */
-       LONG_L  t0, VCPU_PC(k1)
-       mtc0    t0, CP0_EPC
-
-       /* Set the ASID for the Guest Kernel */
-       PTR_L   t0, VCPU_COP0(k1)
-       LONG_L  t0, COP0_STATUS(t0)
-       andi    t0, KSU_USER | ST0_ERL | ST0_EXL
-       xori    t0, KSU_USER
-       bnez    t0, 1f          /* If kernel */
-        INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-       INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
-1:
-       /* t1: contains the base of the ASID array, need to get the cpu id  */
-       LONG_L  t2, TI_CPU($28)         /* smp_processor_id */
-       INT_SLL t2, t2, 2               /* x4 */
-       REG_ADDU t3, t1, t2
-       LONG_L  k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-       li      t3, CPUINFO_SIZE/4
-       mul     t2, t2, t3              /* x sizeof(struct cpuinfo_mips)/4 */
-       LONG_L  t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
-       and     k0, k0, t2
-#else
-       andi    k0, k0, MIPS_ENTRYHI_ASID
-#endif
-       mtc0    k0, CP0_ENTRYHI
-       ehb
-
-       /* Disable RDHWR access */
-       mtc0    zero, CP0_HWRENA
-
-       .set    noat
-       /* load the guest context from VCPU and return */
-       LONG_L  $0, VCPU_R0(k1)
-       LONG_L  $1, VCPU_R1(k1)
-       LONG_L  $2, VCPU_R2(k1)
-       LONG_L  $3, VCPU_R3(k1)
-       LONG_L  $4, VCPU_R4(k1)
-       LONG_L  $5, VCPU_R5(k1)
-       LONG_L  $6, VCPU_R6(k1)
-       LONG_L  $7, VCPU_R7(k1)
-       LONG_L  $8, VCPU_R8(k1)
-       LONG_L  $9, VCPU_R9(k1)
-       LONG_L  $10, VCPU_R10(k1)
-       LONG_L  $11, VCPU_R11(k1)
-       LONG_L  $12, VCPU_R12(k1)
-       LONG_L  $13, VCPU_R13(k1)
-       LONG_L  $14, VCPU_R14(k1)
-       LONG_L  $15, VCPU_R15(k1)
-       LONG_L  $16, VCPU_R16(k1)
-       LONG_L  $17, VCPU_R17(k1)
-       LONG_L  $18, VCPU_R18(k1)
-       LONG_L  $19, VCPU_R19(k1)
-       LONG_L  $20, VCPU_R20(k1)
-       LONG_L  $21, VCPU_R21(k1)
-       LONG_L  $22, VCPU_R22(k1)
-       LONG_L  $23, VCPU_R23(k1)
-       LONG_L  $24, VCPU_R24(k1)
-       LONG_L  $25, VCPU_R25(k1)
-
-       /* $/k1 loaded later */
-       LONG_L  $28, VCPU_R28(k1)
-       LONG_L  $29, VCPU_R29(k1)
-       LONG_L  $30, VCPU_R30(k1)
-       LONG_L  $31, VCPU_R31(k1)
-
-FEXPORT(__kvm_mips_skip_guest_restore)
-       LONG_L  k0, VCPU_HI(k1)
-       mthi    k0
-
-       LONG_L  k0, VCPU_LO(k1)
-       mtlo    k0
-
-       LONG_L  k0, VCPU_R26(k1)
-       LONG_L  k1, VCPU_R27(k1)
-
-       eret
-       .set    at
-
-__kvm_mips_return_to_host:
-       /* EBASE is already pointing to Linux */
-       LONG_L  k1, VCPU_HOST_STACK(k1)
-       INT_ADDIU k1,k1, -PT_SIZE
-
-       /* Restore host DDATA_LO */
-       LONG_L  k0, PT_HOST_USERLOCAL(k1)
-       mtc0    k0, CP0_DDATA_LO
-
-       /*
-        * r2/v0 is the return code, shift it down by 2 (arithmetic)
-        * to recover the err code
-        */
-       INT_SRA k0, v0, 2
-       move    $2, k0
-
-       /* Load context saved on the host stack */
-       LONG_L  $16, PT_R16(k1)
-       LONG_L  $17, PT_R17(k1)
-       LONG_L  $18, PT_R18(k1)
-       LONG_L  $19, PT_R19(k1)
-       LONG_L  $20, PT_R20(k1)
-       LONG_L  $21, PT_R21(k1)
-       LONG_L  $22, PT_R22(k1)
-       LONG_L  $23, PT_R23(k1)
-
-       LONG_L  $28, PT_R28(k1)
-       LONG_L  $29, PT_R29(k1)
-       LONG_L  $30, PT_R30(k1)
-
-       LONG_L  k0, PT_HI(k1)
-       mthi    k0
-
-       LONG_L  k0, PT_LO(k1)
-       mtlo    k0
-
-       /* Restore RDHWR access */
-       PTR_LI  k0, 0x2000000F
-       mtc0    k0, CP0_HWRENA
-
-       /* Restore RA, which is the address we will return to */
-       LONG_L  ra, PT_R31(k1)
-       j       ra
-        nop
-
-VECTOR_END(MIPSX(GuestExceptionEnd))
-.end MIPSX(GuestException)
-
-MIPSX(exceptions):
-       ####
-       ##### The exception handlers.
-       #####
-       .word _C_LABEL(MIPSX(GuestException))   #  0
-       .word _C_LABEL(MIPSX(GuestException))   #  1
-       .word _C_LABEL(MIPSX(GuestException))   #  2
-       .word _C_LABEL(MIPSX(GuestException))   #  3
-       .word _C_LABEL(MIPSX(GuestException))   #  4
-       .word _C_LABEL(MIPSX(GuestException))   #  5
-       .word _C_LABEL(MIPSX(GuestException))   #  6
-       .word _C_LABEL(MIPSX(GuestException))   #  7
-       .word _C_LABEL(MIPSX(GuestException))   #  8
-       .word _C_LABEL(MIPSX(GuestException))   #  9
-       .word _C_LABEL(MIPSX(GuestException))   # 10
-       .word _C_LABEL(MIPSX(GuestException))   # 11
-       .word _C_LABEL(MIPSX(GuestException))   # 12
-       .word _C_LABEL(MIPSX(GuestException))   # 13
-       .word _C_LABEL(MIPSX(GuestException))   # 14
-       .word _C_LABEL(MIPSX(GuestException))   # 15
-       .word _C_LABEL(MIPSX(GuestException))   # 16
-       .word _C_LABEL(MIPSX(GuestException))   # 17
-       .word _C_LABEL(MIPSX(GuestException))   # 18
-       .word _C_LABEL(MIPSX(GuestException))   # 19
-       .word _C_LABEL(MIPSX(GuestException))   # 20
-       .word _C_LABEL(MIPSX(GuestException))   # 21
-       .word _C_LABEL(MIPSX(GuestException))   # 22
-       .word _C_LABEL(MIPSX(GuestException))   # 23
-       .word _C_LABEL(MIPSX(GuestException))   # 24
-       .word _C_LABEL(MIPSX(GuestException))   # 25
-       .word _C_LABEL(MIPSX(GuestException))   # 26
-       .word _C_LABEL(MIPSX(GuestException))   # 27
-       .word _C_LABEL(MIPSX(GuestException))   # 28
-       .word _C_LABEL(MIPSX(GuestException))   # 29
-       .word _C_LABEL(MIPSX(GuestException))   # 30
-       .word _C_LABEL(MIPSX(GuestException))   # 31
index 44da525..a6ea084 100644 (file)
@@ -9,6 +9,7 @@
  * Authors: Sanjay Lal <sanjayl@kymasys.com>
  */
 
+#include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kdebug.h>
@@ -147,7 +148,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
        /* Put the pages we reserved for the guest pmap */
        for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
                if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
-                       kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]);
+                       kvm_release_pfn_clean(kvm->arch.guest_pmap[i]);
        }
        kfree(kvm->arch.guest_pmap);
 
@@ -244,10 +245,27 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
        }
 }
 
+static inline void dump_handler(const char *symbol, void *start, void *end)
+{
+       u32 *p;
+
+       pr_debug("LEAF(%s)\n", symbol);
+
+       pr_debug("\t.set push\n");
+       pr_debug("\t.set noreorder\n");
+
+       for (p = start; p < (u32 *)end; ++p)
+               pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p);
+
+       pr_debug("\t.set\tpop\n");
+
+       pr_debug("\tEND(%s)\n", symbol);
+}
+
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-       int err, size, offset;
-       void *gebase;
+       int err, size;
+       void *gebase, *p, *handler;
        int i;
 
        struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -273,9 +291,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        else
                size = 0x4000;
 
-       /* Save Linux EBASE */
-       vcpu->arch.host_ebase = (void *)read_c0_ebase();
-
        gebase = kzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
 
        if (!gebase) {
@@ -285,44 +300,53 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n",
                  ALIGN(size, PAGE_SIZE), gebase);
 
+       /*
+        * Check new ebase actually fits in CP0_EBase. The lack of a write gate
+        * limits us to the low 512MB of physical address space. If the memory
+        * we allocate is out of range, just give up now.
+        */
+       if (!cpu_has_ebase_wg && virt_to_phys(gebase) >= 0x20000000) {
+               kvm_err("CP0_EBase.WG required for guest exception base %pK\n",
+                       gebase);
+               err = -ENOMEM;
+               goto out_free_gebase;
+       }
+
        /* Save new ebase */
        vcpu->arch.guest_ebase = gebase;
 
-       /* Copy L1 Guest Exception handler to correct offset */
+       /* Build guest exception vectors dynamically in unmapped memory */
+       handler = gebase + 0x2000;
 
        /* TLB Refill, EXL = 0 */
-       memcpy(gebase, mips32_exception,
-              mips32_exceptionEnd - mips32_exception);
+       kvm_mips_build_exception(gebase, handler);
 
        /* General Exception Entry point */
-       memcpy(gebase + 0x180, mips32_exception,
-              mips32_exceptionEnd - mips32_exception);
+       kvm_mips_build_exception(gebase + 0x180, handler);
 
        /* For vectored interrupts poke the exception code @ all offsets 0-7 */
        for (i = 0; i < 8; i++) {
                kvm_debug("L1 Vectored handler @ %p\n",
                          gebase + 0x200 + (i * VECTORSPACING));
-               memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception,
-                      mips32_exceptionEnd - mips32_exception);
+               kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING,
+                                        handler);
        }
 
-       /* General handler, relocate to unmapped space for sanity's sake */
-       offset = 0x2000;
-       kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n",
-                 gebase + offset,
-                 mips32_GuestExceptionEnd - mips32_GuestException);
+       /* General exit handler */
+       p = handler;
+       p = kvm_mips_build_exit(p);
 
-       memcpy(gebase + offset, mips32_GuestException,
-              mips32_GuestExceptionEnd - mips32_GuestException);
+       /* Guest entry routine */
+       vcpu->arch.vcpu_run = p;
+       p = kvm_mips_build_vcpu_run(p);
 
-#ifdef MODULE
-       offset += mips32_GuestExceptionEnd - mips32_GuestException;
-       memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run,
-              __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run);
-       vcpu->arch.vcpu_run = gebase + offset;
-#else
-       vcpu->arch.vcpu_run = __kvm_mips_vcpu_run;
-#endif
+       /* Dump the generated code */
+       pr_debug("#include <asm/asm.h>\n");
+       pr_debug("#include <asm/regdef.h>\n");
+       pr_debug("\n");
+       dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p);
+       dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200);
+       dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run);
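Everything the old locore.S used to provide is now generated straight into the gebase allocation by the calls above. As built by this function, the buffer roughly looks like the sketch below (offsets relative to gebase, VECTORSPACING as used above); descriptive comment only:

	/*
	 *   +0x000                      TLB refill entry (EXL = 0)
	 *   +0x180                      general exception entry
	 *   +0x200 + i*VECTORSPACING    vectored interrupt entries, i = 0..7
	 *   +0x2000                     exit handler (kvm_mips_build_exit())
	 *   vcpu->arch.vcpu_run         guest entry (kvm_mips_build_vcpu_run()),
	 *                               emitted immediately after the exit handler
	 */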
 
        /* Invalidate the icache for these ranges */
        local_flush_icache_range((unsigned long)gebase,
@@ -408,17 +432,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        kvm_mips_deliver_interrupts(vcpu,
                                    kvm_read_c0_guest_cause(vcpu->arch.cop0));
 
-       __kvm_guest_enter();
+       guest_enter_irqoff();
 
        /* Disable hardware page table walking while in guest */
        htw_stop();
 
+       trace_kvm_enter(vcpu);
        r = vcpu->arch.vcpu_run(run, vcpu);
+       trace_kvm_out(vcpu);
 
        /* Re-enable HTW before enabling interrupts */
        htw_start();
 
-       __kvm_guest_exit();
+       guest_exit_irqoff();
        local_irq_enable();
 
        if (vcpu->sigset_active)
@@ -507,8 +533,10 @@ static u64 kvm_mips_get_one_regs[] = {
        KVM_REG_MIPS_R30,
        KVM_REG_MIPS_R31,
 
+#ifndef CONFIG_CPU_MIPSR6
        KVM_REG_MIPS_HI,
        KVM_REG_MIPS_LO,
+#endif
        KVM_REG_MIPS_PC,
 
        KVM_REG_MIPS_CP0_INDEX,
@@ -539,6 +567,104 @@ static u64 kvm_mips_get_one_regs[] = {
        KVM_REG_MIPS_COUNT_HZ,
 };
 
+static u64 kvm_mips_get_one_regs_fpu[] = {
+       KVM_REG_MIPS_FCR_IR,
+       KVM_REG_MIPS_FCR_CSR,
+};
+
+static u64 kvm_mips_get_one_regs_msa[] = {
+       KVM_REG_MIPS_MSA_IR,
+       KVM_REG_MIPS_MSA_CSR,
+};
+
+static u64 kvm_mips_get_one_regs_kscratch[] = {
+       KVM_REG_MIPS_CP0_KSCRATCH1,
+       KVM_REG_MIPS_CP0_KSCRATCH2,
+       KVM_REG_MIPS_CP0_KSCRATCH3,
+       KVM_REG_MIPS_CP0_KSCRATCH4,
+       KVM_REG_MIPS_CP0_KSCRATCH5,
+       KVM_REG_MIPS_CP0_KSCRATCH6,
+};
+
+static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
+{
+       unsigned long ret;
+
+       ret = ARRAY_SIZE(kvm_mips_get_one_regs);
+       if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+               ret += ARRAY_SIZE(kvm_mips_get_one_regs_fpu) + 48;
+               /* odd doubles */
+               if (boot_cpu_data.fpu_id & MIPS_FPIR_F64)
+                       ret += 16;
+       }
+       if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+               ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32;
+       ret += __arch_hweight8(vcpu->arch.kscratch_enabled);
+       ret += kvm_mips_callbacks->num_regs(vcpu);
+
+       return ret;
+}
+
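Working through the sum in kvm_mips_num_regs() for a hypothetical guest that can use an F64-capable FPU, MSA, and all six KScratch registers (descriptive comment only):

	/*
	 *   ARRAY_SIZE(kvm_mips_get_one_regs)          base list above
	 * + 2 + 48    FCR_IR/FCR_CSR, 32 x FPR_32, 16 x even-numbered FPR_64
	 * + 16        odd-numbered FPR_64, only when MIPS_FPIR_F64 is set
	 * + 2 + 32    MSA_IR/MSA_CSR, 32 x VEC_128
	 * + 6         KScratch1..6, when kscratch_enabled bits 2..7 are all set
	 * + kvm_mips_callbacks->num_regs(vcpu)         backend-specific extras
	 */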
+static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
+{
+       u64 index;
+       unsigned int i;
+
+       if (copy_to_user(indices, kvm_mips_get_one_regs,
+                        sizeof(kvm_mips_get_one_regs)))
+               return -EFAULT;
+       indices += ARRAY_SIZE(kvm_mips_get_one_regs);
+
+       if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+               if (copy_to_user(indices, kvm_mips_get_one_regs_fpu,
+                                sizeof(kvm_mips_get_one_regs_fpu)))
+                       return -EFAULT;
+               indices += ARRAY_SIZE(kvm_mips_get_one_regs_fpu);
+
+               for (i = 0; i < 32; ++i) {
+                       index = KVM_REG_MIPS_FPR_32(i);
+                       if (copy_to_user(indices, &index, sizeof(index)))
+                               return -EFAULT;
+                       ++indices;
+
+                       /* skip odd doubles if no F64 */
+                       if (i & 1 && !(boot_cpu_data.fpu_id & MIPS_FPIR_F64))
+                               continue;
+
+                       index = KVM_REG_MIPS_FPR_64(i);
+                       if (copy_to_user(indices, &index, sizeof(index)))
+                               return -EFAULT;
+                       ++indices;
+               }
+       }
+
+       if (kvm_mips_guest_can_have_msa(&vcpu->arch)) {
+               if (copy_to_user(indices, kvm_mips_get_one_regs_msa,
+                                sizeof(kvm_mips_get_one_regs_msa)))
+                       return -EFAULT;
+               indices += ARRAY_SIZE(kvm_mips_get_one_regs_msa);
+
+               for (i = 0; i < 32; ++i) {
+                       index = KVM_REG_MIPS_VEC_128(i);
+                       if (copy_to_user(indices, &index, sizeof(index)))
+                               return -EFAULT;
+                       ++indices;
+               }
+       }
+
+       for (i = 0; i < 6; ++i) {
+               if (!(vcpu->arch.kscratch_enabled & BIT(i + 2)))
+                       continue;
+
+               if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i],
+                                sizeof(kvm_mips_get_one_regs_kscratch[i])))
+                       return -EFAULT;
+               ++indices;
+       }
+
+       return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
+}
+
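From userspace this enlarged list is consumed through the standard KVM_GET_REG_LIST two-step: probe with n = 0 (the ioctl fails with E2BIG but reports the required count), then retry with a large enough buffer. A sketch under those assumptions; get_reg_list() is a hypothetical helper and vcpu_fd an already-created vCPU descriptor, neither part of this patch:

	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static struct kvm_reg_list *get_reg_list(int vcpu_fd)
	{
		struct kvm_reg_list probe, *list;

		memset(&probe, 0, sizeof(probe));
		/* n = 0: expect -E2BIG, with probe.n set to the real count */
		if (ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe) == 0)
			return NULL;

		list = calloc(1, sizeof(*list) + probe.n * sizeof(__u64));
		if (!list)
			return NULL;
		list->n = probe.n;

		if (ioctl(vcpu_fd, KVM_GET_REG_LIST, list) < 0) {
			free(list);
			return NULL;
		}
		return list;	/* list->reg[] now holds the KVM_REG_MIPS_* ids */
	}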
 static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
                            const struct kvm_one_reg *reg)
 {
@@ -554,12 +680,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_R0 ... KVM_REG_MIPS_R31:
                v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0];
                break;
+#ifndef CONFIG_CPU_MIPSR6
        case KVM_REG_MIPS_HI:
                v = (long)vcpu->arch.hi;
                break;
        case KVM_REG_MIPS_LO:
                v = (long)vcpu->arch.lo;
                break;
+#endif
        case KVM_REG_MIPS_PC:
                v = (long)vcpu->arch.pc;
                break;
@@ -688,17 +816,37 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_ERROREPC:
                v = (long)kvm_read_c0_guest_errorepc(cop0);
                break;
+       case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+               idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+               if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+                       return -EINVAL;
+               switch (idx) {
+               case 2:
+                       v = (long)kvm_read_c0_guest_kscratch1(cop0);
+                       break;
+               case 3:
+                       v = (long)kvm_read_c0_guest_kscratch2(cop0);
+                       break;
+               case 4:
+                       v = (long)kvm_read_c0_guest_kscratch3(cop0);
+                       break;
+               case 5:
+                       v = (long)kvm_read_c0_guest_kscratch4(cop0);
+                       break;
+               case 6:
+                       v = (long)kvm_read_c0_guest_kscratch5(cop0);
+                       break;
+               case 7:
+                       v = (long)kvm_read_c0_guest_kscratch6(cop0);
+                       break;
+               }
+               break;
        /* registers to be handled specially */
-       case KVM_REG_MIPS_CP0_COUNT:
-       case KVM_REG_MIPS_COUNT_CTL:
-       case KVM_REG_MIPS_COUNT_RESUME:
-       case KVM_REG_MIPS_COUNT_HZ:
+       default:
                ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
                if (ret)
                        return ret;
                break;
-       default:
-               return -EINVAL;
        }
        if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64) {
                u64 __user *uaddr64 = (u64 __user *)(long)reg->addr;
@@ -755,12 +903,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31:
                vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v;
                break;
+#ifndef CONFIG_CPU_MIPSR6
        case KVM_REG_MIPS_HI:
                vcpu->arch.hi = v;
                break;
        case KVM_REG_MIPS_LO:
                vcpu->arch.lo = v;
                break;
+#endif
        case KVM_REG_MIPS_PC:
                vcpu->arch.pc = v;
                break;
@@ -859,22 +1009,34 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_ERROREPC:
                kvm_write_c0_guest_errorepc(cop0, v);
                break;
+       case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+               idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+               if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+                       return -EINVAL;
+               switch (idx) {
+               case 2:
+                       kvm_write_c0_guest_kscratch1(cop0, v);
+                       break;
+               case 3:
+                       kvm_write_c0_guest_kscratch2(cop0, v);
+                       break;
+               case 4:
+                       kvm_write_c0_guest_kscratch3(cop0, v);
+                       break;
+               case 5:
+                       kvm_write_c0_guest_kscratch4(cop0, v);
+                       break;
+               case 6:
+                       kvm_write_c0_guest_kscratch5(cop0, v);
+                       break;
+               case 7:
+                       kvm_write_c0_guest_kscratch6(cop0, v);
+                       break;
+               }
+               break;
        /* registers to be handled specially */
-       case KVM_REG_MIPS_CP0_COUNT:
-       case KVM_REG_MIPS_CP0_COMPARE:
-       case KVM_REG_MIPS_CP0_CAUSE:
-       case KVM_REG_MIPS_CP0_CONFIG:
-       case KVM_REG_MIPS_CP0_CONFIG1:
-       case KVM_REG_MIPS_CP0_CONFIG2:
-       case KVM_REG_MIPS_CP0_CONFIG3:
-       case KVM_REG_MIPS_CP0_CONFIG4:
-       case KVM_REG_MIPS_CP0_CONFIG5:
-       case KVM_REG_MIPS_COUNT_CTL:
-       case KVM_REG_MIPS_COUNT_RESUME:
-       case KVM_REG_MIPS_COUNT_HZ:
-               return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
        default:
-               return -EINVAL;
+               return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
        }
        return 0;
 }
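The new KSCRATCH cases in both accessors use the same mapping: the one-reg id is converted to a CP0 select (idx = id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2) and only exposed when the matching bit of vcpu->arch.kscratch_enabled is set. For reference (descriptive comment only):

	/*
	 *   KVM_REG_MIPS_CP0_KSCRATCH1  ->  idx 2  ->  KScratch1 (CP0 $31, sel 2)
	 *   KVM_REG_MIPS_CP0_KSCRATCH2  ->  idx 3  ->  KScratch2 (CP0 $31, sel 3)
	 *     ...
	 *   KVM_REG_MIPS_CP0_KSCRATCH6  ->  idx 7  ->  KScratch6 (CP0 $31, sel 7)
	 *
	 * Accesses to ids whose BIT(idx) is clear in kscratch_enabled return
	 * -EINVAL, as the checks above show.
	 */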
@@ -927,23 +1089,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
        }
        case KVM_GET_REG_LIST: {
                struct kvm_reg_list __user *user_list = argp;
-               u64 __user *reg_dest;
                struct kvm_reg_list reg_list;
                unsigned n;
 
                if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
                        return -EFAULT;
                n = reg_list.n;
-               reg_list.n = ARRAY_SIZE(kvm_mips_get_one_regs);
+               reg_list.n = kvm_mips_num_regs(vcpu);
                if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
                        return -EFAULT;
                if (n < reg_list.n)
                        return -E2BIG;
-               reg_dest = user_list->reg;
-               if (copy_to_user(reg_dest, kvm_mips_get_one_regs,
-                                sizeof(kvm_mips_get_one_regs)))
-                       return -EFAULT;
-               return 0;
+               return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
        }
        case KVM_NMI:
                /* Treat the NMI as a CPU reset */
@@ -1222,7 +1379,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 static void kvm_mips_set_c0_status(void)
 {
-       uint32_t status = read_c0_status();
+       u32 status = read_c0_status();
 
        if (cpu_has_dsp)
                status |= (ST0_MX);
@@ -1236,9 +1393,9 @@ static void kvm_mips_set_c0_status(void)
  */
 int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
-       uint32_t cause = vcpu->arch.host_cp0_cause;
-       uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
+       u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
@@ -1260,6 +1417,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
        kvm_debug("kvm_mips_handle_exit: cause: %#x, PC: %p, kvm_run: %p, kvm_vcpu: %p\n",
                        cause, opc, run, vcpu);
+       trace_kvm_exit(vcpu, exccode);
 
        /*
         * Do a privilege check, if in UM most of these exit conditions end up
@@ -1279,7 +1437,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                kvm_debug("[%d]EXCCODE_INT @ %p\n", vcpu->vcpu_id, opc);
 
                ++vcpu->stat.int_exits;
-               trace_kvm_exit(vcpu, INT_EXITS);
 
                if (need_resched())
                        cond_resched();
@@ -1291,7 +1448,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                kvm_debug("EXCCODE_CPU: @ PC: %p\n", opc);
 
                ++vcpu->stat.cop_unusable_exits;
-               trace_kvm_exit(vcpu, COP_UNUSABLE_EXITS);
                ret = kvm_mips_callbacks->handle_cop_unusable(vcpu);
                /* XXXKYMA: Might need to return to user space */
                if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN)
@@ -1300,7 +1456,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
        case EXCCODE_MOD:
                ++vcpu->stat.tlbmod_exits;
-               trace_kvm_exit(vcpu, TLBMOD_EXITS);
                ret = kvm_mips_callbacks->handle_tlb_mod(vcpu);
                break;
 
@@ -1310,7 +1465,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                          badvaddr);
 
                ++vcpu->stat.tlbmiss_st_exits;
-               trace_kvm_exit(vcpu, TLBMISS_ST_EXITS);
                ret = kvm_mips_callbacks->handle_tlb_st_miss(vcpu);
                break;
 
@@ -1319,61 +1473,51 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                          cause, opc, badvaddr);
 
                ++vcpu->stat.tlbmiss_ld_exits;
-               trace_kvm_exit(vcpu, TLBMISS_LD_EXITS);
                ret = kvm_mips_callbacks->handle_tlb_ld_miss(vcpu);
                break;
 
        case EXCCODE_ADES:
                ++vcpu->stat.addrerr_st_exits;
-               trace_kvm_exit(vcpu, ADDRERR_ST_EXITS);
                ret = kvm_mips_callbacks->handle_addr_err_st(vcpu);
                break;
 
        case EXCCODE_ADEL:
                ++vcpu->stat.addrerr_ld_exits;
-               trace_kvm_exit(vcpu, ADDRERR_LD_EXITS);
                ret = kvm_mips_callbacks->handle_addr_err_ld(vcpu);
                break;
 
        case EXCCODE_SYS:
                ++vcpu->stat.syscall_exits;
-               trace_kvm_exit(vcpu, SYSCALL_EXITS);
                ret = kvm_mips_callbacks->handle_syscall(vcpu);
                break;
 
        case EXCCODE_RI:
                ++vcpu->stat.resvd_inst_exits;
-               trace_kvm_exit(vcpu, RESVD_INST_EXITS);
                ret = kvm_mips_callbacks->handle_res_inst(vcpu);
                break;
 
        case EXCCODE_BP:
                ++vcpu->stat.break_inst_exits;
-               trace_kvm_exit(vcpu, BREAK_INST_EXITS);
                ret = kvm_mips_callbacks->handle_break(vcpu);
                break;
 
        case EXCCODE_TR:
                ++vcpu->stat.trap_inst_exits;
-               trace_kvm_exit(vcpu, TRAP_INST_EXITS);
                ret = kvm_mips_callbacks->handle_trap(vcpu);
                break;
 
        case EXCCODE_MSAFPE:
                ++vcpu->stat.msa_fpe_exits;
-               trace_kvm_exit(vcpu, MSA_FPE_EXITS);
                ret = kvm_mips_callbacks->handle_msa_fpe(vcpu);
                break;
 
        case EXCCODE_FPE:
                ++vcpu->stat.fpe_exits;
-               trace_kvm_exit(vcpu, FPE_EXITS);
                ret = kvm_mips_callbacks->handle_fpe(vcpu);
                break;
 
        case EXCCODE_MSADIS:
                ++vcpu->stat.msa_disabled_exits;
-               trace_kvm_exit(vcpu, MSA_DISABLED_EXITS);
                ret = kvm_mips_callbacks->handle_msa_disabled(vcpu);
                break;
 
@@ -1400,11 +1544,13 @@ skip_emul:
                        run->exit_reason = KVM_EXIT_INTR;
                        ret = (-EINTR << 2) | RESUME_HOST;
                        ++vcpu->stat.signal_exits;
-                       trace_kvm_exit(vcpu, SIGNAL_EXITS);
+                       trace_kvm_exit(vcpu, KVM_TRACE_EXIT_SIGNAL);
                }
        }
 
        if (ret == RESUME_GUEST) {
+               trace_kvm_reenter(vcpu);
+
                /*
                 * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
                 * is live), restore FCR31 / MSACSR.
@@ -1450,7 +1596,7 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
         * not to clobber the status register directly via the commpage.
         */
        if (cpu_has_msa && sr & ST0_CU1 && !(sr & ST0_FR) &&
-           vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+           vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
                kvm_lose_fpu(vcpu);
 
        /*
@@ -1465,9 +1611,12 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
        enable_fpu_hazard();
 
        /* If guest FPU state not active, restore it now */
-       if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)) {
+       if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
                __kvm_restore_fpu(&vcpu->arch);
-               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+               vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU);
+       } else {
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_FPU);
        }
 
        preempt_enable();
@@ -1494,8 +1643,8 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
                 * interacts with MSA state, so play it safe and save it first.
                 */
                if (!(sr & ST0_FR) &&
-                   (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU |
-                               KVM_MIPS_FPU_MSA)) == KVM_MIPS_FPU_FPU)
+                   (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU |
+                               KVM_MIPS_AUX_MSA)) == KVM_MIPS_AUX_FPU)
                        kvm_lose_fpu(vcpu);
 
                change_c0_status(ST0_CU1 | ST0_FR, sr);
@@ -1509,22 +1658,26 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
        set_c0_config5(MIPS_CONF5_MSAEN);
        enable_fpu_hazard();
 
-       switch (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA)) {
-       case KVM_MIPS_FPU_FPU:
+       switch (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA)) {
+       case KVM_MIPS_AUX_FPU:
                /*
                 * Guest FPU state already loaded, only restore upper MSA state
                 */
                __kvm_restore_msa_upper(&vcpu->arch);
-               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+               vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_MSA);
                break;
        case 0:
                /* Neither FPU nor MSA already active, restore full MSA state */
                __kvm_restore_msa(&vcpu->arch);
-               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+               vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
                if (kvm_mips_guest_has_fpu(&vcpu->arch))
-                       vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+                       vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE,
+                             KVM_TRACE_AUX_FPU_MSA);
                break;
        default:
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_MSA);
                break;
        }
 
@@ -1536,13 +1689,15 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
 void kvm_drop_fpu(struct kvm_vcpu *vcpu)
 {
        preempt_disable();
-       if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+       if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
                disable_msa();
-               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_MSA;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_MSA);
+               vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA;
        }
-       if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+       if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
                clear_c0_status(ST0_CU1 | ST0_FR);
-               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_FPU);
+               vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
        }
        preempt_enable();
 }
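The fpu_inuse to aux_inuse renaming above (and in kvm_lose_fpu() just below) does not change the state machine, it only adds tracepoints. The states the code switches on, summarised (descriptive comment only):

	/*
	 *   aux_inuse == 0                    nothing live; kvm_own_msa()
	 *                                     restores the full MSA context
	 *   aux_inuse == KVM_MIPS_AUX_FPU     FPU live; kvm_own_msa() restores
	 *                                     only the upper MSA halves
	 *   KVM_MIPS_AUX_MSA set              already live; just re-enable
	 *
	 * kvm_lose_fpu() saves whichever context is live (saving MSA covers the
	 * FPU registers too) and clears the bits; kvm_drop_fpu() clears the
	 * bits without saving, discarding the guest state.
	 */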
@@ -1558,25 +1713,27 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
         */
 
        preempt_disable();
-       if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+       if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
                set_c0_config5(MIPS_CONF5_MSAEN);
                enable_fpu_hazard();
 
                __kvm_save_msa(&vcpu->arch);
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA);
 
                /* Disable MSA & FPU */
                disable_msa();
-               if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+               if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
                        clear_c0_status(ST0_CU1 | ST0_FR);
                        disable_fpu_hazard();
                }
-               vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA);
-       } else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+               vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA);
+       } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
                set_c0_status(ST0_CU1);
                enable_fpu_hazard();
 
                __kvm_save_fpu(&vcpu->arch);
-               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+               vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU);
 
                /* Disable FPU */
                clear_c0_status(ST0_CU1 | ST0_FR);
@@ -1638,6 +1795,10 @@ static int __init kvm_mips_init(void)
 {
        int ret;
 
+       ret = kvm_mips_entry_setup();
+       if (ret)
+               return ret;
+
        ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
 
        if (ret)
@@ -1645,18 +1806,6 @@ static int __init kvm_mips_init(void)
 
        register_die_notifier(&kvm_mips_csr_die_notifier);
 
-       /*
-        * On MIPS, kernel modules are executed from "mapped space", which
-        * requires TLBs. The TLB handling code is statically linked with
-        * the rest of the kernel (tlb.c) to avoid the possibility of
-        * double faulting. The issue is that the TLB code references
-        * routines that are part of the the KVM module, which are only
-        * available once the module is loaded.
-        */
-       kvm_mips_gfn_to_pfn = gfn_to_pfn;
-       kvm_mips_release_pfn_clean = kvm_release_pfn_clean;
-       kvm_mips_is_error_pfn = is_error_pfn;
-
        return 0;
 }
 
@@ -1664,10 +1813,6 @@ static void __exit kvm_mips_exit(void)
 {
        kvm_exit();
 
-       kvm_mips_gfn_to_pfn = NULL;
-       kvm_mips_release_pfn_clean = NULL;
-       kvm_mips_is_error_pfn = NULL;
-
        unregister_die_notifier(&kvm_mips_csr_die_notifier);
 }
 
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
new file mode 100644 (file)
index 0000000..57319ee
--- /dev/null
@@ -0,0 +1,375 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS MMU handling in the KVM module.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
+
+#include <linux/highmem.h>
+#include <linux/kvm_host.h>
+#include <asm/mmu_context.h>
+
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+{
+       int cpu = smp_processor_id();
+
+       return vcpu->arch.guest_kernel_asid[cpu] &
+                       cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+{
+       int cpu = smp_processor_id();
+
+       return vcpu->arch.guest_user_asid[cpu] &
+                       cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
+{
+       int srcu_idx, err = 0;
+       kvm_pfn_t pfn;
+
+       if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
+               return 0;
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       pfn = gfn_to_pfn(kvm, gfn);
+
+       if (is_error_pfn(pfn)) {
+               kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
+               err = -EFAULT;
+               goto out;
+       }
+
+       kvm->arch.guest_pmap[gfn] = pfn;
+out:
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+       return err;
+}
+
+/* Translate guest KSEG0 addresses to Host PA */
+unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
+                                                   unsigned long gva)
+{
+       gfn_t gfn;
+       unsigned long offset = gva & ~PAGE_MASK;
+       struct kvm *kvm = vcpu->kvm;
+
+       if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
+               kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
+                       __builtin_return_address(0), gva);
+               return KVM_INVALID_PAGE;
+       }
+
+       gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
+
+       if (gfn >= kvm->arch.guest_pmap_npages) {
+               kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
+                       gva);
+               return KVM_INVALID_PAGE;
+       }
+
+       if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+               return KVM_INVALID_ADDR;
+
+       return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
+}
+
+/* XXXKYMA: Must be called with interrupts disabled */
+int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
+                                   struct kvm_vcpu *vcpu)
+{
+       gfn_t gfn;
+       kvm_pfn_t pfn0, pfn1;
+       unsigned long vaddr = 0;
+       unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+       struct kvm *kvm = vcpu->kvm;
+       const int flush_dcache_mask = 0;
+       int ret;
+
+       if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
+               kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
+               kvm_mips_dump_host_tlbs();
+               return -1;
+       }
+
+       gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
+       if (gfn >= kvm->arch.guest_pmap_npages) {
+               kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
+                       gfn, badvaddr);
+               kvm_mips_dump_host_tlbs();
+               return -1;
+       }
+       vaddr = badvaddr & (PAGE_MASK << 1);
+
+       if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+               return -1;
+
+       if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
+               return -1;
+
+       pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
+       pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
+
+       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               ENTRYLO_D | ENTRYLO_V;
+       entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               ENTRYLO_D | ENTRYLO_V;
+
+       preempt_disable();
+       entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
+       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+                                     flush_dcache_mask);
+       preempt_enable();
+
+       return ret;
+}
+
+int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
+                                        struct kvm_mips_tlb *tlb)
+{
+       unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+       struct kvm *kvm = vcpu->kvm;
+       kvm_pfn_t pfn0, pfn1;
+       int ret;
+
+       if ((tlb->tlb_hi & VPN2_MASK) == 0) {
+               pfn0 = 0;
+               pfn1 = 0;
+       } else {
+               if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0])
+                                          >> PAGE_SHIFT) < 0)
+                       return -1;
+
+               if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1])
+                                          >> PAGE_SHIFT) < 0)
+                       return -1;
+
+               pfn0 = kvm->arch.guest_pmap[
+                       mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT];
+               pfn1 = kvm->arch.guest_pmap[
+                       mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT];
+       }
+
+       /* Get attributes from the Guest TLB */
+       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               (tlb->tlb_lo[0] & ENTRYLO_D) |
+               (tlb->tlb_lo[0] & ENTRYLO_V);
+       entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               (tlb->tlb_lo[1] & ENTRYLO_D) |
+               (tlb->tlb_lo[1] & ENTRYLO_V);
+
+       kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
+                 tlb->tlb_lo[0], tlb->tlb_lo[1]);
+
+       preempt_disable();
+       entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
+                                              kvm_mips_get_kernel_asid(vcpu) :
+                                              kvm_mips_get_user_asid(vcpu));
+       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+                                     tlb->tlb_mask);
+       preempt_enable();
+
+       return ret;
+}
+
+void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
+                            struct kvm_vcpu *vcpu)
+{
+       unsigned long asid = asid_cache(cpu);
+
+       asid += cpu_asid_inc();
+       if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
+               if (cpu_has_vtag_icache)
+                       flush_icache_all();
+
+               kvm_local_flush_tlb_all();      /* start new asid cycle */
+
+               if (!asid)      /* fix version if needed */
+                       asid = asid_first_version(cpu);
+       }
+
+       cpu_context(cpu, mm) = asid_cache(cpu) = asid;
+}
+
+/**
+ * kvm_mips_migrate_count() - Migrate timer.
+ * @vcpu:      Virtual CPU.
+ *
+ * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
+ * if it was running prior to being cancelled.
+ *
+ * Must be called when the VCPU is migrated to a different CPU to ensure that
+ * timer expiry during guest execution interrupts the guest and causes the
+ * interrupt to be delivered in a timely manner.
+ */
+static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
+{
+       if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
+               hrtimer_restart(&vcpu->arch.comparecount_timer);
+}
+
+/* Restore ASID once we are scheduled back after preemption */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
+       unsigned long flags;
+       int newasid = 0;
+
+       kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
+
+       /* Allocate new kernel and user ASIDs if needed */
+
+       local_irq_save(flags);
+
+       if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
+                                               asid_version_mask(cpu)) {
+               kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
+               vcpu->arch.guest_kernel_asid[cpu] =
+                   vcpu->arch.guest_kernel_mm.context.asid[cpu];
+               kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
+               vcpu->arch.guest_user_asid[cpu] =
+                   vcpu->arch.guest_user_mm.context.asid[cpu];
+               newasid++;
+
+               kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+                         cpu_context(cpu, current->mm));
+               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
+               kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
+                         vcpu->arch.guest_user_asid[cpu]);
+       }
+
+       if (vcpu->arch.last_sched_cpu != cpu) {
+               kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
+                         vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
+               /*
+                * Migrate the timer interrupt to the current CPU so that it
+                * always interrupts the guest and synchronously triggers a
+                * guest timer interrupt.
+                */
+               kvm_mips_migrate_count(vcpu);
+       }
+
+       if (!newasid) {
+               /*
+                * If we preempted while the guest was executing, then reload
+                * the pre-empted ASID
+                */
+               if (current->flags & PF_VCPU) {
+                       write_c0_entryhi(vcpu->arch.
+                                        preempt_entryhi & asid_mask);
+                       ehb();
+               }
+       } else {
+               /* New ASIDs were allocated for the VM */
+
+               /*
+                * Were we in guest context? If so, the pre-empted ASID is
+                * no longer valid; we need to set it to what it should be
+                * based on the mode of the Guest (Kernel/User)
+                */
+               if (current->flags & PF_VCPU) {
+                       if (KVM_GUEST_KERNEL_MODE(vcpu))
+                               write_c0_entryhi(vcpu->arch.
+                                                guest_kernel_asid[cpu] &
+                                                asid_mask);
+                       else
+                               write_c0_entryhi(vcpu->arch.
+                                                guest_user_asid[cpu] &
+                                                asid_mask);
+                       ehb();
+               }
+       }
+
+       /* restore guest state to registers */
+       kvm_mips_callbacks->vcpu_set_regs(vcpu);
+
+       local_irq_restore(flags);
+
+}
+
+/* ASID can change if another task is scheduled during preemption */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       unsigned long flags;
+       int cpu;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+
+       vcpu->arch.preempt_entryhi = read_c0_entryhi();
+       vcpu->arch.last_sched_cpu = cpu;
+
+       /* save guest state in registers */
+       kvm_mips_callbacks->vcpu_get_regs(vcpu);
+
+       if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+            asid_version_mask(cpu))) {
+               kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
+                         cpu_context(cpu, current->mm));
+               drop_mmu_context(current->mm, cpu);
+       }
+       write_c0_entryhi(cpu_asid(cpu, current->mm));
+       ehb();
+
+       local_irq_restore(flags);
+}
+
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       unsigned long paddr, flags, vpn2, asid;
+       unsigned long va = (unsigned long)opc;
+       void *vaddr;
+       u32 inst;
+       int index;
+
+       if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 ||
+           KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
+               local_irq_save(flags);
+               index = kvm_mips_host_tlb_lookup(vcpu, va);
+               if (index >= 0) {
+                       inst = *(opc);
+               } else {
+                       vpn2 = va & VPN2_MASK;
+                       asid = kvm_read_c0_guest_entryhi(cop0) &
+                                               KVM_ENTRYHI_ASID;
+                       index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
+                       if (index < 0) {
+                               kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
+                                       __func__, opc, vcpu, read_c0_entryhi());
+                               kvm_mips_dump_host_tlbs();
+                               kvm_mips_dump_guest_tlbs(vcpu);
+                               local_irq_restore(flags);
+                               return KVM_INVALID_INST;
+                       }
+                       kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
+                                                            &vcpu->arch.
+                                                            guest_tlb[index]);
+                       inst = *(opc);
+               }
+               local_irq_restore(flags);
+       } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
+               paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
+               vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+               vaddr += paddr & ~PAGE_MASK;
+               inst = *(u32 *)vaddr;
+               kunmap_atomic(vaddr);
+       } else {
+               kvm_err("%s: illegal address: %p\n", __func__, opc);
+               return KVM_INVALID_INST;
+       }
+
+       return inst;
+}
index 888bb67..53f851a 100644 (file)
 
 #include <linux/kvm_host.h>
 
-char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = {
-       "WAIT",
-       "CACHE",
-       "Signal",
-       "Interrupt",
-       "COP0/1 Unusable",
-       "TLB Mod",
-       "TLB Miss (LD)",
-       "TLB Miss (ST)",
-       "Address Err (ST)",
-       "Address Error (LD)",
-       "System Call",
-       "Reserved Inst",
-       "Break Inst",
-       "Trap Inst",
-       "MSA FPE",
-       "FPE",
-       "MSA Disabled",
-       "D-Cache Flushes",
-};
-
 char *kvm_cop0_str[N_MIPS_COPROC_REGS] = {
        "Index",
        "Random",
index ed021ae..254377d 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kvm_host.h>
 #include <linux/srcu.h>
 
@@ -24,6 +24,7 @@
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
+#include <asm/tlbdebug.h>
 
 #undef CONFIG_MIPS_MT
 #include <asm/r4kcache.h>
 #define KVM_GUEST_PC_TLB    0
 #define KVM_GUEST_SP_TLB    1
 
-#define PRIx64 "llx"
-
 atomic_t kvm_mips_instance;
 EXPORT_SYMBOL_GPL(kvm_mips_instance);
 
-/* These function pointers are initialized once the KVM module is loaded */
-kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-EXPORT_SYMBOL_GPL(kvm_mips_gfn_to_pfn);
-
-void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean);
-
-bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn);
-
-uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 {
        int cpu = smp_processor_id();
 
@@ -55,7 +44,7 @@ uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
                        cpu_asid_mask(&cpu_data[cpu]);
 }
 
-uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
 {
        int cpu = smp_processor_id();
 
@@ -63,7 +52,7 @@ uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
                        cpu_asid_mask(&cpu_data[cpu]);
 }
 
-inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
+inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
 {
        return vcpu->kvm->arch.commpage_tlb;
 }
@@ -72,50 +61,15 @@ inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
 
 void kvm_mips_dump_host_tlbs(void)
 {
-       unsigned long old_entryhi;
-       unsigned long old_pagemask;
-       struct kvm_mips_tlb tlb;
        unsigned long flags;
-       int i;
 
        local_irq_save(flags);
 
-       old_entryhi = read_c0_entryhi();
-       old_pagemask = read_c0_pagemask();
-
        kvm_info("HOST TLBs:\n");
-       kvm_info("ASID: %#lx\n", read_c0_entryhi() &
-                cpu_asid_mask(&current_cpu_data));
-
-       for (i = 0; i < current_cpu_data.tlbsize; i++) {
-               write_c0_index(i);
-               mtc0_tlbw_hazard();
-
-               tlb_read();
-               tlbw_use_hazard();
+       dump_tlb_regs();
+       pr_info("\n");
+       dump_tlb_all();
 
-               tlb.tlb_hi = read_c0_entryhi();
-               tlb.tlb_lo0 = read_c0_entrylo0();
-               tlb.tlb_lo1 = read_c0_entrylo1();
-               tlb.tlb_mask = read_c0_pagemask();
-
-               kvm_info("TLB%c%3d Hi 0x%08lx ",
-                        (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
-                        i, tlb.tlb_hi);
-               kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
-                        (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
-                        (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
-                        (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
-                        (tlb.tlb_lo0 >> 3) & 7);
-               kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
-                        (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
-                        (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
-                        (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
-                        (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
-       }
-       write_c0_entryhi(old_entryhi);
-       write_c0_pagemask(old_pagemask);
-       mtc0_tlbw_hazard();
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_host_tlbs);
@@ -132,74 +86,24 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
        for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) {
                tlb = vcpu->arch.guest_tlb[i];
                kvm_info("TLB%c%3d Hi 0x%08lx ",
-                        (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
+                        (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & ENTRYLO_V
+                                                       ? ' ' : '*',
                         i, tlb.tlb_hi);
-               kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
-                        (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
-                        (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
-                        (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
-                        (tlb.tlb_lo0 >> 3) & 7);
-               kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
-                        (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
-                        (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
-                        (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
-                        (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
+               kvm_info("Lo0=0x%09llx %c%c attr %lx ",
+                        (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]),
+                        (tlb.tlb_lo[0] & ENTRYLO_D) ? 'D' : ' ',
+                        (tlb.tlb_lo[0] & ENTRYLO_G) ? 'G' : ' ',
+                        (tlb.tlb_lo[0] & ENTRYLO_C) >> ENTRYLO_C_SHIFT);
+               kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
+                        (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]),
+                        (tlb.tlb_lo[1] & ENTRYLO_D) ? 'D' : ' ',
+                        (tlb.tlb_lo[1] & ENTRYLO_G) ? 'G' : ' ',
+                        (tlb.tlb_lo[1] & ENTRYLO_C) >> ENTRYLO_C_SHIFT,
+                        tlb.tlb_mask);
        }
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
 
-static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
-{
-       int srcu_idx, err = 0;
-       kvm_pfn_t pfn;
-
-       if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
-               return 0;
-
-       srcu_idx = srcu_read_lock(&kvm->srcu);
-       pfn = kvm_mips_gfn_to_pfn(kvm, gfn);
-
-       if (kvm_mips_is_error_pfn(pfn)) {
-               kvm_err("Couldn't get pfn for gfn %#" PRIx64 "!\n", gfn);
-               err = -EFAULT;
-               goto out;
-       }
-
-       kvm->arch.guest_pmap[gfn] = pfn;
-out:
-       srcu_read_unlock(&kvm->srcu, srcu_idx);
-       return err;
-}
-
-/* Translate guest KSEG0 addresses to Host PA */
-unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
-                                                   unsigned long gva)
-{
-       gfn_t gfn;
-       uint32_t offset = gva & ~PAGE_MASK;
-       struct kvm *kvm = vcpu->kvm;
-
-       if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
-               kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
-                       __builtin_return_address(0), gva);
-               return KVM_INVALID_PAGE;
-       }
-
-       gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
-
-       if (gfn >= kvm->arch.guest_pmap_npages) {
-               kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
-                       gva);
-               return KVM_INVALID_PAGE;
-       }
-
-       if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
-               return KVM_INVALID_ADDR;
-
-       return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_translate_guest_kseg0_to_hpa);
-
 /* XXXKYMA: Must be called with interrupts disabled */
 /* set flush_dcache_mask == 0 if no dcache flush required */
 int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
@@ -243,12 +147,12 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
 
        /* Flush D-cache */
        if (flush_dcache_mask) {
-               if (entrylo0 & MIPS3_PG_V) {
+               if (entrylo0 & ENTRYLO_V) {
                        ++vcpu->stat.flush_dcache_exits;
                        flush_data_cache_page((entryhi & VPN2_MASK) &
                                              ~flush_dcache_mask);
                }
-               if (entrylo1 & MIPS3_PG_V) {
+               if (entrylo1 & ENTRYLO_V) {
                        ++vcpu->stat.flush_dcache_exits;
                        flush_data_cache_page(((entryhi & VPN2_MASK) &
                                               ~flush_dcache_mask) |
@@ -259,96 +163,35 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
        /* Restore old ASID */
        write_c0_entryhi(old_entryhi);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
        local_irq_restore(flags);
        return 0;
 }
-
-/* XXXKYMA: Must be called with interrupts disabled */
-int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
-                                   struct kvm_vcpu *vcpu)
-{
-       gfn_t gfn;
-       kvm_pfn_t pfn0, pfn1;
-       unsigned long vaddr = 0;
-       unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
-       int even;
-       struct kvm *kvm = vcpu->kvm;
-       const int flush_dcache_mask = 0;
-       int ret;
-
-       if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
-               kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
-               kvm_mips_dump_host_tlbs();
-               return -1;
-       }
-
-       gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
-       if (gfn >= kvm->arch.guest_pmap_npages) {
-               kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
-                       gfn, badvaddr);
-               kvm_mips_dump_host_tlbs();
-               return -1;
-       }
-       even = !(gfn & 0x1);
-       vaddr = badvaddr & (PAGE_MASK << 1);
-
-       if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
-               return -1;
-
-       if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
-               return -1;
-
-       if (even) {
-               pfn0 = kvm->arch.guest_pmap[gfn];
-               pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1];
-       } else {
-               pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1];
-               pfn1 = kvm->arch.guest_pmap[gfn];
-       }
-
-       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-                  (1 << 2) | (0x1 << 1);
-       entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-                  (1 << 2) | (0x1 << 1);
-
-       preempt_disable();
-       entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
-       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-                                     flush_dcache_mask);
-       preempt_enable();
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault);
+EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
 
 int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
        struct kvm_vcpu *vcpu)
 {
-       kvm_pfn_t pfn0, pfn1;
+       kvm_pfn_t pfn;
        unsigned long flags, old_entryhi = 0, vaddr = 0;
-       unsigned long entrylo0 = 0, entrylo1 = 0;
+       unsigned long entrylo[2] = { 0, 0 };
+       unsigned int pair_idx;
 
-       pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
-       pfn1 = 0;
-       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-                  (1 << 2) | (0x1 << 1);
-       entrylo1 = 0;
+       pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
+       pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
+       entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               ENTRYLO_D | ENTRYLO_V;
 
        local_irq_save(flags);
 
        old_entryhi = read_c0_entryhi();
        vaddr = badvaddr & (PAGE_MASK << 1);
        write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
-       mtc0_tlbw_hazard();
-       write_c0_entrylo0(entrylo0);
-       mtc0_tlbw_hazard();
-       write_c0_entrylo1(entrylo1);
-       mtc0_tlbw_hazard();
+       write_c0_entrylo0(entrylo[0]);
+       write_c0_entrylo1(entrylo[1]);
        write_c0_index(kvm_mips_get_commpage_asid(vcpu));
        mtc0_tlbw_hazard();
        tlb_write_indexed();
-       mtc0_tlbw_hazard();
        tlbw_use_hazard();
 
        kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
@@ -358,68 +201,12 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
        /* Restore old ASID */
        write_c0_entryhi(old_entryhi);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
        local_irq_restore(flags);
 
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault);
 
-int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-                                        struct kvm_mips_tlb *tlb,
-                                        unsigned long *hpa0,
-                                        unsigned long *hpa1)
-{
-       unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
-       struct kvm *kvm = vcpu->kvm;
-       kvm_pfn_t pfn0, pfn1;
-       int ret;
-
-       if ((tlb->tlb_hi & VPN2_MASK) == 0) {
-               pfn0 = 0;
-               pfn1 = 0;
-       } else {
-               if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
-                                          >> PAGE_SHIFT) < 0)
-                       return -1;
-
-               if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
-                                          >> PAGE_SHIFT) < 0)
-                       return -1;
-
-               pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
-                                           >> PAGE_SHIFT];
-               pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
-                                           >> PAGE_SHIFT];
-       }
-
-       if (hpa0)
-               *hpa0 = pfn0 << PAGE_SHIFT;
-
-       if (hpa1)
-               *hpa1 = pfn1 << PAGE_SHIFT;
-
-       /* Get attributes from the Guest TLB */
-       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-                  (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
-       entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-                  (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
-
-       kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
-                 tlb->tlb_lo0, tlb->tlb_lo1);
-
-       preempt_disable();
-       entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
-                                              kvm_mips_get_kernel_asid(vcpu) :
-                                              kvm_mips_get_user_asid(vcpu));
-       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-                                     tlb->tlb_mask);
-       preempt_enable();
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault);
-
 int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
 {
        int i;
@@ -435,7 +222,7 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
        }
 
        kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n",
-                 __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1);
+                 __func__, entryhi, index, tlb[i].tlb_lo[0], tlb[i].tlb_lo[1]);
 
        return index;
 }
@@ -467,7 +254,6 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr)
        /* Restore old ASID */
        write_c0_entryhi(old_entryhi);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
 
        local_irq_restore(flags);
 
@@ -498,21 +284,16 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
 
        if (idx > 0) {
                write_c0_entryhi(UNIQUE_ENTRYHI(idx));
-               mtc0_tlbw_hazard();
-
                write_c0_entrylo0(0);
-               mtc0_tlbw_hazard();
-
                write_c0_entrylo1(0);
                mtc0_tlbw_hazard();
 
                tlb_write_indexed();
-               mtc0_tlbw_hazard();
+               tlbw_use_hazard();
        }
 
        write_c0_entryhi(old_entryhi);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
 
        local_irq_restore(flags);
 
@@ -540,61 +321,39 @@ void kvm_mips_flush_host_tlb(int skip_kseg0)
        /* Blast 'em all away. */
        for (entry = 0; entry < maxentry; entry++) {
                write_c0_index(entry);
-               mtc0_tlbw_hazard();
 
                if (skip_kseg0) {
+                       mtc0_tlbr_hazard();
                        tlb_read();
-                       tlbw_use_hazard();
+                       tlb_read_hazard();
 
                        entryhi = read_c0_entryhi();
 
                        /* Don't blow away guest kernel entries */
                        if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0)
                                continue;
+
+                       write_c0_pagemask(old_pagemask);
                }
 
                /* Make sure all entries differ. */
                write_c0_entryhi(UNIQUE_ENTRYHI(entry));
-               mtc0_tlbw_hazard();
                write_c0_entrylo0(0);
-               mtc0_tlbw_hazard();
                write_c0_entrylo1(0);
                mtc0_tlbw_hazard();
 
                tlb_write_indexed();
-               mtc0_tlbw_hazard();
+               tlbw_use_hazard();
        }
 
-       tlbw_use_hazard();
-
        write_c0_entryhi(old_entryhi);
        write_c0_pagemask(old_pagemask);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
 
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb);
 
-void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
-                            struct kvm_vcpu *vcpu)
-{
-       unsigned long asid = asid_cache(cpu);
-
-       asid += cpu_asid_inc();
-       if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
-               if (cpu_has_vtag_icache)
-                       flush_icache_all();
-
-               kvm_local_flush_tlb_all();      /* start new asid cycle */
-
-               if (!asid)      /* fix version if needed */
-                       asid = asid_first_version(cpu);
-       }
-
-       cpu_context(cpu, mm) = asid_cache(cpu) = asid;
-}
-
 void kvm_local_flush_tlb_all(void)
 {
        unsigned long flags;
@@ -614,185 +373,12 @@ void kvm_local_flush_tlb_all(void)
                write_c0_index(entry);
                mtc0_tlbw_hazard();
                tlb_write_indexed();
+               tlbw_use_hazard();
                entry++;
        }
-       tlbw_use_hazard();
        write_c0_entryhi(old_ctx);
        mtc0_tlbw_hazard();
 
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all);
-
-/**
- * kvm_mips_migrate_count() - Migrate timer.
- * @vcpu:      Virtual CPU.
- *
- * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
- * if it was running prior to being cancelled.
- *
- * Must be called when the VCPU is migrated to a different CPU to ensure that
- * timer expiry during guest execution interrupts the guest and causes the
- * interrupt to be delivered in a timely manner.
- */
-static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
-{
-       if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
-               hrtimer_restart(&vcpu->arch.comparecount_timer);
-}
-
-/* Restore ASID once we are scheduled back after preemption */
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
-       unsigned long flags;
-       int newasid = 0;
-
-       kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
-
-       /* Allocate new kernel and user ASIDs if needed */
-
-       local_irq_save(flags);
-
-       if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
-                                               asid_version_mask(cpu)) {
-               kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
-               vcpu->arch.guest_kernel_asid[cpu] =
-                   vcpu->arch.guest_kernel_mm.context.asid[cpu];
-               kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
-               vcpu->arch.guest_user_asid[cpu] =
-                   vcpu->arch.guest_user_mm.context.asid[cpu];
-               newasid++;
-
-               kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
-                         cpu_context(cpu, current->mm));
-               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
-                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
-               kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
-                         vcpu->arch.guest_user_asid[cpu]);
-       }
-
-       if (vcpu->arch.last_sched_cpu != cpu) {
-               kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
-                         vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
-               /*
-                * Migrate the timer interrupt to the current CPU so that it
-                * always interrupts the guest and synchronously triggers a
-                * guest timer interrupt.
-                */
-               kvm_mips_migrate_count(vcpu);
-       }
-
-       if (!newasid) {
-               /*
-                * If we preempted while the guest was executing, then reload
-                * the pre-empted ASID
-                */
-               if (current->flags & PF_VCPU) {
-                       write_c0_entryhi(vcpu->arch.
-                                        preempt_entryhi & asid_mask);
-                       ehb();
-               }
-       } else {
-               /* New ASIDs were allocated for the VM */
-
-               /*
-                * Were we in guest context? If so then the pre-empted ASID is
-                * no longer valid, we need to set it to what it should be based
-                * on the mode of the Guest (Kernel/User)
-                */
-               if (current->flags & PF_VCPU) {
-                       if (KVM_GUEST_KERNEL_MODE(vcpu))
-                               write_c0_entryhi(vcpu->arch.
-                                                guest_kernel_asid[cpu] &
-                                                asid_mask);
-                       else
-                               write_c0_entryhi(vcpu->arch.
-                                                guest_user_asid[cpu] &
-                                                asid_mask);
-                       ehb();
-               }
-       }
-
-       /* restore guest state to registers */
-       kvm_mips_callbacks->vcpu_set_regs(vcpu);
-
-       local_irq_restore(flags);
-
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load);
-
-/* ASID can change if another task is scheduled during preemption */
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       unsigned long flags;
-       uint32_t cpu;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-
-       vcpu->arch.preempt_entryhi = read_c0_entryhi();
-       vcpu->arch.last_sched_cpu = cpu;
-
-       /* save guest state in registers */
-       kvm_mips_callbacks->vcpu_get_regs(vcpu);
-
-       if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
-            asid_version_mask(cpu))) {
-               kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
-                         cpu_context(cpu, current->mm));
-               drop_mmu_context(current->mm, cpu);
-       }
-       write_c0_entryhi(cpu_asid(cpu, current->mm));
-       ehb();
-
-       local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put);
-
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu)
-{
-       struct mips_coproc *cop0 = vcpu->arch.cop0;
-       unsigned long paddr, flags, vpn2, asid;
-       uint32_t inst;
-       int index;
-
-       if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 ||
-           KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-               local_irq_save(flags);
-               index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc);
-               if (index >= 0) {
-                       inst = *(opc);
-               } else {
-                       vpn2 = (unsigned long) opc & VPN2_MASK;
-                       asid = kvm_read_c0_guest_entryhi(cop0) &
-                                               KVM_ENTRYHI_ASID;
-                       index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
-                       if (index < 0) {
-                               kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
-                                       __func__, opc, vcpu, read_c0_entryhi());
-                               kvm_mips_dump_host_tlbs();
-                               local_irq_restore(flags);
-                               return KVM_INVALID_INST;
-                       }
-                       kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
-                                                            &vcpu->arch.
-                                                            guest_tlb[index],
-                                                            NULL, NULL);
-                       inst = *(opc);
-               }
-               local_irq_restore(flags);
-       } else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-               paddr =
-                   kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
-                                                         (unsigned long) opc);
-               inst = *(uint32_t *) CKSEG0ADDR(paddr);
-       } else {
-               kvm_err("%s: illegal address: %p\n", __func__, opc);
-               return KVM_INVALID_INST;
-       }
-
-       return inst;
-}
-EXPORT_SYMBOL_GPL(kvm_get_inst);
index bd6437f..c858cf1 100644 (file)
 #define TRACE_INCLUDE_PATH .
 #define TRACE_INCLUDE_FILE trace
 
-/* Tracepoints for VM exits */
-extern char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES];
+/*
+ * Tracepoints for VM enters
+ */
+DECLARE_EVENT_CLASS(kvm_transition,
+       TP_PROTO(struct kvm_vcpu *vcpu),
+       TP_ARGS(vcpu),
+       TP_STRUCT__entry(
+               __field(unsigned long, pc)
+       ),
+
+       TP_fast_assign(
+               __entry->pc = vcpu->arch.pc;
+       ),
+
+       TP_printk("PC: 0x%08lx",
+                 __entry->pc)
+);
+
+DEFINE_EVENT(kvm_transition, kvm_enter,
+            TP_PROTO(struct kvm_vcpu *vcpu),
+            TP_ARGS(vcpu));
+
+DEFINE_EVENT(kvm_transition, kvm_reenter,
+            TP_PROTO(struct kvm_vcpu *vcpu),
+            TP_ARGS(vcpu));
+
+DEFINE_EVENT(kvm_transition, kvm_out,
+            TP_PROTO(struct kvm_vcpu *vcpu),
+            TP_ARGS(vcpu));
+
+/* The first 32 exit reasons correspond to Cause.ExcCode */
+#define KVM_TRACE_EXIT_INT              0
+#define KVM_TRACE_EXIT_TLBMOD           1
+#define KVM_TRACE_EXIT_TLBMISS_LD       2
+#define KVM_TRACE_EXIT_TLBMISS_ST       3
+#define KVM_TRACE_EXIT_ADDRERR_LD       4
+#define KVM_TRACE_EXIT_ADDRERR_ST       5
+#define KVM_TRACE_EXIT_SYSCALL          8
+#define KVM_TRACE_EXIT_BREAK_INST       9
+#define KVM_TRACE_EXIT_RESVD_INST      10
+#define KVM_TRACE_EXIT_COP_UNUSABLE    11
+#define KVM_TRACE_EXIT_TRAP_INST       13
+#define KVM_TRACE_EXIT_MSA_FPE         14
+#define KVM_TRACE_EXIT_FPE             15
+#define KVM_TRACE_EXIT_MSA_DISABLED    21
+/* Further exit reasons */
+#define KVM_TRACE_EXIT_WAIT            32
+#define KVM_TRACE_EXIT_CACHE           33
+#define KVM_TRACE_EXIT_SIGNAL          34
+
+/* Tracepoints for VM exits */
+#define kvm_trace_symbol_exit_types                            \
+       { KVM_TRACE_EXIT_INT,           "Interrupt" },          \
+       { KVM_TRACE_EXIT_TLBMOD,        "TLB Mod" },            \
+       { KVM_TRACE_EXIT_TLBMISS_LD,    "TLB Miss (LD)" },      \
+       { KVM_TRACE_EXIT_TLBMISS_ST,    "TLB Miss (ST)" },      \
+       { KVM_TRACE_EXIT_ADDRERR_LD,    "Address Error (LD)" }, \
+       { KVM_TRACE_EXIT_ADDRERR_ST,    "Address Err (ST)" },   \
+       { KVM_TRACE_EXIT_SYSCALL,       "System Call" },        \
+       { KVM_TRACE_EXIT_BREAK_INST,    "Break Inst" },         \
+       { KVM_TRACE_EXIT_RESVD_INST,    "Reserved Inst" },      \
+       { KVM_TRACE_EXIT_COP_UNUSABLE,  "COP0/1 Unusable" },    \
+       { KVM_TRACE_EXIT_TRAP_INST,     "Trap Inst" },          \
+       { KVM_TRACE_EXIT_MSA_FPE,       "MSA FPE" },            \
+       { KVM_TRACE_EXIT_FPE,           "FPE" },                \
+       { KVM_TRACE_EXIT_MSA_DISABLED,  "MSA Disabled" },       \
+       { KVM_TRACE_EXIT_WAIT,          "WAIT" },               \
+       { KVM_TRACE_EXIT_CACHE,         "CACHE" },              \
+       { KVM_TRACE_EXIT_SIGNAL,        "Signal" }
 
 TRACE_EVENT(kvm_exit,
            TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -34,10 +101,173 @@ TRACE_EVENT(kvm_exit,
            ),
 
            TP_printk("[%s]PC: 0x%08lx",
-                     kvm_mips_exit_types_str[__entry->reason],
+                     __print_symbolic(__entry->reason,
+                                      kvm_trace_symbol_exit_types),
                      __entry->pc)
 );
 
+#define KVM_TRACE_MFC0         0
+#define KVM_TRACE_MTC0         1
+#define KVM_TRACE_DMFC0                2
+#define KVM_TRACE_DMTC0                3
+#define KVM_TRACE_RDHWR                4
+
+#define KVM_TRACE_HWR_COP0     0
+#define KVM_TRACE_HWR_HWR      1
+
+#define KVM_TRACE_COP0(REG, SEL)       ((KVM_TRACE_HWR_COP0 << 8) |    \
+                                        ((REG) << 3) | (SEL))
+#define KVM_TRACE_HWR(REG, SEL)                ((KVM_TRACE_HWR_HWR  << 8) |    \
+                                        ((REG) << 3) | (SEL))
+
+#define kvm_trace_symbol_hwr_ops                               \
+       { KVM_TRACE_MFC0,               "MFC0" },               \
+       { KVM_TRACE_MTC0,               "MTC0" },               \
+       { KVM_TRACE_DMFC0,              "DMFC0" },              \
+       { KVM_TRACE_DMTC0,              "DMTC0" },              \
+       { KVM_TRACE_RDHWR,              "RDHWR" }
+
+#define kvm_trace_symbol_hwr_cop                               \
+       { KVM_TRACE_HWR_COP0,           "COP0" },               \
+       { KVM_TRACE_HWR_HWR,            "HWR" }
+
+#define kvm_trace_symbol_hwr_regs                              \
+       { KVM_TRACE_COP0( 0, 0),        "Index" },              \
+       { KVM_TRACE_COP0( 2, 0),        "EntryLo0" },           \
+       { KVM_TRACE_COP0( 3, 0),        "EntryLo1" },           \
+       { KVM_TRACE_COP0( 4, 0),        "Context" },            \
+       { KVM_TRACE_COP0( 4, 2),        "UserLocal" },          \
+       { KVM_TRACE_COP0( 5, 0),        "PageMask" },           \
+       { KVM_TRACE_COP0( 6, 0),        "Wired" },              \
+       { KVM_TRACE_COP0( 7, 0),        "HWREna" },             \
+       { KVM_TRACE_COP0( 8, 0),        "BadVAddr" },           \
+       { KVM_TRACE_COP0( 9, 0),        "Count" },              \
+       { KVM_TRACE_COP0(10, 0),        "EntryHi" },            \
+       { KVM_TRACE_COP0(11, 0),        "Compare" },            \
+       { KVM_TRACE_COP0(12, 0),        "Status" },             \
+       { KVM_TRACE_COP0(12, 1),        "IntCtl" },             \
+       { KVM_TRACE_COP0(12, 2),        "SRSCtl" },             \
+       { KVM_TRACE_COP0(13, 0),        "Cause" },              \
+       { KVM_TRACE_COP0(14, 0),        "EPC" },                \
+       { KVM_TRACE_COP0(15, 0),        "PRId" },               \
+       { KVM_TRACE_COP0(15, 1),        "EBase" },              \
+       { KVM_TRACE_COP0(16, 0),        "Config" },             \
+       { KVM_TRACE_COP0(16, 1),        "Config1" },            \
+       { KVM_TRACE_COP0(16, 2),        "Config2" },            \
+       { KVM_TRACE_COP0(16, 3),        "Config3" },            \
+       { KVM_TRACE_COP0(16, 4),        "Config4" },            \
+       { KVM_TRACE_COP0(16, 5),        "Config5" },            \
+       { KVM_TRACE_COP0(16, 7),        "Config7" },            \
+       { KVM_TRACE_COP0(26, 0),        "ECC" },                \
+       { KVM_TRACE_COP0(30, 0),        "ErrorEPC" },           \
+       { KVM_TRACE_COP0(31, 2),        "KScratch1" },          \
+       { KVM_TRACE_COP0(31, 3),        "KScratch2" },          \
+       { KVM_TRACE_COP0(31, 4),        "KScratch3" },          \
+       { KVM_TRACE_COP0(31, 5),        "KScratch4" },          \
+       { KVM_TRACE_COP0(31, 6),        "KScratch5" },          \
+       { KVM_TRACE_COP0(31, 7),        "KScratch6" },          \
+       { KVM_TRACE_HWR( 0, 0),         "CPUNum" },             \
+       { KVM_TRACE_HWR( 1, 0),         "SYNCI_Step" },         \
+       { KVM_TRACE_HWR( 2, 0),         "CC" },                 \
+       { KVM_TRACE_HWR( 3, 0),         "CCRes" },              \
+       { KVM_TRACE_HWR(29, 0),         "ULR" }
+
+TRACE_EVENT(kvm_hwr,
+           TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, unsigned int reg,
+                    unsigned long val),
+           TP_ARGS(vcpu, op, reg, val),
+           TP_STRUCT__entry(
+                       __field(unsigned long, val)
+                       __field(u16, reg)
+                       __field(u8, op)
+           ),
+
+           TP_fast_assign(
+                       __entry->val = val;
+                       __entry->reg = reg;
+                       __entry->op = op;
+           ),
+
+           TP_printk("%s %s (%s:%u:%u) 0x%08lx",
+                     __print_symbolic(__entry->op,
+                                      kvm_trace_symbol_hwr_ops),
+                     __print_symbolic(__entry->reg,
+                                      kvm_trace_symbol_hwr_regs),
+                     __print_symbolic(__entry->reg >> 8,
+                                      kvm_trace_symbol_hwr_cop),
+                     (__entry->reg >> 3) & 0x1f,
+                     __entry->reg & 0x7,
+                     __entry->val)
+);
+
+#define KVM_TRACE_AUX_RESTORE          0
+#define KVM_TRACE_AUX_SAVE             1
+#define KVM_TRACE_AUX_ENABLE           2
+#define KVM_TRACE_AUX_DISABLE          3
+#define KVM_TRACE_AUX_DISCARD          4
+
+#define KVM_TRACE_AUX_FPU              1
+#define KVM_TRACE_AUX_MSA              2
+#define KVM_TRACE_AUX_FPU_MSA          3
+
+#define kvm_trace_symbol_aux_op                \
+       { KVM_TRACE_AUX_RESTORE, "restore" },   \
+       { KVM_TRACE_AUX_SAVE,    "save" },      \
+       { KVM_TRACE_AUX_ENABLE,  "enable" },    \
+       { KVM_TRACE_AUX_DISABLE, "disable" },   \
+       { KVM_TRACE_AUX_DISCARD, "discard" }
+
+#define kvm_trace_symbol_aux_state             \
+       { KVM_TRACE_AUX_FPU,     "FPU" },       \
+       { KVM_TRACE_AUX_MSA,     "MSA" },       \
+       { KVM_TRACE_AUX_FPU_MSA, "FPU & MSA" }
+
+TRACE_EVENT(kvm_aux,
+           TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op,
+                    unsigned int state),
+           TP_ARGS(vcpu, op, state),
+           TP_STRUCT__entry(
+                       __field(unsigned long, pc)
+                       __field(u8, op)
+                       __field(u8, state)
+           ),
+
+           TP_fast_assign(
+                       __entry->pc = vcpu->arch.pc;
+                       __entry->op = op;
+                       __entry->state = state;
+           ),
+
+           TP_printk("%s %s PC: 0x%08lx",
+                     __print_symbolic(__entry->op,
+                                      kvm_trace_symbol_aux_op),
+                     __print_symbolic(__entry->state,
+                                      kvm_trace_symbol_aux_state),
+                     __entry->pc)
+);
+
+TRACE_EVENT(kvm_asid_change,
+           TP_PROTO(struct kvm_vcpu *vcpu, unsigned int old_asid,
+                    unsigned int new_asid),
+           TP_ARGS(vcpu, old_asid, new_asid),
+           TP_STRUCT__entry(
+                       __field(unsigned long, pc)
+                       __field(u8, old_asid)
+                       __field(u8, new_asid)
+           ),
+
+           TP_fast_assign(
+                       __entry->pc = vcpu->arch.pc;
+                       __entry->old_asid = old_asid;
+                       __entry->new_asid = new_asid;
+           ),
+
+           TP_printk("PC: 0x%08lx old: 0x%02x new: 0x%02x",
+                     __entry->pc,
+                     __entry->old_asid,
+                     __entry->new_asid)
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
index 6ba0faf..0915539 100644 (file)
@@ -21,7 +21,7 @@
 static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
 {
        gpa_t gpa;
-       uint32_t kseg = KSEGX(gva);
+       gva_t kseg = KSEGX(gva);
 
        if ((kseg == CKSEG0) || (kseg == CKSEG1))
                gpa = CPHYSADDR(gva);
@@ -40,8 +40,8 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -87,15 +87,15 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
        if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
            || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-               kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
                          cause, opc, badvaddr);
                er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
 
@@ -111,14 +111,14 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
                 * when we are not using HIGHMEM. Need to address this in a
                 * HIGHMEM kernel
                 */
-               kvm_err("TLB MOD fault not handled, cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n",
                        cause, opc, badvaddr);
                kvm_mips_dump_host_tlbs();
                kvm_arch_vcpu_dump_regs(vcpu);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                ret = RESUME_HOST;
        } else {
-               kvm_err("Illegal TLB Mod fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
                        cause, opc, badvaddr);
                kvm_mips_dump_host_tlbs();
                kvm_arch_vcpu_dump_regs(vcpu);
@@ -128,59 +128,12 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
        return ret;
 }
 
-static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
-{
-       struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
-       enum emulation_result er = EMULATE_DONE;
-       int ret = RESUME_GUEST;
-
-       if (((badvaddr & PAGE_MASK) == KVM_GUEST_COMMPAGE_ADDR)
-           && KVM_GUEST_KERNEL_MODE(vcpu)) {
-               if (kvm_mips_handle_commpage_tlb_fault(badvaddr, vcpu) < 0) {
-                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       ret = RESUME_HOST;
-               }
-       } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
-                  || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-               kvm_debug("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
-                         cause, opc, badvaddr);
-               er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
-               if (er == EMULATE_DONE)
-                       ret = RESUME_GUEST;
-               else {
-                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       ret = RESUME_HOST;
-               }
-       } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
-               /*
-                * All KSEG0 faults are handled by KVM, as the guest kernel does
-                * not expect to ever get them
-                */
-               if (kvm_mips_handle_kseg0_tlb_fault
-                   (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
-                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       ret = RESUME_HOST;
-               }
-       } else {
-               kvm_err("Illegal TLB LD fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
-                       cause, opc, badvaddr);
-               kvm_mips_dump_host_tlbs();
-               kvm_arch_vcpu_dump_regs(vcpu);
-               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               ret = RESUME_HOST;
-       }
-       return ret;
-}
-
-static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -192,8 +145,8 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
                }
        } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
                   || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-               kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n",
-                         vcpu->arch.pc, badvaddr);
+               kvm_debug("USER ADDR TLB %s fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
+                         store ? "ST" : "LD", cause, opc, badvaddr);
 
                /*
                 * User Address (UA) fault, this could happen if
@@ -213,14 +166,18 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
                        ret = RESUME_HOST;
                }
        } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+               /*
+                * All KSEG0 faults are handled by KVM, as the guest kernel does
+                * not expect to ever get them
+                */
                if (kvm_mips_handle_kseg0_tlb_fault
                    (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
                        run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                        ret = RESUME_HOST;
                }
        } else {
-               kvm_err("Illegal TLB ST fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
-                       cause, opc, badvaddr);
+               kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
+                       store ? "ST" : "LD", cause, opc, badvaddr);
                kvm_mips_dump_host_tlbs();
                kvm_arch_vcpu_dump_regs(vcpu);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -229,12 +186,22 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
        return ret;
 }
 
+static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+{
+       return kvm_trap_emul_handle_tlb_miss(vcpu, true);
+}
+
+static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+{
+       return kvm_trap_emul_handle_tlb_miss(vcpu, false);
+}
+
 static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -251,7 +218,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
                        ret = RESUME_HOST;
                }
        } else {
-               kvm_err("Address Error (STORE): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n",
                        cause, opc, badvaddr);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                ret = RESUME_HOST;
@@ -262,9 +229,9 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -280,7 +247,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
                        ret = RESUME_HOST;
                }
        } else {
-               kvm_err("Address Error (LOAD): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n",
                        cause, opc, badvaddr);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                ret = RESUME_HOST;
@@ -292,8 +259,8 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -310,8 +277,8 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -328,8 +295,8 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -346,8 +313,8 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -364,8 +331,8 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -382,8 +349,8 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -407,8 +374,8 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -451,24 +418,41 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm)
 
 static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
 {
+       vcpu->arch.kscratch_enabled = 0xfc;
+
        return 0;
 }
 
 static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t config1;
+       u32 config, config1;
        int vcpu_id = vcpu->vcpu_id;
 
        /*
         * Arch specific stuff, set up config registers properly so that the
-        * guest will come up as expected, for now we simulate a MIPS 24kc
+        * guest will come up as expected
         */
+#ifndef CONFIG_CPU_MIPSR6
+       /* r2-r5, simulate a MIPS 24kc */
        kvm_write_c0_guest_prid(cop0, 0x00019300);
-       /* Have config1, Cacheable, noncoherent, write-back, write allocate */
-       kvm_write_c0_guest_config(cop0, MIPS_CONF_M | (0x3 << CP0C0_K0) |
-                                 (0x1 << CP0C0_AR) |
-                                 (MMU_TYPE_R4000 << CP0C0_MT));
+#else
+       /* r6+, simulate a generic QEMU machine */
+       kvm_write_c0_guest_prid(cop0, 0x00010000);
+#endif
+       /*
+        * Have config1, Cacheable, noncoherent, write-back, write allocate.
+        * Endianness, arch revision & virtually tagged icache should match
+        * host.
+        */
+       config = read_c0_config() & MIPS_CONF_AR;
+       config |= MIPS_CONF_M | CONF_CM_CACHABLE_NONCOHERENT | MIPS_CONF_MT_TLB;
+#ifdef CONFIG_CPU_BIG_ENDIAN
+       config |= CONF_BE;
+#endif
+       if (cpu_has_vtag_icache)
+               config |= MIPS_CONF_VI;
+       kvm_write_c0_guest_config(cop0, config);
 
        /* Read the cache characteristics from the host Config1 Register */
        config1 = (read_c0_config1() & ~0x7f);
@@ -478,9 +462,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25);
 
        /* We unset some bits that we aren't emulating */
-       config1 &=
-           ~((1 << CP0C1_C2) | (1 << CP0C1_MD) | (1 << CP0C1_PC) |
-             (1 << CP0C1_WR) | (1 << CP0C1_CA));
+       config1 &= ~(MIPS_CONF1_C2 | MIPS_CONF1_MD | MIPS_CONF1_PC |
+                    MIPS_CONF1_WR | MIPS_CONF1_CA);
        kvm_write_c0_guest_config1(cop0, config1);
 
        /* Have config3, no tertiary/secondary caches implemented */
@@ -511,6 +494,17 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu,
+                                         u64 __user *indices)
+{
+       return 0;
+}
+
 static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
                                     const struct kvm_one_reg *reg,
                                     s64 *v)
@@ -660,6 +654,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        .dequeue_io_int = kvm_mips_dequeue_io_int_cb,
        .irq_deliver = kvm_mips_irq_deliver_cb,
        .irq_clear = kvm_mips_irq_clear_cb,
+       .num_regs = kvm_trap_emul_num_regs,
+       .copy_reg_indices = kvm_trap_emul_copy_reg_indices,
        .get_one_reg = kvm_trap_emul_get_one_reg,
        .set_one_reg = kvm_trap_emul_set_one_reg,
        .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs,
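The two callbacks added above let a KVM backend report registers beyond the common set. Purely as an illustration (none of the code below is part of this patch), a generic KVM_GET_REG_LIST-style caller could combine them as follows; kvm_mips_get_one_regs[] is assumed to be the common base list and example_copy_reg_indices() is a made-up name:

	/* Hypothetical caller sketch -- not part of this patch. */
	static int example_copy_reg_indices(struct kvm_vcpu *vcpu,
					    u64 __user *indices)
	{
		unsigned long i, n = ARRAY_SIZE(kvm_mips_get_one_regs);

		/* common register indices first (assumed base list) */
		for (i = 0; i < n; i++) {
			u64 index = kvm_mips_get_one_regs[i];

			if (put_user(index, indices + i))
				return -EFAULT;
		}

		/* then the implementation-specific ones reported via the new hooks */
		return kvm_mips_callbacks->copy_reg_indices(vcpu, indices + n);
	}

The matching list size would then be ARRAY_SIZE(kvm_mips_get_one_regs) + kvm_mips_callbacks->num_regs(vcpu); the trap-and-emulate backend currently reports zero extra registers.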
index d96e912..6dc07fb 100644 (file)
@@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
                                dec_insn.pc_inc +
                                dec_insn.next_pc_inc;
                return 1;
-       case cbcond0_op:
-       case cbcond1_op:
+       case pop10_op:
+       case pop30_op:
                if (!cpu_has_mips_r6)
                        break;
                if (insn.i_format.rt && !insn.i_format.rs)
@@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
                        dec_insn.next_pc_inc;
 
                return 1;
-       case beqzcjic_op:
+       case pop66_op:
                if (!cpu_has_mips_r6)
                        break;
                *contpc = regs->cp0_epc + dec_insn.pc_inc +
                        dec_insn.next_pc_inc;
 
                return 1;
-       case bnezcjialc_op:
+       case pop76_op:
                if (!cpu_has_mips_r6)
                        break;
                if (!insn.i_format.rs)
index ef7f925..7a9c345 100644 (file)
@@ -1206,7 +1206,7 @@ static void probe_pcache(void)
                              c->icache.linesz;
                c->icache.waybit = __ffs(icache_size/c->icache.ways);
 
-               if (config & 0x8)               /* VI bit */
+               if (config & MIPS_CONF_VI)
                        c->icache.flags |= MIPS_CACHE_VTAG;
 
                /*
index d78178d..277cf52 100644 (file)
@@ -53,8 +53,13 @@ static struct insn insn_table_MM[] = {
        { insn_bltzl, 0, 0 },
        { insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM },
        { insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM },
+       { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS },
+       { insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE },
+       { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS },
+       { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE },
        { insn_daddu, 0, 0 },
        { insn_daddiu, 0, 0 },
+       { insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS },
        { insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS },
        { insn_dmfc0, 0, 0 },
        { insn_dmtc0, 0, 0 },
@@ -84,6 +89,8 @@ static struct insn insn_table_MM[] = {
        { insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS },
        { insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS },
        { insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD },
+       { insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS },
+       { insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS },
        { insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD },
        { insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD },
        { insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM },
@@ -166,13 +173,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...)
        op = ip->match;
        va_start(ap, opc);
        if (ip->fields & RS) {
-               if (opc == insn_mfc0 || opc == insn_mtc0)
+               if (opc == insn_mfc0 || opc == insn_mtc0 ||
+                   opc == insn_cfc1 || opc == insn_ctc1)
                        op |= build_rt(va_arg(ap, u32));
                else
                        op |= build_rs(va_arg(ap, u32));
        }
        if (ip->fields & RT) {
-               if (opc == insn_mfc0 || opc == insn_mtc0)
+               if (opc == insn_mfc0 || opc == insn_mtc0 ||
+                   opc == insn_cfc1 || opc == insn_ctc1)
                        op |= build_rs(va_arg(ap, u32));
                else
                        op |= build_rt(va_arg(ap, u32));
index 9c2220a..cec5241 100644 (file)
@@ -67,9 +67,14 @@ static struct insn insn_table[] = {
 #else
        { insn_cache,  M6(cache_op, 0, 0, 0, cache6_op),  RS | RT | SIMM9 },
 #endif
+       { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD },
+       { insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE },
+       { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD },
+       { insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE },
        { insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
        { insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },
        { insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE },
+       { insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT },
        { insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE },
        { insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT },
        { insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET},
@@ -114,7 +119,13 @@ static struct insn insn_table[] = {
        { insn_mflo,  M(spec_op, 0, 0, 0, 0, mflo_op), RD },
        { insn_mtc0,  M(cop0_op, mtc_op, 0, 0, 0, 0),  RT | RD | SET},
        { insn_mthc0,  M(cop0_op, mthc0_op, 0, 0, 0, 0),  RT | RD | SET},
+       { insn_mthi,  M(spec_op, 0, 0, 0, 0, mthi_op), RS },
+       { insn_mtlo,  M(spec_op, 0, 0, 0, 0, mtlo_op), RS },
+#ifndef CONFIG_CPU_MIPSR6
        { insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
+#else
+       { insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD},
+#endif
        { insn_ori,  M(ori_op, 0, 0, 0, 0, 0),  RS | RT | UIMM },
        { insn_or,  M(spec_op, 0, 0, 0, 0, or_op),  RS | RT | RD },
 #ifndef CONFIG_CPU_MIPSR6
index ad718de..3e0282d 100644 (file)
@@ -49,18 +49,19 @@ enum opcode {
        insn_invalid,
        insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1,
        insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl,
-       insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm,
-       insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
+       insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa,
+       insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu,
+       insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
        insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret,
        insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb,
        insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw,
        insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0,
-       insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe,
-       insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt,
-       insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu,
-       insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi,
-       insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield,
-       insn_lddir, insn_ldpte,
+       insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori,
+       insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll,
+       insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl,
+       insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
+       insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
+       insn_xori, insn_yield, insn_lddir, insn_ldpte,
 };
 
 struct insn {
@@ -268,10 +269,15 @@ I_u1s2(_bltz)
 I_u1s2(_bltzl)
 I_u1u2s3(_bne)
 I_u2s3u1(_cache)
+I_u1u2(_cfc1)
+I_u2u1(_cfcmsa)
+I_u1u2(_ctc1)
+I_u2u1(_ctcmsa)
 I_u1u2u3(_dmfc0)
 I_u1u2u3(_dmtc0)
 I_u2u1s3(_daddiu)
 I_u3u1u2(_daddu)
+I_u1(_di);
 I_u1u2(_divu)
 I_u2u1u3(_dsll)
 I_u2u1u3(_dsll32)
@@ -301,6 +307,8 @@ I_u1(_mfhi)
 I_u1(_mflo)
 I_u1u2u3(_mtc0)
 I_u1u2u3(_mthc0)
+I_u1(_mthi)
+I_u1(_mtlo)
 I_u3u1u2(_mul)
 I_u2u1u3(_ori)
 I_u3u1u2(_or)
index f1b11f0..b4c02f2 100644 (file)
@@ -112,7 +112,14 @@ static void pcibios_scanbus(struct pci_controller *hose)
                need_domain_info = 1;
        }
 
-       if (!pci_has_flag(PCI_PROBE_ONLY)) {
+       /*
+        * We insert PCI resources into the iomem_resource and
+        * ioport_resource trees in either pci_bus_claim_resources()
+        * or pci_bus_assign_resources().
+        */
+       if (pci_has_flag(PCI_PROBE_ONLY)) {
+               pci_bus_claim_resources(bus);
+       } else {
                pci_bus_size_bridges(bus);
                pci_bus_assign_resources(bus);
        }
@@ -319,6 +326,16 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 EXPORT_SYMBOL(PCIBIOS_MIN_IO);
 EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
 
+void pci_resource_to_user(const struct pci_dev *dev, int bar,
+                         const struct resource *rsrc, resource_size_t *start,
+                         resource_size_t *end)
+{
+       phys_addr_t size = resource_size(rsrc);
+
+       *start = fixup_bigphys_addr(rsrc->start, size);
+       *end = rsrc->start + size;
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                        enum pci_mmap_state mmap_state, int write_combine)
 {
index 4cd612a..1a2a6e8 100644 (file)
@@ -43,7 +43,7 @@ ifeq ($(call cc-option-yn, -fstack-protector),y)
 BOOTCFLAGS     += -fno-stack-protector
 endif
 
-BOOTCFLAGS     += -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS     += -I$(objtree)/$(obj) -I$(srctree)/$(obj)
 
 DTC_FLAGS      ?= -p 1024
 
diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
new file mode 100644 (file)
index 0000000..88b4901
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Hypervisor Maintenance Interrupt header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __ASM_PPC64_HMI_H__
+#define __ASM_PPC64_HMI_H__
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+#define        CORE_TB_RESYNC_REQ_BIT          63
+#define MAX_SUBCORE_PER_CORE           4
+
+/*
+ * The sibling_subcore_state structure is used to co-ordinate all threads
+ * during an HMI to avoid TB corruption. This structure is allocated once
+ * per core and shared by all threads on that core.
+ */
+struct sibling_subcore_state {
+       unsigned long   flags;
+       u8              in_guest[MAX_SUBCORE_PER_CORE];
+};
+
+extern void wait_for_subcore_guest_exit(void);
+extern void wait_for_tb_resync(void);
+#else
+static inline void wait_for_subcore_guest_exit(void) { }
+static inline void wait_for_tb_resync(void) { }
+#endif
+#endif /* __ASM_PPC64_HMI_H__ */
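This header only declares the shared per-core state; the coordination protocol built on top of it lands later in this diff (arch/powerpc/kvm/book3s_hv_ras.c and arch/powerpc/kernel/traps.c). As a condensed, illustrative sketch of what a primary thread does on an HMI -- my_subcore_id is a hypothetical parameter, not a real kernel symbol:

	/* Illustrative condensation of the protocol implemented later in this diff. */
	static void example_primary_thread_hmi(int my_subcore_id)
	{
		bool resync_req;

		/* the first primary thread to set the flag owns the TB resync */
		resync_req = !test_and_set_bit(CORE_TB_RESYNC_REQ_BIT,
				&local_paca->sibling_subcore_state->flags);

		/* mark this subcore as out of the guest, then wait for the others */
		local_paca->sibling_subcore_state->in_guest[my_subcore_id] = 0;
		wait_for_subcore_guest_exit();

		/* every primary thread invokes the platform HMI handler */
		if (ppc_md.hmi_exception_early)
			ppc_md.hmi_exception_early(NULL);

		if (resync_req) {
			opal_resync_timebase();
			clear_bit(CORE_TB_RESYNC_REQ_BIT,
				  &local_paca->sibling_subcore_state->flags);
		} else {
			wait_for_tb_resync();
		}
	}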
index ad171e9..148303e 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/kvm_book3s_asm.h>
 #endif
 #include <asm/accounting.h>
+#include <asm/hmi.h>
 
 register struct paca_struct *local_paca asm("r13");
 
@@ -182,6 +183,11 @@ struct paca_struct {
         */
        u16 in_mce;
        u8 hmi_event_available;          /* HMI event is available */
+       /*
+        * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+        * more details
+        */
+       struct sibling_subcore_state *sibling_subcore_state;
 #endif
 
        /* Stuff for accurate time accounting */
index a6f3ac0..e9bd6cf 100644 (file)
@@ -136,9 +136,6 @@ extern pgprot_t     pci_phys_mem_access_prot(struct file *file,
                                         pgprot_t prot);
 
 #define HAVE_ARCH_PCI_RESOURCE_TO_USER
-extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
-                                const struct resource *rsrc,
-                                resource_size_t *start, resource_size_t *end);
 
 extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose);
 extern void pcibios_setup_bus_devices(struct pci_bus *bus);
index fe4c075..b2027a5 100644 (file)
@@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32)          += vdso32/
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)       += hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)    += cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)    += cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64)    += mce.o mce_power.o
+obj-$(CONFIG_PPC_BOOK3S_64)    += mce.o mce_power.o hmi.o
 obj-$(CONFIG_PPC_BOOK3E_64)    += exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC64)            += vdso64/
 obj-$(CONFIG_ALTIVEC)          += vecemu.o
index 6200e49..694def6 100644 (file)
@@ -671,6 +671,8 @@ BEGIN_FTR_SECTION
        beq     h_doorbell_common
        cmpwi   r3,0xea0
        beq     h_virt_irq_common
+       cmpwi   r3,0xe60
+       beq     hmi_exception_common
 FTR_SECTION_ELSE
        cmpwi   r3,0xa00
        beq     doorbell_super_common
@@ -1172,7 +1174,7 @@ fwnmi_data_area:
 
        .globl hmi_exception_early
 hmi_exception_early:
-       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+       EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
        mr      r10,r1                  /* Save r1                      */
        ld      r1,PACAEMERGSP(r13)     /* Use emergency stack          */
        subi    r1,r1,INT_FRAME_SIZE    /* alloc stack frame            */
diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c
new file mode 100644 (file)
index 0000000..e3f738e
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+
+void wait_for_subcore_guest_exit(void)
+{
+       int i;
+
+       /*
+        * A NULL bitmap pointer indicates that the KVM module hasn't
+        * been loaded yet and hence no guests are running.
+        * If no KVM is in use, there is no need to co-ordinate among threads,
+        * as all of them will always be in the host and nothing other than
+        * the opal hmi handler is going to modify the TB.
+        * Hence, just return from here.
+        */
+       if (!local_paca->sibling_subcore_state)
+               return;
+
+       for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+               while (local_paca->sibling_subcore_state->in_guest[i])
+                       cpu_relax();
+}
+
+void wait_for_tb_resync(void)
+{
+       if (!local_paca->sibling_subcore_state)
+               return;
+
+       while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+                               &local_paca->sibling_subcore_state->flags))
+               cpu_relax();
+}
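These helpers are consumed by the host HMI path; the arch/powerpc/kernel/traps.c hunk later in this diff wires them into hmi_exception_realmode(). For orientation, the resulting call order is roughly:

	long hmi_exception_realmode(struct pt_regs *regs)
	{
		__this_cpu_inc(irq_stat.hmi_exceptions);

		/* let any sibling subcore finish its guest exit first */
		wait_for_subcore_guest_exit();

		if (ppc_md.hmi_exception_early)
			ppc_md.hmi_exception_early(regs);

		/* block until the thread that owns the TB resync has finished */
		wait_for_tb_resync();

		return 0;
	}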
index 335eb6c..8a56a51 100644 (file)
@@ -336,7 +336,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);            \
        ld      r2,PACATOC(r13);                                        \
        ld      r1,PACAR1(r13);                                         \
        std     r3,ORIG_GPR3(r1);       /* Save original r3 */          \
-       bl      opal_rm_handle_hmi;                                     \
+       li      r3,0;                   /* NULL argument */             \
+       bl      hmi_exception_realmode;                                 \
+       nop;                                                            \
        ld      r3,ORIG_GPR3(r1);       /* Restore original r3 */       \
 20:    nop;
 
index f93942b..a5c0153 100644 (file)
@@ -411,36 +411,6 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
        return NULL;
 }
 
-/*
- * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
- * device mapping.
- */
-static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
-                                     pgprot_t protection,
-                                     enum pci_mmap_state mmap_state,
-                                     int write_combine)
-{
-
-       /* Write combine is always 0 on non-memory space mappings. On
-        * memory space, if the user didn't pass 1, we check for a
-        * "prefetchable" resource. This is a bit hackish, but we use
-        * this to workaround the inability of /sysfs to provide a write
-        * combine bit
-        */
-       if (mmap_state != pci_mmap_mem)
-               write_combine = 0;
-       else if (write_combine == 0) {
-               if (rp->flags & IORESOURCE_PREFETCH)
-                       write_combine = 1;
-       }
-
-       /* XXX would be nice to have a way to ask for write-through */
-       if (write_combine)
-               return pgprot_noncached_wc(protection);
-       else
-               return pgprot_noncached(protection);
-}
-
 /*
  * This one is used by /dev/mem and fbdev who have no clue about the
  * PCI device, it tries to find the PCI device first and calls the
@@ -514,9 +484,10 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                return -EINVAL;
 
        vma->vm_pgoff = offset >> PAGE_SHIFT;
-       vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
-                                                 vma->vm_page_prot,
-                                                 mmap_state, write_combine);
+       if (write_combine)
+               vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
+       else
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
        ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                               vma->vm_end - vma->vm_start, vma->vm_page_prot);
@@ -666,39 +637,25 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar,
                          const struct resource *rsrc,
                          resource_size_t *start, resource_size_t *end)
 {
-       struct pci_controller *hose = pci_bus_to_host(dev->bus);
-       resource_size_t offset = 0;
+       struct pci_bus_region region;
 
-       if (hose == NULL)
+       if (rsrc->flags & IORESOURCE_IO) {
+               pcibios_resource_to_bus(dev->bus, &region,
+                                       (struct resource *) rsrc);
+               *start = region.start;
+               *end = region.end;
                return;
+       }
 
-       if (rsrc->flags & IORESOURCE_IO)
-               offset = (unsigned long)hose->io_base_virt - _IO_BASE;
-
-       /* We pass a fully fixed up address to userland for MMIO instead of
-        * a BAR value because X is lame and expects to be able to use that
-        * to pass to /dev/mem !
-        *
-        * That means that we'll have potentially 64 bits values where some
-        * userland apps only expect 32 (like X itself since it thinks only
-        * Sparc has 64 bits MMIO) but if we don't do that, we break it on
-        * 32 bits CHRPs :-(
-        *
-        * Hopefully, the sysfs insterface is immune to that gunk. Once X
-        * has been fixed (and the fix spread enough), we can re-enable the
-        * 2 lines below and pass down a BAR value to userland. In that case
-        * we'll also have to re-enable the matching code in
-        * __pci_mmap_make_offset().
+       /* We pass a CPU physical address to userland for MMIO instead of a
+        * BAR value because X is lame and expects to be able to use that
+        * to pass to /dev/mem!
         *
-        * BenH.
+        * That means we may have 64-bit values where some apps only expect
+        * 32 (like X itself since it thinks only Sparc has 64-bit MMIO).
         */
-#if 0
-       else if (rsrc->flags & IORESOURCE_MEM)
-               offset = hose->pci_mem_offset;
-#endif
-
-       *start = rsrc->start - offset;
-       *end = rsrc->end - offset;
+       *start = rsrc->start;
+       *end = rsrc->end;
 }
 
 /**
index f7e2f2e..2cb5892 100644 (file)
@@ -61,6 +61,7 @@
 #include <asm/tm.h>
 #include <asm/debug.h>
 #include <asm/asm-prototypes.h>
+#include <asm/hmi.h>
 #include <sysdev/fsl_pci.h>
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
@@ -308,9 +309,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
 {
        __this_cpu_inc(irq_stat.hmi_exceptions);
 
+       wait_for_subcore_guest_exit();
+
        if (ppc_md.hmi_exception_early)
                ppc_md.hmi_exception_early(regs);
 
+       wait_for_tb_resync();
+
        return 0;
 }
 
index eba0bea..1f9e552 100644 (file)
@@ -20,7 +20,7 @@ common-objs-y += powerpc.o emulate.o emulate_loadstore.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
 obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
 
-AFLAGS_booke_interrupts.o := -I$(obj)
+AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
 
 kvm-e500-objs := \
        $(common-objs-y) \
index e20beae..2fd5580 100644 (file)
@@ -52,6 +52,7 @@
 #include <asm/switch_to.h>
 #include <asm/smp.h>
 #include <asm/dbell.h>
+#include <asm/hmi.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -2522,7 +2523,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
                        spin_unlock(&pvc->lock);
 
-       kvm_guest_enter();
+       guest_enter();
 
        srcu_idx = srcu_read_lock(&vc->kvm->srcu);
 
@@ -2570,7 +2571,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
        /* make sure updates to secondary vcpu structs are visible now */
        smp_mb();
-       kvm_guest_exit();
+       guest_exit();
 
        for (sub = 0; sub < core_info.n_subcores; ++sub)
                list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
@@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = {
        .hcall_implemented = kvmppc_hcall_impl_hv,
 };
 
+static int kvm_init_subcore_bitmap(void)
+{
+       int i, j;
+       int nr_cores = cpu_nr_cores();
+       struct sibling_subcore_state *sibling_subcore_state;
+
+       for (i = 0; i < nr_cores; i++) {
+               int first_cpu = i * threads_per_core;
+               int node = cpu_to_node(first_cpu);
+
+               /* Ignore if it is already allocated. */
+               if (paca[first_cpu].sibling_subcore_state)
+                       continue;
+
+               sibling_subcore_state =
+                       kmalloc_node(sizeof(struct sibling_subcore_state),
+                                                       GFP_KERNEL, node);
+               if (!sibling_subcore_state)
+                       return -ENOMEM;
+
+               memset(sibling_subcore_state, 0,
+                               sizeof(struct sibling_subcore_state));
+
+               for (j = 0; j < threads_per_core; j++) {
+                       int cpu = first_cpu + j;
+
+                       paca[cpu].sibling_subcore_state = sibling_subcore_state;
+               }
+       }
+       return 0;
+}
+
 static int kvmppc_book3s_init_hv(void)
 {
        int r;
@@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void)
        if (r < 0)
                return -ENODEV;
 
+       r = kvm_init_subcore_bitmap();
+       if (r)
+               return r;
+
        kvm_ops_hv.owner = THIS_MODULE;
        kvmppc_hv_ops = &kvm_ops_hv;
 
index 93b5f5c..0fa70a9 100644 (file)
@@ -13,6 +13,9 @@
 #include <linux/kernel.h>
 #include <asm/opal.h>
 #include <asm/mce.h>
+#include <asm/machdep.h>
+#include <asm/cputhreads.h>
+#include <asm/hmi.h>
 
 /* SRR1 bits for machine check on POWER7 */
 #define SRR1_MC_LDSTERR                (1ul << (63-42))
@@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
 {
        return kvmppc_realmode_mc_power7(vcpu);
 }
+
+/* Check if dynamic split is in force and return subcore size accordingly. */
+static inline int kvmppc_cur_subcore_size(void)
+{
+       if (local_paca->kvm_hstate.kvm_split_mode)
+               return local_paca->kvm_hstate.kvm_split_mode->subcore_size;
+
+       return threads_per_subcore;
+}
+
+void kvmppc_subcore_enter_guest(void)
+{
+       int thread_id, subcore_id;
+
+       thread_id = cpu_thread_in_core(local_paca->paca_index);
+       subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+       local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
+}
+
+void kvmppc_subcore_exit_guest(void)
+{
+       int thread_id, subcore_id;
+
+       thread_id = cpu_thread_in_core(local_paca->paca_index);
+       subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+       local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
+}
+
+static bool kvmppc_tb_resync_required(void)
+{
+       if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT,
+                               &local_paca->sibling_subcore_state->flags))
+               return false;
+
+       return true;
+}
+
+static void kvmppc_tb_resync_done(void)
+{
+       clear_bit(CORE_TB_RESYNC_REQ_BIT,
+                       &local_paca->sibling_subcore_state->flags);
+}
+
+/*
+ * kvmppc_realmode_hmi_handler() is called only by the primary thread during
+ * the guest exit path.
+ *
+ * There are multiple reasons why an HMI could occur, one of them being a
+ * Timebase (TB) error. If this HMI is due to a TB error, the TB would
+ * have been left in a stopped state. The opal hmi handler will fix it and
+ * restore the TB value with the host timebase value. For HMIs caused by
+ * non-TB errors, the opal hmi handler will not touch/restore the TB
+ * register and hence there won't be any change in the TB value.
+ *
+ * Since we are not sure about the cause of this HMI, we can't be sure
+ * about the content of the TB register, i.e. whether it holds the guest or
+ * the host timebase value. Hence the idea is to resync the TB on every HMI,
+ * so that we know the exact state of the TB value. The TB resync call will
+ * restore the TB to the host timebase.
+ *
+ * Things to consider:
+ * - On a TB error, the HMI interrupt is reported on all the threads of the
+ *   core that encountered the TB error, irrespective of split-core mode.
+ * - The very first thread on the core that gets a chance to fix the TB error
+ *   resyncs the TB with the local chipTOD value.
+ * - The TB resync is a core-level action, i.e. it will sync all the TBs
+ *   in that core independent of split-core mode. This means that if we
+ *   trigger a TB sync from a thread in one subcore, it will affect the TB
+ *   values of sibling subcores of the same core.
+ *
+ * All threads need to co-ordinate before making the opal hmi handler call.
+ * All threads will use sibling_subcore_state->in_guest[] (shared by all
+ * threads in the core) in paca which holds information about whether
+ * sibling subcores are in Guest mode or host mode. The in_guest[] array
+ * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset
+ * subcore status. Only the primary thread of each subcore is responsible
+ * for setting/unsetting its designated array element while entering/exiting
+ * the guest.
+ *
+ * After invoking the opal hmi handler call, one of the threads (of the entire core)
+ * will need to resync the TB. Bit 63 from subcore state bitmap flags
+ * (sibling_subcore_state->flags) will be used to co-ordinate between
+ * primary threads to decide who takes up the responsibility.
+ *
+ * This is what we do:
+ * - The primary thread of each subcore tries to set the resync required
+ *   bit[63] of paca->sibling_subcore_state->flags.
+ * - The first primary thread that is able to set the flag takes the
+ *   responsibility for the TB resync. (Let us call it the thread leader.)
+ * - All other threads which are in the host will call
+ *   wait_for_subcore_guest_exit() and wait for in_guest[0-3] from
+ *   paca->sibling_subcore_state to get cleared.
+ * - Each primary thread will clear its subcore status from the subcore
+ *   state in_guest[] array.
+ * - Once all primary threads have cleared in_guest[0-3], all of them will
+ *   invoke the opal hmi handler.
+ * - All threads except the thread leader will now wait for the TB resync
+ *   to complete by invoking wait_for_tb_resync().
+ * - The thread leader does the TB resync by invoking the
+ *   opal_resync_timebase() call, and then it clears the resync required bit.
+ * - All other threads will then come out of the resync wait loop and proceed
+ *   with individual execution.
+ * - On return from this function, the primary thread will signal all
+ *   secondary threads to proceed.
+ * - All secondary threads will eventually call the opal hmi handler on
+ *   their exit path.
+ */
+
+long kvmppc_realmode_hmi_handler(void)
+{
+       int ptid = local_paca->kvm_hstate.ptid;
+       bool resync_req;
+
+       /* This is only called on primary thread. */
+       BUG_ON(ptid != 0);
+       __this_cpu_inc(irq_stat.hmi_exceptions);
+
+       /*
+        * By now the primary thread has already completed the guest->host
+        * partition switch but hasn't signaled the secondaries yet.
+        * All the secondary threads on this subcore are waiting
+        * for the primary thread to signal them to go ahead.
+        *
+        * Threads from a subcore which isn't in the guest will all
+        * wait until all other subcores on this core exit the guest.
+        *
+        * Now set the resync required bit. If this thread is the first to
+        * set this bit then the kvmppc_tb_resync_required() function will
+        * return true; for all other subcores
+        * kvmppc_tb_resync_required() will return false.
+        *
+        * If resync_req == true, then this thread is responsible for
+        * initiating the TB resync after the hmi handler has completed.
+        * All other threads on this core will wait until this thread
+        * clears the resync required bit flag.
+        */
+       resync_req = kvmppc_tb_resync_required();
+
+       /* Reset the subcore status to indicate it has exited guest */
+       kvmppc_subcore_exit_guest();
+
+       /*
+        * Wait for other subcores on this core to exit the guest.
+        * All the primary threads, and the threads from subcores that are
+        * not in the guest, will wait here until all subcores are out
+        * of guest context.
+        */
+       wait_for_subcore_guest_exit();
+
+       /*
+        * At this point we are sure that primary threads from each
+        * subcore on this core have completed guest->host partition
+        * switch. Now it is safe to call HMI handler.
+        */
+       if (ppc_md.hmi_exception_early)
+               ppc_md.hmi_exception_early(NULL);
+
+       /*
+        * Check if this thread is responsible to resync TB.
+        * All other threads will wait until this thread completes the
+        * TB resync.
+        */
+       if (resync_req) {
+               opal_resync_timebase();
+               /* Reset TB resync req bit */
+               kvmppc_tb_resync_done();
+       } else {
+               wait_for_tb_resync();
+       }
+       return 0;
+}
index 86f0cae..9756555 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/kvm_book3s_asm.h>
 #include <asm/book3s/64/mmu-hash.h>
 #include <asm/tm.h>
+#include <asm/opal.h>
 
 #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
 
@@ -373,6 +374,18 @@ kvm_secondary_got_guest:
        lwsync
        std     r0, HSTATE_KVM_VCORE(r13)
 
+       /*
+        * All secondaries exiting the guest will fall through this path.
+        * Before proceeding, just check for an HMI interrupt and
+        * invoke the opal hmi handler. By now we are sure that the
+        * primary thread on this core/subcore has already done the partition
+        * switch/TB resync and we are good to call the opal hmi handler.
+        */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     kvm_no_guest
+
+       li      r3,0                    /* NULL argument */
+       bl      hmi_exception_realmode
 /*
  * At this point we have finished executing in the guest.
  * We need to wait for hwthread_req to become zero, since
@@ -427,6 +440,22 @@ kvm_no_guest:
  * whole-core mode, so we need to nap.
  */
 kvm_unsplit_nap:
+       /*
+        * When secondaries are napping in kvm_unsplit_nap() with
+        * hwthread_req = 1, an HMI is ignored even though the subcores have
+        * already exited the guest. Hence the HMI keeps waking up secondaries
+        * from nap in a loop and the secondaries always go back to nap since
+        * no vcore is assigned to them. This makes it impossible for the
+        * primary thread to get hold of the secondary threads, resulting in a
+        * soft lockup in the KVM path.
+        *
+        * Let us check if an HMI is pending and handle it before we go to nap.
+        */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     55f
+       li      r3, 0                   /* NULL argument */
+       bl      hmi_exception_realmode
+55:
        /*
         * Ensure that secondary doesn't nap when it has
         * its vcore pointer set.
@@ -601,6 +630,11 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
+       /* Mark the subcore state as inside guest */
+       bl      kvmppc_subcore_enter_guest
+       nop
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       ld      r4, HSTATE_KVM_VCPU(r13)
        li      r0,1
        stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
 
@@ -655,112 +689,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
-       b       skip_tm
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-
-       /* Turn on TM/FP/VSX/VMX so we can restore them. */
-       mfmsr   r5
-       li      r6, MSR_TM >> 32
-       sldi    r6, r6, 32
-       or      r5, r5, r6
-       ori     r5, r5, MSR_FP
-       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
-       mtmsrd  r5
-
-       /*
-        * The user may change these outside of a transaction, so they must
-        * always be context switched.
-        */
-       ld      r5, VCPU_TFHAR(r4)
-       ld      r6, VCPU_TFIAR(r4)
-       ld      r7, VCPU_TEXASR(r4)
-       mtspr   SPRN_TFHAR, r5
-       mtspr   SPRN_TFIAR, r6
-       mtspr   SPRN_TEXASR, r7
-
-       ld      r5, VCPU_MSR(r4)
-       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beq     skip_tm /* TM not active in guest */
-
-       /* Make sure the failure summary is set, otherwise we'll program check
-        * when we trechkpt.  It's possible that this might have been not set
-        * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
-        * host.
-        */
-       oris    r7, r7, (TEXASR_FS)@h
-       mtspr   SPRN_TEXASR, r7
-
-       /*
-        * We need to load up the checkpointed state for the guest.
-        * We need to do this early as it will blow away any GPRs, VSRs and
-        * some SPRs.
-        */
-
-       mr      r31, r4
-       addi    r3, r31, VCPU_FPRS_TM
-       bl      load_fp_state
-       addi    r3, r31, VCPU_VRS_TM
-       bl      load_vr_state
-       mr      r4, r31
-       lwz     r7, VCPU_VRSAVE_TM(r4)
-       mtspr   SPRN_VRSAVE, r7
-
-       ld      r5, VCPU_LR_TM(r4)
-       lwz     r6, VCPU_CR_TM(r4)
-       ld      r7, VCPU_CTR_TM(r4)
-       ld      r8, VCPU_AMR_TM(r4)
-       ld      r9, VCPU_TAR_TM(r4)
-       mtlr    r5
-       mtcr    r6
-       mtctr   r7
-       mtspr   SPRN_AMR, r8
-       mtspr   SPRN_TAR, r9
-
-       /*
-        * Load up PPR and DSCR values but don't put them in the actual SPRs
-        * till the last moment to avoid running with userspace PPR and DSCR for
-        * too long.
-        */
-       ld      r29, VCPU_DSCR_TM(r4)
-       ld      r30, VCPU_PPR_TM(r4)
-
-       std     r2, PACATMSCRATCH(r13) /* Save TOC */
-
-       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-
-       /* Load GPRs r0-r28 */
-       reg = 0
-       .rept   29
-       ld      reg, VCPU_GPRS_TM(reg)(r31)
-       reg = reg + 1
-       .endr
-
-       mtspr   SPRN_DSCR, r29
-       mtspr   SPRN_PPR, r30
-
-       /* Load final GPRs */
-       ld      29, VCPU_GPRS_TM(29)(r31)
-       ld      30, VCPU_GPRS_TM(30)(r31)
-       ld      31, VCPU_GPRS_TM(31)(r31)
-
-       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
-       TRECHKPT
-
-       /* Now let's get back the state we need. */
-       HMT_MEDIUM
-       GET_PACA(r13)
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-       ld      r4, HSTATE_KVM_VCPU(r13)
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATMSCRATCH(r13)
-
-       /* Set the MSR RI since we have our registers back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
-skip_tm:
+       bl      kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
        /* Load guest PMU registers */
@@ -841,12 +771,6 @@ BEGIN_FTR_SECTION
        /* Skip next section on POWER7 */
        b       8f
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-       /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
-
        /* Load up POWER8-specific registers */
        ld      r5, VCPU_IAMR(r4)
        lwz     r6, VCPU_PSPB(r4)
@@ -1436,106 +1360,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
-       b       2f
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-       /* Turn on TM. */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
-
-       ld      r5, VCPU_MSR(r9)
-       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beq     1f      /* TM not active in guest. */
-
-       li      r3, TM_CAUSE_KVM_RESCHED
-
-       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-
-       /* All GPRs are volatile at this point. */
-       TRECLAIM(R3)
-
-       /* Temporarily store r13 and r9 so we have some regs to play with */
-       SET_SCRATCH0(r13)
-       GET_PACA(r13)
-       std     r9, PACATMSCRATCH(r13)
-       ld      r9, HSTATE_KVM_VCPU(r13)
-
-       /* Get a few more GPRs free. */
-       std     r29, VCPU_GPRS_TM(29)(r9)
-       std     r30, VCPU_GPRS_TM(30)(r9)
-       std     r31, VCPU_GPRS_TM(31)(r9)
-
-       /* Save away PPR and DSCR soon so don't run with user values. */
-       mfspr   r31, SPRN_PPR
-       HMT_MEDIUM
-       mfspr   r30, SPRN_DSCR
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-
-       /* Save all but r9, r13 & r29-r31 */
-       reg = 0
-       .rept   29
-       .if (reg != 9) && (reg != 13)
-       std     reg, VCPU_GPRS_TM(reg)(r9)
-       .endif
-       reg = reg + 1
-       .endr
-       /* ... now save r13 */
-       GET_SCRATCH0(r4)
-       std     r4, VCPU_GPRS_TM(13)(r9)
-       /* ... and save r9 */
-       ld      r4, PACATMSCRATCH(r13)
-       std     r4, VCPU_GPRS_TM(9)(r9)
-
-       /* Reload stack pointer and TOC. */
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATOC(r13)
-
-       /* Set MSR RI now we have r1 and r13 back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
-
-       /* Save away checkpinted SPRs. */
-       std     r31, VCPU_PPR_TM(r9)
-       std     r30, VCPU_DSCR_TM(r9)
-       mflr    r5
-       mfcr    r6
-       mfctr   r7
-       mfspr   r8, SPRN_AMR
-       mfspr   r10, SPRN_TAR
-       std     r5, VCPU_LR_TM(r9)
-       stw     r6, VCPU_CR_TM(r9)
-       std     r7, VCPU_CTR_TM(r9)
-       std     r8, VCPU_AMR_TM(r9)
-       std     r10, VCPU_TAR_TM(r9)
-
-       /* Restore r12 as trap number. */
-       lwz     r12, VCPU_TRAP(r9)
-
-       /* Save FP/VSX. */
-       addi    r3, r9, VCPU_FPRS_TM
-       bl      store_fp_state
-       addi    r3, r9, VCPU_VRS_TM
-       bl      store_vr_state
-       mfspr   r6, SPRN_VRSAVE
-       stw     r6, VCPU_VRSAVE_TM(r9)
-1:
-       /*
-        * We need to save these SPRs after the treclaim so that the software
-        * error code is recorded correctly in the TEXASR.  Also the user may
-        * change these outside of a transaction, so they must always be
-        * context switched.
-        */
-       mfspr   r5, SPRN_TFHAR
-       mfspr   r6, SPRN_TFIAR
-       mfspr   r7, SPRN_TEXASR
-       std     r5, VCPU_TFHAR(r9)
-       std     r6, VCPU_TFIAR(r9)
-       std     r7, VCPU_TEXASR(r9)
-2:
+       bl      kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
        /* Increment yield count if they have a VPA */
@@ -1683,6 +1509,23 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
+       /* If HMI, call kvmppc_realmode_hmi_handler() */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     27f
+       bl      kvmppc_realmode_hmi_handler
+       nop
+       li      r12, BOOK3S_INTERRUPT_HMI
+       /*
+        * At this point kvmppc_realmode_hmi_handler would have resync-ed
+        * the TB. Hence it is not required to subtract guest timebase
+        * offset from timebase. So, skip it.
+        *
+        * Also, do not call kvmppc_subcore_exit_guest() because it has
+        * been invoked as part of kvmppc_realmode_hmi_handler().
+        */
+       b       30f
+
+27:
        /* Subtract timebase offset from timebase */
        ld      r8,VCORE_TB_OFFSET(r5)
        cmpdi   r8,0
@@ -1698,8 +1541,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        addis   r8,r8,0x100             /* if so, increment upper 40 bits */
        mtspr   SPRN_TBU40,r8
 
+17:    bl      kvmppc_subcore_exit_guest
+       nop
+30:    ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r4,VCORE_KVM(r5)        /* pointer to struct kvm */
+
        /* Reset PCR */
-17:    ld      r0, VCORE_PCR(r5)
+       ld      r0, VCORE_PCR(r5)
        cmpdi   r0, 0
        beq     18f
        li      r0, 0
@@ -2245,6 +2093,13 @@ _GLOBAL(kvmppc_h_cede)           /* r3 = vcpu pointer, r11 = msr, r13 = paca */
        /* save FP state */
        bl      kvmppc_save_fp
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       bl      kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
        /*
         * Set DEC to the smaller of DEC and HDEC, so that we wake
         * no later than the end of our timeslice (HDEC interrupts
@@ -2321,6 +2176,12 @@ kvm_end_cede:
        bl      kvmhv_accumulate_time
 #endif
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+       bl      kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
        /* load up FP state */
        bl      kvmppc_load_fp
 
@@ -2461,6 +2322,8 @@ BEGIN_FTR_SECTION
        cmpwi   r6, 3                   /* hypervisor doorbell? */
        beq     3f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       cmpwi   r6, 0xa                 /* Hypervisor maintenance ? */
+       beq     4f
        li      r3, 1                   /* anything else, return 1 */
 0:     blr
 
@@ -2482,6 +2345,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, -1
        blr
 
+       /* Woken up due to Hypervisor maintenance interrupt */
+4:     li      r12, BOOK3S_INTERRUPT_HMI
+       li      r3, 1
+       blr
+
 /*
  * Determine what sort of external interrupt is pending (if any).
  * Returns:
@@ -2631,6 +2499,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
        mr      r4,r31
        blr
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Save transactional state and TM-related registers.
+ * Called with r9 pointing to the vcpu struct.
+ * This can modify all checkpointed registers, but
+ * restores r1, r2 and r9 (vcpu pointer) before exit.
+ */
+kvmppc_save_tm:
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+
+       /* Turn on TM. */
+       mfmsr   r8
+       li      r0, 1
+       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+       mtmsrd  r8
+
+       ld      r5, VCPU_MSR(r9)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beq     1f      /* TM not active in guest. */
+
+       std     r1, HSTATE_HOST_R1(r13)
+       li      r3, TM_CAUSE_KVM_RESCHED
+
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+
+       /* All GPRs are volatile at this point. */
+       TRECLAIM(R3)
+
+       /* Temporarily store r13 and r9 so we have some regs to play with */
+       SET_SCRATCH0(r13)
+       GET_PACA(r13)
+       std     r9, PACATMSCRATCH(r13)
+       ld      r9, HSTATE_KVM_VCPU(r13)
+
+       /* Get a few more GPRs free. */
+       std     r29, VCPU_GPRS_TM(29)(r9)
+       std     r30, VCPU_GPRS_TM(30)(r9)
+       std     r31, VCPU_GPRS_TM(31)(r9)
+
+       /* Save away PPR and DSCR soon so we don't run with user values. */
+       mfspr   r31, SPRN_PPR
+       HMT_MEDIUM
+       mfspr   r30, SPRN_DSCR
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+
+       /* Save all but r9, r13 & r29-r31 */
+       reg = 0
+       .rept   29
+       .if (reg != 9) && (reg != 13)
+       std     reg, VCPU_GPRS_TM(reg)(r9)
+       .endif
+       reg = reg + 1
+       .endr
+       /* ... now save r13 */
+       GET_SCRATCH0(r4)
+       std     r4, VCPU_GPRS_TM(13)(r9)
+       /* ... and save r9 */
+       ld      r4, PACATMSCRATCH(r13)
+       std     r4, VCPU_GPRS_TM(9)(r9)
+
+       /* Reload stack pointer and TOC. */
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATOC(r13)
+
+       /* Set MSR RI now we have r1 and r13 back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+
+       /* Save away checkpointed SPRs. */
+       std     r31, VCPU_PPR_TM(r9)
+       std     r30, VCPU_DSCR_TM(r9)
+       mflr    r5
+       mfcr    r6
+       mfctr   r7
+       mfspr   r8, SPRN_AMR
+       mfspr   r10, SPRN_TAR
+       std     r5, VCPU_LR_TM(r9)
+       stw     r6, VCPU_CR_TM(r9)
+       std     r7, VCPU_CTR_TM(r9)
+       std     r8, VCPU_AMR_TM(r9)
+       std     r10, VCPU_TAR_TM(r9)
+
+       /* Restore r12 as trap number. */
+       lwz     r12, VCPU_TRAP(r9)
+
+       /* Save FP/VSX. */
+       addi    r3, r9, VCPU_FPRS_TM
+       bl      store_fp_state
+       addi    r3, r9, VCPU_VRS_TM
+       bl      store_vr_state
+       mfspr   r6, SPRN_VRSAVE
+       stw     r6, VCPU_VRSAVE_TM(r9)
+1:
+       /*
+        * We need to save these SPRs after the treclaim so that the software
+        * error code is recorded correctly in the TEXASR.  Also the user may
+        * change these outside of a transaction, so they must always be
+        * context switched.
+        */
+       mfspr   r5, SPRN_TFHAR
+       mfspr   r6, SPRN_TFIAR
+       mfspr   r7, SPRN_TEXASR
+       std     r5, VCPU_TFHAR(r9)
+       std     r6, VCPU_TFIAR(r9)
+       std     r7, VCPU_TEXASR(r9)
+
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
+
+/*
+ * Restore transactional state and TM-related registers.
+ * Called with r4 pointing to the vcpu struct.
+ * This potentially modifies all checkpointed registers.
+ * It restores r1, r2, r4 from the PACA.
+ */
+kvmppc_restore_tm:
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+
+       /* Turn on TM/FP/VSX/VMX so we can restore them. */
+       mfmsr   r5
+       li      r6, MSR_TM >> 32
+       sldi    r6, r6, 32
+       or      r5, r5, r6
+       ori     r5, r5, MSR_FP
+       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
+       mtmsrd  r5
+
+       /*
+        * The user may change these outside of a transaction, so they must
+        * always be context switched.
+        */
+       ld      r5, VCPU_TFHAR(r4)
+       ld      r6, VCPU_TFIAR(r4)
+       ld      r7, VCPU_TEXASR(r4)
+       mtspr   SPRN_TFHAR, r5
+       mtspr   SPRN_TFIAR, r6
+       mtspr   SPRN_TEXASR, r7
+
+       ld      r5, VCPU_MSR(r4)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beqlr           /* TM not active in guest */
+       std     r1, HSTATE_HOST_R1(r13)
+
+       /* Make sure the failure summary is set, otherwise we'll program check
+        * when we trechkpt.  It's possible that it was not set by a
+        * kvmppc_set_one_reg() call, but we shouldn't let that crash the
+        * host.
+        */
+       oris    r7, r7, (TEXASR_FS)@h
+       mtspr   SPRN_TEXASR, r7
+
+       /*
+        * We need to load up the checkpointed state for the guest.
+        * We need to do this early as it will blow away any GPRs, VSRs and
+        * some SPRs.
+        */
+
+       mr      r31, r4
+       addi    r3, r31, VCPU_FPRS_TM
+       bl      load_fp_state
+       addi    r3, r31, VCPU_VRS_TM
+       bl      load_vr_state
+       mr      r4, r31
+       lwz     r7, VCPU_VRSAVE_TM(r4)
+       mtspr   SPRN_VRSAVE, r7
+
+       ld      r5, VCPU_LR_TM(r4)
+       lwz     r6, VCPU_CR_TM(r4)
+       ld      r7, VCPU_CTR_TM(r4)
+       ld      r8, VCPU_AMR_TM(r4)
+       ld      r9, VCPU_TAR_TM(r4)
+       mtlr    r5
+       mtcr    r6
+       mtctr   r7
+       mtspr   SPRN_AMR, r8
+       mtspr   SPRN_TAR, r9
+
+       /*
+        * Load up PPR and DSCR values but don't put them in the actual SPRs
+        * till the last moment to avoid running with userspace PPR and DSCR for
+        * too long.
+        */
+       ld      r29, VCPU_DSCR_TM(r4)
+       ld      r30, VCPU_PPR_TM(r4)
+
+       std     r2, PACATMSCRATCH(r13) /* Save TOC */
+
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+
+       /* Load GPRs r0-r28 */
+       reg = 0
+       .rept   29
+       ld      reg, VCPU_GPRS_TM(reg)(r31)
+       reg = reg + 1
+       .endr
+
+       mtspr   SPRN_DSCR, r29
+       mtspr   SPRN_PPR, r30
+
+       /* Load final GPRs */
+       ld      29, VCPU_GPRS_TM(29)(r31)
+       ld      30, VCPU_GPRS_TM(30)(r31)
+       ld      31, VCPU_GPRS_TM(31)(r31)
+
+       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
+       TRECHKPT
+
+       /* Now let's get back the state we need. */
+       HMT_MEDIUM
+       GET_PACA(r13)
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATMSCRATCH(r13)
+
+       /* Set the MSR RI since we have our registers back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
+#endif
+
 /*
  * We come here if we get any exception or interrupt while we are
  * executing host real mode code while in guest MMU context.
index c4f7d6b..e76f79a 100644 (file)
@@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
        /* We get here with MSR.EE=1 */
 
        trace_kvm_exit(exit_nr, vcpu);
-       kvm_guest_exit();
+       guest_exit();
 
        switch (exit_nr) {
        case BOOK3S_INTERRUPT_INST_STORAGE:
@@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                int emul;
 
 program_interrupt:
-               flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+               /*
+                * shadow_srr1 only contains valid flags if we came here via
+                * a program exception. The other exceptions (emulation assist,
+                * FP unavailable, etc.) do not provide flags in SRR1, so use
+                * an illegal-instruction exception when injecting a program
+                * interrupt into the guest.
+                */
+               if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+                       flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+               else
+                       flags = SRR1_PROGILL;
 
                emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
                if (emul != EMULATE_DONE) {
@@ -1531,7 +1541,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        kvmppc_clear_debug(vcpu);
 
-       /* No need for kvm_guest_exit. It's done in handle_exit.
+       /* No need for guest_exit. It's done in handle_exit.
           We also get here with interrupts enabled. */
 
        /* Make sure we save the guest FPU/Altivec/VSX state */
index 4afae69..02b4672 100644 (file)
@@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        ret = __kvmppc_vcpu_run(kvm_run, vcpu);
 
-       /* No need for kvm_guest_exit. It's done in handle_exit.
+       /* No need for guest_exit. It's done in handle_exit.
           We also get here with interrupts enabled. */
 
        /* Switch back to user space debug context */
@@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
        }
 
        trace_kvm_exit(exit_nr, vcpu);
-       __kvm_guest_exit();
+       guest_exit_irqoff();
 
        local_irq_enable();
 
index 5cc2e7a..b379146 100644 (file)
@@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        advance = 0;
                        printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
                               "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
-                       kvmppc_core_queue_program(vcpu, 0);
                }
        }
 
index 6249cdc..ed38f81 100644 (file)
@@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
        return 0;
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
 {
        int r = -EINVAL;
index 02416fe..6ce40dd 100644 (file)
@@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
                        continue;
                }
 
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                return 1;
        }
 
@@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = 1;
                break;
 #endif
+       case KVM_CAP_PPC_HTM:
+               r = cpu_has_feature(CPU_FTR_TM_COMP) &&
+                   is_kvmppc_hv_enabled(kvm);
+               break;
        default:
                r = 0;
                break;
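
The KVM_CAP_PPC_HTM capability added above is reported through the regular
KVM_CHECK_EXTENSION ioctl.  As a rough userspace sketch (not part of this
series; it assumes a <linux/kvm.h> new enough to define KVM_CAP_PPC_HTM), the
capability can be probed on a VM file descriptor, since its value depends on
the VM running in HV mode; hosts without HV KVM or TM support simply report 0:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* Create a default VM; the capability is per VM (HV mode only). */
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);

	if (vm < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}
	printf("KVM_CAP_PPC_HTM: %d\n",
	       ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HTM));
	return 0;
}
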
index cf928bb..3d29d40 100644 (file)
@@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1);                                                \
        OPAL_BRANCH(opal_tracepoint_entry) \
        mfcr    r12;                    \
        stw     r12,8(r1);              \
-       std     r1,PACAR1(r13);         \
        li      r11,0;                  \
        mfmsr   r12;                    \
        ori     r11,r11,MSR_EE;         \
@@ -127,7 +126,6 @@ opal_tracepoint_entry:
        mfcr    r12
        std     r11,16(r1)
        stw     r12,8(r1)
-       std     r1,PACAR1(r13)
        li      r11,0
        mfmsr   r12
        ori     r11,r11,MSR_EE
index 13723c3..33ba697 100644 (file)
@@ -33,10 +33,10 @@ quiet_cmd_sizes = GEN $@
 $(obj)/sizes.h: vmlinux
        $(call if_changed,sizes)
 
-AFLAGS_head.o += -I$(obj)
+AFLAGS_head.o += -I$(objtree)/$(obj)
 $(obj)/head.o: $(obj)/sizes.h
 
-CFLAGS_misc.o += -I$(obj)
+CFLAGS_misc.o += -I$(objtree)/$(obj)
 $(obj)/misc.o: $(obj)/sizes.h
 
 OBJCOPYFLAGS_vmlinux.bin :=  -R .comment -S
index 67d43a0..28f03ca 100644 (file)
 #include <asm/ebcdic.h>
 #include "hypfs.h"
 
-#define LPAR_NAME_LEN 8                /* lpar name len in diag 204 data */
-#define CPU_NAME_LEN 16                /* type name len of cpus in diag224 name table */
 #define TMP_SIZE 64            /* size of temporary buffers */
 
 #define DBFS_D204_HDR_VERSION  0
 
-/* diag 204 subcodes */
-enum diag204_sc {
-       SUBC_STIB4 = 4,
-       SUBC_RSI = 5,
-       SUBC_STIB6 = 6,
-       SUBC_STIB7 = 7
-};
-
-/* The two available diag 204 data formats */
-enum diag204_format {
-       INFO_SIMPLE = 0,
-       INFO_EXT = 0x00010000
-};
-
-/* bit is set in flags, when physical cpu info is included in diag 204 data */
-#define LPAR_PHYS_FLG  0x80
-
 static char *diag224_cpu_names;                        /* diag 224 name table */
 static enum diag204_sc diag204_store_sc;       /* used subcode for store */
 static enum diag204_format diag204_info_type;  /* used diag 204 data format */
@@ -53,7 +34,7 @@ static int diag204_buf_pages;         /* number of pages for diag204 data */
 static struct dentry *dbfs_d204_file;
 
 /*
- * DIAG 204 data structures and member access functions.
+ * DIAG 204 member access functions.
  *
  * Since we have two different diag 204 data formats for old and new s390
  * machines, we do not access the structs directly, but use getter functions for
@@ -62,304 +43,173 @@ static struct dentry *dbfs_d204_file;
 
 /* Time information block */
 
-struct info_blk_hdr {
-       __u8  npar;
-       __u8  flags;
-       __u16 tslice;
-       __u16 phys_cpus;
-       __u16 this_part;
-       __u64 curtod;
-} __attribute__ ((packed));
-
-struct x_info_blk_hdr {
-       __u8  npar;
-       __u8  flags;
-       __u16 tslice;
-       __u16 phys_cpus;
-       __u16 this_part;
-       __u64 curtod1;
-       __u64 curtod2;
-       char reserved[40];
-} __attribute__ ((packed));
-
 static inline int info_blk_hdr__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct info_blk_hdr);
-       else /* INFO_EXT */
-               return sizeof(struct x_info_blk_hdr);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_info_blk_hdr);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_info_blk_hdr);
 }
 
 static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct info_blk_hdr *)hdr)->npar;
-       else /* INFO_EXT */
-               return ((struct x_info_blk_hdr *)hdr)->npar;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_info_blk_hdr *)hdr)->npar;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
 }
 
 static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct info_blk_hdr *)hdr)->flags;
-       else /* INFO_EXT */
-               return ((struct x_info_blk_hdr *)hdr)->flags;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_info_blk_hdr *)hdr)->flags;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
 }
 
 static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct info_blk_hdr *)hdr)->phys_cpus;
-       else /* INFO_EXT */
-               return ((struct x_info_blk_hdr *)hdr)->phys_cpus;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus;
 }
 
 /* Partition header */
 
-struct part_hdr {
-       __u8 pn;
-       __u8 cpus;
-       char reserved[6];
-       char part_name[LPAR_NAME_LEN];
-} __attribute__ ((packed));
-
-struct x_part_hdr {
-       __u8  pn;
-       __u8  cpus;
-       __u8  rcpus;
-       __u8  pflag;
-       __u32 mlu;
-       char  part_name[LPAR_NAME_LEN];
-       char  lpc_name[8];
-       char  os_name[8];
-       __u64 online_cs;
-       __u64 online_es;
-       __u8  upid;
-       char  reserved1[3];
-       __u32 group_mlu;
-       char  group_name[8];
-       char  reserved2[32];
-} __attribute__ ((packed));
-
 static inline int part_hdr__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct part_hdr);
-       else /* INFO_EXT */
-               return sizeof(struct x_part_hdr);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_part_hdr);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_part_hdr);
 }
 
 static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct part_hdr *)hdr)->cpus;
-       else /* INFO_EXT */
-               return ((struct x_part_hdr *)hdr)->rcpus;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_part_hdr *)hdr)->cpus;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_part_hdr *)hdr)->rcpus;
 }
 
 static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
                                       char *name)
 {
-       if (type == INFO_SIMPLE)
-               memcpy(name, ((struct part_hdr *)hdr)->part_name,
-                      LPAR_NAME_LEN);
-       else /* INFO_EXT */
-               memcpy(name, ((struct x_part_hdr *)hdr)->part_name,
-                      LPAR_NAME_LEN);
-       EBCASC(name, LPAR_NAME_LEN);
-       name[LPAR_NAME_LEN] = 0;
+       if (type == DIAG204_INFO_SIMPLE)
+               memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
+                      DIAG204_LPAR_NAME_LEN);
+       else /* DIAG204_INFO_EXT */
+               memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
+                      DIAG204_LPAR_NAME_LEN);
+       EBCASC(name, DIAG204_LPAR_NAME_LEN);
+       name[DIAG204_LPAR_NAME_LEN] = 0;
        strim(name);
 }
 
-struct cpu_info {
-       __u16 cpu_addr;
-       char  reserved1[2];
-       __u8  ctidx;
-       __u8  cflag;
-       __u16 weight;
-       __u64 acc_time;
-       __u64 lp_time;
-} __attribute__ ((packed));
-
-struct x_cpu_info {
-       __u16 cpu_addr;
-       char  reserved1[2];
-       __u8  ctidx;
-       __u8  cflag;
-       __u16 weight;
-       __u64 acc_time;
-       __u64 lp_time;
-       __u16 min_weight;
-       __u16 cur_weight;
-       __u16 max_weight;
-       char  reseved2[2];
-       __u64 online_time;
-       __u64 wait_time;
-       __u32 pma_weight;
-       __u32 polar_weight;
-       char  reserved3[40];
-} __attribute__ ((packed));
-
 /* CPU info block */
 
 static inline int cpu_info__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct cpu_info);
-       else /* INFO_EXT */
-               return sizeof(struct x_cpu_info);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_cpu_info);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_cpu_info);
 }
 
 static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct cpu_info *)hdr)->ctidx;
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->ctidx;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_cpu_info *)hdr)->ctidx;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->ctidx;
 }
 
 static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct cpu_info *)hdr)->cpu_addr;
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->cpu_addr;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_cpu_info *)hdr)->cpu_addr;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
 }
 
 static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct cpu_info *)hdr)->acc_time;
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->acc_time;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_cpu_info *)hdr)->acc_time;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->acc_time;
 }
 
 static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct cpu_info *)hdr)->lp_time;
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->lp_time;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_cpu_info *)hdr)->lp_time;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->lp_time;
 }
 
 static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
+       if (type == DIAG204_INFO_SIMPLE)
                return 0;       /* online_time not available in simple info */
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->online_time;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->online_time;
 }
 
 /* Physical header */
 
-struct phys_hdr {
-       char reserved1[1];
-       __u8 cpus;
-       char reserved2[6];
-       char mgm_name[8];
-} __attribute__ ((packed));
-
-struct x_phys_hdr {
-       char reserved1[1];
-       __u8 cpus;
-       char reserved2[6];
-       char mgm_name[8];
-       char reserved3[80];
-} __attribute__ ((packed));
-
 static inline int phys_hdr__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct phys_hdr);
-       else /* INFO_EXT */
-               return sizeof(struct x_phys_hdr);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_phys_hdr);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_phys_hdr);
 }
 
 static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct phys_hdr *)hdr)->cpus;
-       else /* INFO_EXT */
-               return ((struct x_phys_hdr *)hdr)->cpus;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_phys_hdr *)hdr)->cpus;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_phys_hdr *)hdr)->cpus;
 }
 
 /* Physical CPU info block */
 
-struct phys_cpu {
-       __u16 cpu_addr;
-       char  reserved1[2];
-       __u8  ctidx;
-       char  reserved2[3];
-       __u64 mgm_time;
-       char  reserved3[8];
-} __attribute__ ((packed));
-
-struct x_phys_cpu {
-       __u16 cpu_addr;
-       char  reserved1[2];
-       __u8  ctidx;
-       char  reserved2[3];
-       __u64 mgm_time;
-       char  reserved3[80];
-} __attribute__ ((packed));
-
 static inline int phys_cpu__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct phys_cpu);
-       else /* INFO_EXT */
-               return sizeof(struct x_phys_cpu);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_phys_cpu);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_phys_cpu);
 }
 
 static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct phys_cpu *)hdr)->cpu_addr;
-       else /* INFO_EXT */
-               return ((struct x_phys_cpu *)hdr)->cpu_addr;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
 }
 
 static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct phys_cpu *)hdr)->mgm_time;
-       else /* INFO_EXT */
-               return ((struct x_phys_cpu *)hdr)->mgm_time;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_phys_cpu *)hdr)->mgm_time;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
 }
 
 static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct phys_cpu *)hdr)->ctidx;
-       else /* INFO_EXT */
-               return ((struct x_phys_cpu *)hdr)->ctidx;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_phys_cpu *)hdr)->ctidx;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
 }
 
 /* Diagnose 204 functions */
-
-static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
-{
-       register unsigned long _subcode asm("0") = *subcode;
-       register unsigned long _size asm("1") = size;
-
-       asm volatile(
-               "       diag    %2,%0,0x204\n"
-               "0:     nopr    %%r7\n"
-               EX_TABLE(0b,0b)
-               : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
-       *subcode = _subcode;
-       return _size;
-}
-
-static int diag204(unsigned long subcode, unsigned long size, void *addr)
-{
-       diag_stat_inc(DIAG_STAT_X204);
-       size = __diag204(&subcode, size, addr);
-       if (subcode)
-               return -1;
-       return size;
-}
-
 /*
  * For the old diag subcode 4 with simple data format we have to use real
  * memory. If we use subcode 6 or 7 with extended data format, we can (and
@@ -411,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
                *pages = diag204_buf_pages;
                return diag204_buf;
        }
-       if (fmt == INFO_SIMPLE) {
+       if (fmt == DIAG204_INFO_SIMPLE) {
                *pages = 1;
                return diag204_alloc_rbuf();
-       } else {/* INFO_EXT */
-               *pages = diag204((unsigned long)SUBC_RSI |
-                                (unsigned long)INFO_EXT, 0, NULL);
+       } else {/* DIAG204_INFO_EXT */
+               *pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+                                (unsigned long)DIAG204_INFO_EXT, 0, NULL);
                if (*pages <= 0)
                        return ERR_PTR(-ENOSYS);
                else
@@ -443,18 +293,18 @@ static int diag204_probe(void)
        void *buf;
        int pages, rc;
 
-       buf = diag204_get_buffer(INFO_EXT, &pages);
+       buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages);
        if (!IS_ERR(buf)) {
-               if (diag204((unsigned long)SUBC_STIB7 |
-                           (unsigned long)INFO_EXT, pages, buf) >= 0) {
-                       diag204_store_sc = SUBC_STIB7;
-                       diag204_info_type = INFO_EXT;
+               if (diag204((unsigned long)DIAG204_SUBC_STIB7 |
+                           (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+                       diag204_store_sc = DIAG204_SUBC_STIB7;
+                       diag204_info_type = DIAG204_INFO_EXT;
                        goto out;
                }
-               if (diag204((unsigned long)SUBC_STIB6 |
-                           (unsigned long)INFO_EXT, pages, buf) >= 0) {
-                       diag204_store_sc = SUBC_STIB6;
-                       diag204_info_type = INFO_EXT;
+               if (diag204((unsigned long)DIAG204_SUBC_STIB6 |
+                           (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+                       diag204_store_sc = DIAG204_SUBC_STIB6;
+                       diag204_info_type = DIAG204_INFO_EXT;
                        goto out;
                }
                diag204_free_buffer();
@@ -462,15 +312,15 @@ static int diag204_probe(void)
 
        /* subcodes 6 and 7 failed, now try subcode 4 */
 
-       buf = diag204_get_buffer(INFO_SIMPLE, &pages);
+       buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages);
        if (IS_ERR(buf)) {
                rc = PTR_ERR(buf);
                goto fail_alloc;
        }
-       if (diag204((unsigned long)SUBC_STIB4 |
-                   (unsigned long)INFO_SIMPLE, pages, buf) >= 0) {
-               diag204_store_sc = SUBC_STIB4;
-               diag204_info_type = INFO_SIMPLE;
+       if (diag204((unsigned long)DIAG204_SUBC_STIB4 |
+                   (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) {
+               diag204_store_sc = DIAG204_SUBC_STIB4;
+               diag204_info_type = DIAG204_INFO_SIMPLE;
                goto out;
        } else {
                rc = -ENOSYS;
@@ -510,20 +360,6 @@ out:
 
 /* Diagnose 224 functions */
 
-static int diag224(void *ptr)
-{
-       int rc = -EOPNOTSUPP;
-
-       diag_stat_inc(DIAG_STAT_X224);
-       asm volatile(
-               "       diag    %1,%2,0x224\n"
-               "0:     lhi     %0,0x0\n"
-               "1:\n"
-               EX_TABLE(0b,1b)
-               : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
-       return rc;
-}
-
 static int diag224_get_name_table(void)
 {
        /* memory must be below 2GB */
@@ -545,9 +381,9 @@ static void diag224_delete_name_table(void)
 
 static int diag224_idx2name(int index, char *name)
 {
-       memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN),
-               CPU_NAME_LEN);
-       name[CPU_NAME_LEN] = 0;
+       memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
+              DIAG204_CPU_NAME_LEN);
+       name[DIAG204_CPU_NAME_LEN] = 0;
        strim(name);
        return 0;
 }
@@ -603,7 +439,7 @@ __init int hypfs_diag_init(void)
                pr_err("The hardware system does not support hypfs\n");
                return -ENODATA;
        }
-       if (diag204_info_type == INFO_EXT) {
+       if (diag204_info_type == DIAG204_INFO_EXT) {
                rc = hypfs_dbfs_create_file(&dbfs_file_d204);
                if (rc)
                        return rc;
@@ -651,7 +487,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
                              cpu_info__lp_time(diag204_info_type, cpu_info));
        if (IS_ERR(rc))
                return PTR_ERR(rc);
-       if (diag204_info_type == INFO_EXT) {
+       if (diag204_info_type == DIAG204_INFO_EXT) {
                rc = hypfs_create_u64(cpu_dir, "onlinetime",
                                      cpu_info__online_time(diag204_info_type,
                                                            cpu_info));
@@ -667,12 +503,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
 {
        struct dentry *cpus_dir;
        struct dentry *lpar_dir;
-       char lpar_name[LPAR_NAME_LEN + 1];
+       char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
        void *cpu_info;
        int i;
 
        part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
-       lpar_name[LPAR_NAME_LEN] = 0;
+       lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
        lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
        if (IS_ERR(lpar_dir))
                return lpar_dir;
@@ -755,7 +591,8 @@ int hypfs_diag_create_files(struct dentry *root)
                        goto err_out;
                }
        }
-       if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) {
+       if (info_blk_hdr__flags(diag204_info_type, time_hdr) &
+           DIAG204_LPAR_PHYS_FLG) {
                ptr = hypfs_create_phys_files(root, part_hdr);
                if (IS_ERR(ptr)) {
                        rc = PTR_ERR(ptr);
index 1a82cf2..d28621d 100644 (file)
@@ -20,6 +20,9 @@
 #define CPACF_KMC              0xb92f          /* MSA  */
 #define CPACF_KIMD             0xb93e          /* MSA  */
 #define CPACF_KLMD             0xb93f          /* MSA  */
+#define CPACF_PCKMO            0xb928          /* MSA3 */
+#define CPACF_KMF              0xb92a          /* MSA4 */
+#define CPACF_KMO              0xb92b          /* MSA4 */
 #define CPACF_PCC              0xb92c          /* MSA4 */
 #define CPACF_KMCTR            0xb92d          /* MSA4 */
 #define CPACF_PPNO             0xb93c          /* MSA5 */
@@ -136,6 +139,7 @@ static inline void __cpacf_query(unsigned int opcode, unsigned char *status)
        register unsigned long r1 asm("1") = (unsigned long) status;
 
        asm volatile(
+               "       spm 0\n" /* pckmo doesn't change the cc */
                /* Parameter registers are ignored, but may not be 0 */
                "0:     .insn   rrf,%[opc] << 16,2,2,2,0\n"
                "       brc     1,0b\n" /* handle partial completion */
@@ -157,6 +161,12 @@ static inline int cpacf_query(unsigned int opcode, unsigned int func)
                if (!test_facility(17)) /* check for MSA */
                        return 0;
                break;
+       case CPACF_PCKMO:
+               if (!test_facility(76)) /* check for MSA3 */
+                       return 0;
+               break;
+       case CPACF_KMF:
+       case CPACF_KMO:
        case CPACF_PCC:
        case CPACF_KMCTR:
                if (!test_facility(77)) /* check for MSA4 */
index 86cae09..8acf482 100644 (file)
@@ -78,4 +78,153 @@ struct diag210 {
 
 extern int diag210(struct diag210 *addr);
 
+/* bit is set in flags when physical cpu info is included in diag 204 data */
+#define DIAG204_LPAR_PHYS_FLG 0x80
+#define DIAG204_LPAR_NAME_LEN 8                /* lpar name len in diag 204 data */
+#define DIAG204_CPU_NAME_LEN 16                /* type name len of cpus in diag224 name table */
+
+/* diag 204 subcodes */
+enum diag204_sc {
+       DIAG204_SUBC_STIB4 = 4,
+       DIAG204_SUBC_RSI = 5,
+       DIAG204_SUBC_STIB6 = 6,
+       DIAG204_SUBC_STIB7 = 7
+};
+
+/* The two available diag 204 data formats */
+enum diag204_format {
+       DIAG204_INFO_SIMPLE = 0,
+       DIAG204_INFO_EXT = 0x00010000
+};
+
+enum diag204_cpu_flags {
+       DIAG204_CPU_ONLINE = 0x20,
+       DIAG204_CPU_CAPPED = 0x40,
+};
+
+struct diag204_info_blk_hdr {
+       __u8  npar;
+       __u8  flags;
+       __u16 tslice;
+       __u16 phys_cpus;
+       __u16 this_part;
+       __u64 curtod;
+} __packed;
+
+struct diag204_x_info_blk_hdr {
+       __u8  npar;
+       __u8  flags;
+       __u16 tslice;
+       __u16 phys_cpus;
+       __u16 this_part;
+       __u64 curtod1;
+       __u64 curtod2;
+       char reserved[40];
+} __packed;
+
+struct diag204_part_hdr {
+       __u8 pn;
+       __u8 cpus;
+       char reserved[6];
+       char part_name[DIAG204_LPAR_NAME_LEN];
+} __packed;
+
+struct diag204_x_part_hdr {
+       __u8  pn;
+       __u8  cpus;
+       __u8  rcpus;
+       __u8  pflag;
+       __u32 mlu;
+       char  part_name[DIAG204_LPAR_NAME_LEN];
+       char  lpc_name[8];
+       char  os_name[8];
+       __u64 online_cs;
+       __u64 online_es;
+       __u8  upid;
+       __u8  reserved:3;
+       __u8  mtid:5;
+       char  reserved1[2];
+       __u32 group_mlu;
+       char  group_name[8];
+       char  hardware_group_name[8];
+       char  reserved2[24];
+} __packed;
+
+struct diag204_cpu_info {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       __u8  cflag;
+       __u16 weight;
+       __u64 acc_time;
+       __u64 lp_time;
+} __packed;
+
+struct diag204_x_cpu_info {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       __u8  cflag;
+       __u16 weight;
+       __u64 acc_time;
+       __u64 lp_time;
+       __u16 min_weight;
+       __u16 cur_weight;
+       __u16 max_weight;
+       char  reseved2[2];
+       __u64 online_time;
+       __u64 wait_time;
+       __u32 pma_weight;
+       __u32 polar_weight;
+       __u32 cpu_type_cap;
+       __u32 group_cpu_type_cap;
+       char  reserved3[32];
+} __packed;
+
+struct diag204_phys_hdr {
+       char reserved1[1];
+       __u8 cpus;
+       char reserved2[6];
+       char mgm_name[8];
+} __packed;
+
+struct diag204_x_phys_hdr {
+       char reserved1[1];
+       __u8 cpus;
+       char reserved2[6];
+       char mgm_name[8];
+       char reserved3[80];
+} __packed;
+
+struct diag204_phys_cpu {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       char  reserved2[3];
+       __u64 mgm_time;
+       char  reserved3[8];
+} __packed;
+
+struct diag204_x_phys_cpu {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       char  reserved2[1];
+       __u16 weight;
+       __u64 mgm_time;
+       char  reserved3[80];
+} __packed;
+
+struct diag204_x_part_block {
+       struct diag204_x_part_hdr hdr;
+       struct diag204_x_cpu_info cpus[];
+} __packed;
+
+struct diag204_x_phys_block {
+       struct diag204_x_phys_hdr hdr;
+       struct diag204_x_phys_cpu cpus[];
+} __packed;
+
+int diag204(unsigned long subcode, unsigned long size, void *addr);
+int diag224(void *ptr);
 #endif /* _ASM_S390_DIAG_H */
index d054c1b..741ddba 100644 (file)
 
 /**
  * struct gmap_struct - guest address space
+ * @list: list head for the mm->context gmap list
  * @crst_list: list of all crst tables used in the guest address space
  * @mm: pointer to the parent mm_struct
  * @guest_to_host: radix tree with guest to host address translation
  * @host_to_guest: radix tree with pointer to segment table entries
  * @guest_table_lock: spinlock to protect all entries in the guest page table
+ * @ref_count: reference counter for the gmap structure
  * @table: pointer to the page directory
  * @asce: address space control element for gmap page table
  * @pfault_enabled: defines if pfaults are applicable for the guest
+ * @host_to_rmap: radix tree with gmap_rmap lists
+ * @children: list of shadow gmap structures
+ * @pt_list: list of all page tables used in the shadow guest address space
+ * @shadow_lock: spinlock to protect the shadow gmap list
+ * @parent: pointer to the parent gmap for shadow guest address spaces
+ * @orig_asce: ASCE for which the shadow page table has been created
+ * @edat_level: edat level to be used for the shadow translation
+ * @removed: flag to indicate if a shadow guest address space has been removed
+ * @initialized: flag to indicate if a shadow guest address space can be used
  */
 struct gmap {
        struct list_head list;
@@ -26,26 +37,64 @@ struct gmap {
        struct radix_tree_root guest_to_host;
        struct radix_tree_root host_to_guest;
        spinlock_t guest_table_lock;
+       atomic_t ref_count;
        unsigned long *table;
        unsigned long asce;
        unsigned long asce_end;
        void *private;
        bool pfault_enabled;
+       /* Additional data for shadow guest address spaces */
+       struct radix_tree_root host_to_rmap;
+       struct list_head children;
+       struct list_head pt_list;
+       spinlock_t shadow_lock;
+       struct gmap *parent;
+       unsigned long orig_asce;
+       int edat_level;
+       bool removed;
+       bool initialized;
 };
 
+/**
+ * struct gmap_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @raddr: virtual rmap address in the shadow guest address space
+ */
+struct gmap_rmap {
+       struct gmap_rmap *next;
+       unsigned long raddr;
+};
+
+#define gmap_for_each_rmap(pos, head) \
+       for (pos = (head); pos; pos = pos->next)
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+       for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
 /**
  * struct gmap_notifier - notify function block for page invalidation
  * @notifier_call: address of callback function
  */
 struct gmap_notifier {
        struct list_head list;
-       void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
+       struct rcu_head rcu;
+       void (*notifier_call)(struct gmap *gmap, unsigned long start,
+                             unsigned long end);
 };
 
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
-void gmap_free(struct gmap *gmap);
+static inline int gmap_is_shadow(struct gmap *gmap)
+{
+       return !!gmap->parent;
+}
+
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
+void gmap_remove(struct gmap *gmap);
+struct gmap *gmap_get(struct gmap *gmap);
+void gmap_put(struct gmap *gmap);
+
 void gmap_enable(struct gmap *gmap);
 void gmap_disable(struct gmap *gmap);
+struct gmap *gmap_get_enabled(void);
 int gmap_map_segment(struct gmap *gmap, unsigned long from,
                     unsigned long to, unsigned long len);
 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
@@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
 void __gmap_zap(struct gmap *, unsigned long gaddr);
 void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
 
-void gmap_register_ipte_notifier(struct gmap_notifier *);
-void gmap_unregister_ipte_notifier(struct gmap_notifier *);
-int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
+
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+                        int edat_level);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+                   int fake);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+                   int fake);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+                   int fake);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+                   int fake);
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+                          unsigned long *pgt, int *dat_protection, int *fake);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
+
+void gmap_register_pte_notifier(struct gmap_notifier *);
+void gmap_unregister_pte_notifier(struct gmap_notifier *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
+                    unsigned long bits);
+
+int gmap_mprotect_notify(struct gmap *, unsigned long start,
+                        unsigned long len, int prot);
 
 #endif /* _ASM_S390_GMAP_H */
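
As a rough illustration of the gmap_for_each_rmap_safe() iterator added above
(not kernel code; the struct and macro are repeated only so the snippet builds
standalone, and the list is assembled with malloc purely for demonstration),
the "safe" variant latches the next pointer before the body runs, so entries
may be freed while walking:

#include <stdio.h>
#include <stdlib.h>

struct gmap_rmap {
	struct gmap_rmap *next;
	unsigned long raddr;
};

#define gmap_for_each_rmap_safe(pos, n, head) \
	for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)

int main(void)
{
	struct gmap_rmap *head = NULL, *rmap, *rnext;

	/* Build a small three-entry chain. */
	for (unsigned long r = 0; r < 3; r++) {
		rmap = malloc(sizeof(*rmap));
		if (!rmap)
			return 1;
		rmap->raddr = r << 12;
		rmap->next = head;
		head = rmap;
	}

	/* The next pointer is latched before the body, so freeing is safe. */
	gmap_for_each_rmap_safe(rmap, rnext, head) {
		printf("raddr=%#lx\n", rmap->raddr);
		free(rmap);
	}
	return 0;
}

The plain gmap_for_each_rmap() form walks the same chain but reads the current
entry's next pointer after the body, so it must not free or unlink that entry.
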
index ac82e8e..8e5daf7 100644 (file)
@@ -43,6 +43,7 @@
 /* s390-specific vcpu->requests bit members */
 #define KVM_REQ_ENABLE_IBS         8
 #define KVM_REQ_DISABLE_IBS        9
+#define KVM_REQ_ICPT_OPEREXC       10
 
 #define SIGP_CTRL_C            0x80
 #define SIGP_CTRL_SCN_MASK     0x3f
@@ -145,7 +146,7 @@ struct kvm_s390_sie_block {
        __u64   cputm;                  /* 0x0028 */
        __u64   ckc;                    /* 0x0030 */
        __u64   epoch;                  /* 0x0038 */
-       __u8    reserved40[4];          /* 0x0040 */
+       __u32   svcc;                   /* 0x0040 */
 #define LCTL_CR0       0x8000
 #define LCTL_CR6       0x0200
 #define LCTL_CR9       0x0040
@@ -154,6 +155,7 @@ struct kvm_s390_sie_block {
 #define LCTL_CR14      0x0002
        __u16   lctl;                   /* 0x0044 */
        __s16   icpua;                  /* 0x0046 */
+#define ICTL_OPEREXC   0x80000000
 #define ICTL_PINT      0x20000000
 #define ICTL_LPSW      0x00400000
 #define ICTL_STCTL     0x00040000
@@ -166,6 +168,9 @@ struct kvm_s390_sie_block {
 #define ICPT_INST      0x04
 #define ICPT_PROGI     0x08
 #define ICPT_INSTPROGI 0x0C
+#define ICPT_EXTINT    0x14
+#define ICPT_VALIDITY  0x20
+#define ICPT_STOP      0x28
 #define ICPT_OPEREXC   0x2C
 #define ICPT_PARTEXEC  0x38
 #define ICPT_IOINST    0x40
@@ -185,7 +190,9 @@ struct kvm_s390_sie_block {
        __u32   scaol;                  /* 0x0064 */
        __u8    reserved68[4];          /* 0x0068 */
        __u32   todpr;                  /* 0x006c */
-       __u8    reserved70[32];         /* 0x0070 */
+       __u8    reserved70[16];         /* 0x0070 */
+       __u64   mso;                    /* 0x0080 */
+       __u64   msl;                    /* 0x0088 */
        psw_t   gpsw;                   /* 0x0090 */
        __u64   gg14;                   /* 0x00a0 */
        __u64   gg15;                   /* 0x00a8 */
@@ -223,7 +230,7 @@ struct kvm_s390_sie_block {
        __u8    reserved1e6[2];         /* 0x01e6 */
        __u64   itdba;                  /* 0x01e8 */
        __u64   riccbd;                 /* 0x01f0 */
-       __u8    reserved1f8[8];         /* 0x01f8 */
+       __u64   gvrd;                   /* 0x01f8 */
 } __attribute__((packed));
 
 struct kvm_s390_itdb {
@@ -256,6 +263,7 @@ struct kvm_vcpu_stat {
        u32 instruction_stctg;
        u32 exit_program_interruption;
        u32 exit_instr_and_program;
+       u32 exit_operation_exception;
        u32 deliver_external_call;
        u32 deliver_emergency_signal;
        u32 deliver_service_signal;
@@ -278,7 +286,9 @@ struct kvm_vcpu_stat {
        u32 instruction_stsi;
        u32 instruction_stfl;
        u32 instruction_tprot;
+       u32 instruction_sie;
        u32 instruction_essa;
+       u32 instruction_sthyi;
        u32 instruction_sigp_sense;
        u32 instruction_sigp_sense_running;
        u32 instruction_sigp_external_call;
@@ -541,12 +551,16 @@ struct kvm_guestdbg_info_arch {
 
 struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
+       /* if vsie is active, currently executed shadow sie control block */
+       struct kvm_s390_sie_block *vsie_block;
        unsigned int      host_acrs[NUM_ACRS];
        struct fpu        host_fpregs;
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
        struct gmap *gmap;
+       /* backup location for the currently enabled gmap when scheduled out */
+       struct gmap *enabled_gmap;
        struct kvm_guestdbg_info_arch guestdbg;
        unsigned long pfault_token;
        unsigned long pfault_select;
@@ -631,6 +645,14 @@ struct sie_page2 {
        u8 reserved900[0x1000 - 0x900];                 /* 0x0900 */
 } __packed;
 
+struct kvm_s390_vsie {
+       struct mutex mutex;
+       struct radix_tree_root addr_to_page;
+       int page_count;
+       int next;
+       struct page *pages[KVM_MAX_VCPUS];
+};
+
 struct kvm_arch{
        void *sca;
        int use_esca;
@@ -646,15 +668,20 @@ struct kvm_arch{
        int user_cpu_state_ctrl;
        int user_sigp;
        int user_stsi;
+       int user_instr0;
        struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
        wait_queue_head_t ipte_wq;
        int ipte_lock_count;
        struct mutex ipte_mutex;
+       struct ratelimit_state sthyi_limit;
        spinlock_t start_stop_lock;
        struct sie_page2 *sie_page2;
        struct kvm_s390_cpu_model model;
        struct kvm_s390_crypto crypto;
+       struct kvm_s390_vsie vsie;
        u64 epoch;
+       /* subset of available cpu features enabled by user space */
+       DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
 };
 
 #define KVM_HVA_ERR_BAD                (-1UL)
index 1822643..6d39329 100644 (file)
@@ -8,8 +8,9 @@ typedef struct {
        cpumask_t cpu_attach_mask;
        atomic_t flush_count;
        unsigned int flush_mm;
-       spinlock_t list_lock;
+       spinlock_t pgtable_lock;
        struct list_head pgtable_list;
+       spinlock_t gmap_lock;
        struct list_head gmap_list;
        unsigned long asce;
        unsigned long asce_limit;
@@ -22,9 +23,11 @@ typedef struct {
        unsigned int use_skey:1;
 } mm_context_t;
 
-#define INIT_MM_CONTEXT(name)                                                \
-       .context.list_lock    = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
-       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list),    \
+#define INIT_MM_CONTEXT(name)                                             \
+       .context.pgtable_lock =                                            \
+                       __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock),   \
+       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+       .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
        .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
 
 static inline int tprot(unsigned long addr)
index f77c638..c6a088c 100644 (file)
@@ -15,8 +15,9 @@
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
-       spin_lock_init(&mm->context.list_lock);
+       spin_lock_init(&mm->context.pgtable_lock);
        INIT_LIST_HEAD(&mm->context.pgtable_list);
+       spin_lock_init(&mm->context.gmap_lock);
        INIT_LIST_HEAD(&mm->context.gmap_list);
        cpumask_clear(&mm->context.cpu_attach_mask);
        atomic_set(&mm->context.flush_count, 0);
index b2146c4..69b8a41 100644 (file)
@@ -111,13 +111,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr)
 
 static inline int page_reset_referenced(unsigned long addr)
 {
-       unsigned int ipm;
+       int cc;
 
        asm volatile(
                "       rrbe    0,%1\n"
                "       ipm     %0\n"
-               : "=d" (ipm) : "a" (addr) : "cc");
-       return !!(ipm & 0x20000000);
+               "       srl     %0,28\n"
+               : "=d" (cc) : "a" (addr) : "cc");
+       return cc;
 }
 
 /* Bits int the storage key */
@@ -148,6 +149,8 @@ static inline int devmem_is_allowed(unsigned long pfn)
 #define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
 #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_virt(pfn)       __va((pfn) << PAGE_SHIFT)
+#define page_to_virt(page)     pfn_to_virt(page_to_pfn(page))
 
 #define VM_DATA_DEFAULT_FLAGS  (VM_READ | VM_WRITE | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
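
To see what the page_reset_referenced() change above buys, here is a plain-C
illustration (not from the patch): IPM leaves the condition code just below
the two zeroed top bits of the 32-bit result, so shifting right by 28 recovers
the full cc 0-3, whereas the old test of the 0x20000000 bit collapsed it to a
single "referenced" flag:

#include <stdio.h>

int main(void)
{
	for (unsigned int cc = 0; cc < 4; cc++) {
		/* Program-mask and leftover low bits omitted; the shift
		 * discards them anyway. */
		unsigned int ipm = cc << 28;
		int old_ret = !!(ipm & 0x20000000);
		int new_ret = ipm >> 28;

		printf("cc=%u  old=%d  new=%d\n", cc, old_ret, new_ret);
	}
	return 0;
}

For RRBE this also exposes the change-bit state, which the old boolean
discarded.
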
index da34cb6..f4eb984 100644 (file)
@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
 void crst_table_free(struct mm_struct *, unsigned long *);
 
 unsigned long *page_table_alloc(struct mm_struct *);
+struct page *page_table_alloc_pgste(struct mm_struct *mm);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+void page_table_free_pgste(struct page *page);
 extern int page_table_allocate_pgste;
 
 static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
index 48d383a..72c7f60 100644 (file)
@@ -277,6 +277,7 @@ static inline int is_module_addr(void *addr)
 /* Bits in the region table entry */
 #define _REGION_ENTRY_ORIGIN   ~0xfffUL/* region/segment table origin      */
 #define _REGION_ENTRY_PROTECT  0x200   /* region protection bit            */
+#define _REGION_ENTRY_OFFSET   0xc0    /* region table offset              */
 #define _REGION_ENTRY_INVALID  0x20    /* invalid region table entry       */
 #define _REGION_ENTRY_TYPE_MASK        0x0c    /* region/segment table type mask   */
 #define _REGION_ENTRY_TYPE_R1  0x0c    /* region first table type          */
@@ -364,6 +365,7 @@ static inline int is_module_addr(void *addr)
 #define PGSTE_GC_BIT   0x0002000000000000UL
 #define PGSTE_UC_BIT   0x0000800000000000UL    /* user dirty (migration) */
 #define PGSTE_IN_BIT   0x0000400000000000UL    /* IPTE notify bit */
+#define PGSTE_VSIE_BIT 0x0000200000000000UL    /* ref'd in a shadow table */
 
 /* Guest Page State used for virtualization */
 #define _PGSTE_GPS_ZERO                0x0000000080000000UL
@@ -1002,15 +1004,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry);
 void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr,
+                pte_t *ptep, unsigned long bits);
+int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+                   pte_t *ptep, int prot, unsigned long bit);
 void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep , int reset);
 void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte);
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
 
 bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
                          unsigned char key, bool nq);
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                              unsigned char key, unsigned char *oldkey,
+                              bool nq, bool mr, bool mc);
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                         unsigned char *key);
 
 /*
  * Certain architectures need to do special things when PTEs
index 0952920..0332317 100644 (file)
@@ -112,6 +112,8 @@ struct thread_struct {
         unsigned long ksp;              /* kernel stack pointer             */
        mm_segment_t mm_segment;
        unsigned long gmap_addr;        /* address of last gmap fault. */
+       unsigned int gmap_write_flag;   /* gmap fault write indication */
+       unsigned int gmap_int_code;     /* int code of last gmap fault */
        unsigned int gmap_pfault;       /* signal of a pending guest pfault */
        struct per_regs per_user;       /* User specified PER registers */
        struct per_event per_event;     /* Cause of the last PER trap */
index e4f6f73..2ad9c20 100644 (file)
@@ -32,12 +32,19 @@ struct sclp_core_entry {
        u8 reserved0;
        u8 : 4;
        u8 sief2 : 1;
-       u8 : 3;
-       u8 : 3;
+       u8 skey : 1;
+       u8 : 2;
+       u8 : 2;
+       u8 gpere : 1;
        u8 siif : 1;
        u8 sigpif : 1;
        u8 : 3;
-       u8 reserved2[10];
+       u8 reserved2[3];
+       u8 : 2;
+       u8 ib : 1;
+       u8 cei : 1;
+       u8 : 4;
+       u8 reserved3[6];
        u8 type;
        u8 reserved1;
 } __attribute__((packed));
@@ -59,6 +66,15 @@ struct sclp_info {
        unsigned char has_hvs : 1;
        unsigned char has_esca : 1;
        unsigned char has_sief2 : 1;
+       unsigned char has_64bscao : 1;
+       unsigned char has_gpere : 1;
+       unsigned char has_cmma : 1;
+       unsigned char has_gsls : 1;
+       unsigned char has_ib : 1;
+       unsigned char has_cei : 1;
+       unsigned char has_pfmfi : 1;
+       unsigned char has_ibs : 1;
+       unsigned char has_skey : 1;
        unsigned int ibc;
        unsigned int mtid;
        unsigned int mtid_cp;
@@ -101,5 +117,6 @@ int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
 int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
 void sclp_early_detect(void);
 void _sclp_print_early(const char *);
+void sclp_ocf_cpc_name_copy(char *dst);
 
 #endif /* _ASM_S390_SCLP_H */
index 3b8e99e..a2ffec4 100644 (file)
@@ -93,6 +93,47 @@ struct kvm_s390_vm_cpu_machine {
        __u64 fac_list[256];
 };
 
+#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2
+#define KVM_S390_VM_CPU_MACHINE_FEAT   3
+
+#define KVM_S390_VM_CPU_FEAT_NR_BITS   1024
+#define KVM_S390_VM_CPU_FEAT_ESOP      0
+#define KVM_S390_VM_CPU_FEAT_SIEF2     1
+#define KVM_S390_VM_CPU_FEAT_64BSCAO   2
+#define KVM_S390_VM_CPU_FEAT_SIIF      3
+#define KVM_S390_VM_CPU_FEAT_GPERE     4
+#define KVM_S390_VM_CPU_FEAT_GSLS      5
+#define KVM_S390_VM_CPU_FEAT_IB                6
+#define KVM_S390_VM_CPU_FEAT_CEI       7
+#define KVM_S390_VM_CPU_FEAT_IBS       8
+#define KVM_S390_VM_CPU_FEAT_SKEY      9
+#define KVM_S390_VM_CPU_FEAT_CMMA      10
+#define KVM_S390_VM_CPU_FEAT_PFMFI     11
+#define KVM_S390_VM_CPU_FEAT_SIGPIF    12
+struct kvm_s390_vm_cpu_feat {
+       __u64 feat[16];
+};
+
+#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC      4
+#define KVM_S390_VM_CPU_MACHINE_SUBFUNC                5
+/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */
+struct kvm_s390_vm_cpu_subfunc {
+       __u8 plo[32];           /* always */
+       __u8 ptff[16];          /* with TOD-clock steering */
+       __u8 kmac[16];          /* with MSA */
+       __u8 kmc[16];           /* with MSA */
+       __u8 km[16];            /* with MSA */
+       __u8 kimd[16];          /* with MSA */
+       __u8 klmd[16];          /* with MSA */
+       __u8 pckmo[16];         /* with MSA3 */
+       __u8 kmctr[16];         /* with MSA4 */
+       __u8 kmf[16];           /* with MSA4 */
+       __u8 kmo[16];           /* with MSA4 */
+       __u8 pcc[16];           /* with MSA4 */
+       __u8 ppno[16];          /* with MSA5 */
+       __u8 reserved[1824];
+};
+
 /* kvm attributes for crypto */
 #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW       0
 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW       1
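
The comment in kvm_s390_vm_cpu_subfunc above refers to "MSB 0" bit ordering
for the test-bit style masks.  A tiny standalone helper (illustration only,
not kernel code) spells out that convention, where bit 0 is the most
significant bit of byte 0:

#include <stdio.h>

/* Bit 'nr' of an MSB 0 numbered bit string: bit 0 is the top bit of byte 0. */
static int msb0_test_bit(const unsigned char *block, unsigned int nr)
{
	return (block[nr >> 3] >> (7 - (nr & 7))) & 1;
}

int main(void)
{
	unsigned char plo[32] = { 0xc0 };	/* bits 0 and 1 set */

	printf("bit 0: %d, bit 1: %d, bit 2: %d\n",
	       msb0_test_bit(plo, 0), msb0_test_bit(plo, 1),
	       msb0_test_bit(plo, 2));
	return 0;
}
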
index 8fb5d4a..3ac6343 100644 (file)
        exit_code_ipa0(0xB2, 0x4c, "TAR"),      \
        exit_code_ipa0(0xB2, 0x50, "CSP"),      \
        exit_code_ipa0(0xB2, 0x54, "MVPG"),     \
+       exit_code_ipa0(0xB2, 0x56, "STHYI"),    \
        exit_code_ipa0(0xB2, 0x58, "BSG"),      \
        exit_code_ipa0(0xB2, 0x5a, "BSA"),      \
        exit_code_ipa0(0xB2, 0x5f, "CHSC"),     \
index 48b37b8..a97354c 100644 (file)
@@ -162,6 +162,30 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
 }
 EXPORT_SYMBOL(diag14);
 
+static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
+{
+       register unsigned long _subcode asm("0") = *subcode;
+       register unsigned long _size asm("1") = size;
+
+       asm volatile(
+               "       diag    %2,%0,0x204\n"
+               "0:     nopr    %%r7\n"
+               EX_TABLE(0b,0b)
+               : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
+       *subcode = _subcode;
+       return _size;
+}
+
+int diag204(unsigned long subcode, unsigned long size, void *addr)
+{
+       diag_stat_inc(DIAG_STAT_X204);
+       size = __diag204(&subcode, size, addr);
+       if (subcode)
+               return -1;
+       return size;
+}
+EXPORT_SYMBOL(diag204);
+
 /*
  * Diagnose 210: Get information about a virtual device
  */
@@ -196,3 +220,18 @@ int diag210(struct diag210 *addr)
        return ccode;
 }
 EXPORT_SYMBOL(diag210);
+
+int diag224(void *ptr)
+{
+       int rc = -EOPNOTSUPP;
+
+       diag_stat_inc(DIAG_STAT_X224);
+       asm volatile(
+               "       diag    %1,%2,0x224\n"
+               "0:     lhi     %0,0x0\n"
+               "1:\n"
+               EX_TABLE(0b,1b)
+               : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+       return rc;
+}
+EXPORT_SYMBOL(diag224);
index d42fa38..09a9e6d 100644 (file)
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
 
 obj-$(CONFIG_KVM) += kvm.o
index 1ea4095..ce865bd 100644 (file)
@@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
            (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
                return -EOPNOTSUPP;
 
+       VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx",
+                           (u32) vcpu->run->s.regs.gprs[2],
+                           (u32) vcpu->run->s.regs.gprs[3],
+                           vcpu->run->s.regs.gprs[4]);
+
        /*
         * The layout is as follows:
         * - gpr 2 contains the subchannel id (passed as addr)
index 66938d2..5420020 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/vmalloc.h>
 #include <linux/err.h>
 #include <asm/pgtable.h>
+#include <asm/gmap.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include <asm/switch_to.h>
@@ -476,18 +477,73 @@ enum {
        FSI_FETCH   = 2  /* Exception was due to fetch operation */
 };
 
-static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
-                        ar_t ar, enum gacc_mode mode)
+enum prot_type {
+       PROT_TYPE_LA   = 0,
+       PROT_TYPE_KEYC = 1,
+       PROT_TYPE_ALC  = 2,
+       PROT_TYPE_DAT  = 3,
+};
+
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
+                    ar_t ar, enum gacc_mode mode, enum prot_type prot)
 {
-       int rc;
-       struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
-       struct trans_exc_code_bits *tec_bits;
+       struct trans_exc_code_bits *tec;
 
        memset(pgm, 0, sizeof(*pgm));
-       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
-       tec_bits->as = psw.as;
+       pgm->code = code;
+       tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+
+       switch (code) {
+       case PGM_ASCE_TYPE:
+       case PGM_PAGE_TRANSLATION:
+       case PGM_REGION_FIRST_TRANS:
+       case PGM_REGION_SECOND_TRANS:
+       case PGM_REGION_THIRD_TRANS:
+       case PGM_SEGMENT_TRANSLATION:
+               /*
+                * op_access_id only applies to MOVE_PAGE -> set bit 61
+                * exc_access_id has to be set to 0 for some instructions. Both
+                * cases have to be handled by the caller. We can always store
+                * exc_access_id, as it is undefined for non-ar cases.
+                */
+               tec->addr = gva >> PAGE_SHIFT;
+               tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+               tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+               /* FALL THROUGH */
+       case PGM_ALEN_TRANSLATION:
+       case PGM_ALE_SEQUENCE:
+       case PGM_ASTE_VALIDITY:
+       case PGM_ASTE_SEQUENCE:
+       case PGM_EXTENDED_AUTHORITY:
+               pgm->exc_access_id = ar;
+               break;
+       case PGM_PROTECTION:
+               switch (prot) {
+               case PROT_TYPE_ALC:
+                       tec->b60 = 1;
+                       /* FALL THROUGH */
+               case PROT_TYPE_DAT:
+                       tec->b61 = 1;
+                       tec->addr = gva >> PAGE_SHIFT;
+                       tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+                       tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+                       /* exc_access_id is undefined for most cases */
+                       pgm->exc_access_id = ar;
+                       break;
+               default: /* LA and KEYC set b61 to 0, other params undefined */
+                       break;
+               }
+               break;
+       }
+       return code;
+}
+
+static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
+                        unsigned long ga, ar_t ar, enum gacc_mode mode)
+{
+       int rc;
+       struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
 
        if (!psw.t) {
                asce->val = 0;
@@ -510,21 +566,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
                return 0;
        case PSW_AS_ACCREG:
                rc = ar_translation(vcpu, asce, ar, mode);
-               switch (rc) {
-               case PGM_ALEN_TRANSLATION:
-               case PGM_ALE_SEQUENCE:
-               case PGM_ASTE_VALIDITY:
-               case PGM_ASTE_SEQUENCE:
-               case PGM_EXTENDED_AUTHORITY:
-                       vcpu->arch.pgm.exc_access_id = ar;
-                       break;
-               case PGM_PROTECTION:
-                       tec_bits->b60 = 1;
-                       tec_bits->b61 = 1;
-                       break;
-               }
                if (rc > 0)
-                       pgm->code = rc;
+                       return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC);
                return rc;
        }
        return 0;
@@ -729,40 +772,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
        return 1;
 }
 
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
                            unsigned long *pages, unsigned long nr_pages,
                            const union asce asce, enum gacc_mode mode)
 {
-       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       struct trans_exc_code_bits *tec_bits;
-       int lap_enabled, rc;
+       int lap_enabled, rc = 0;
 
-       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
        lap_enabled = low_address_protection_enabled(vcpu, asce);
        while (nr_pages) {
                ga = kvm_s390_logical_to_effective(vcpu, ga);
-               tec_bits->addr = ga >> PAGE_SHIFT;
-               if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
-                       pgm->code = PGM_PROTECTION;
-                       return pgm->code;
-               }
+               if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
+                       return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
+                                        PROT_TYPE_LA);
                ga &= PAGE_MASK;
                if (psw_bits(*psw).t) {
                        rc = guest_translate(vcpu, ga, pages, asce, mode);
                        if (rc < 0)
                                return rc;
-                       if (rc == PGM_PROTECTION)
-                               tec_bits->b61 = 1;
-                       if (rc)
-                               pgm->code = rc;
                } else {
                        *pages = kvm_s390_real_to_abs(vcpu, ga);
                        if (kvm_is_error_gpa(vcpu->kvm, *pages))
-                               pgm->code = PGM_ADDRESSING;
+                               rc = PGM_ADDRESSING;
                }
-               if (pgm->code)
-                       return pgm->code;
+               if (rc)
+                       return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT);
                ga += PAGE_SIZE;
                pages++;
                nr_pages--;
@@ -783,7 +817,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 
        if (!len)
                return 0;
-       rc = get_vcpu_asce(vcpu, &asce, ar, mode);
+       ga = kvm_s390_logical_to_effective(vcpu, ga);
+       rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode);
        if (rc)
                return rc;
        nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -795,7 +830,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
        need_ipte_lock = psw_bits(*psw).t && !asce.r;
        if (need_ipte_lock)
                ipte_lock(vcpu);
-       rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
+       rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
        for (idx = 0; idx < nr_pages && !rc; idx++) {
                gpa = *(pages + idx) + (ga & ~PAGE_MASK);
                _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
@@ -846,37 +881,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
                            unsigned long *gpa, enum gacc_mode mode)
 {
-       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       struct trans_exc_code_bits *tec;
        union asce asce;
        int rc;
 
        gva = kvm_s390_logical_to_effective(vcpu, gva);
-       tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       rc = get_vcpu_asce(vcpu, &asce, ar, mode);
-       tec->addr = gva >> PAGE_SHIFT;
+       rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
        if (rc)
                return rc;
        if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
-               if (mode == GACC_STORE) {
-                       rc = pgm->code = PGM_PROTECTION;
-                       return rc;
-               }
+               if (mode == GACC_STORE)
+                       return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
+                                        mode, PROT_TYPE_LA);
        }
 
        if (psw_bits(*psw).t && !asce.r) {      /* Use DAT? */
                rc = guest_translate(vcpu, gva, gpa, asce, mode);
-               if (rc > 0) {
-                       if (rc == PGM_PROTECTION)
-                               tec->b61 = 1;
-                       pgm->code = rc;
-               }
+               if (rc > 0)
+                       return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT);
        } else {
-               rc = 0;
                *gpa = kvm_s390_real_to_abs(vcpu, gva);
                if (kvm_is_error_gpa(vcpu->kvm, *gpa))
-                       rc = pgm->code = PGM_ADDRESSING;
+                       return trans_exc(vcpu, PGM_ADDRESSING, gva, 0, mode, 0);
        }
 
        return rc;
@@ -915,20 +941,247 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
  */
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
 {
-       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       struct trans_exc_code_bits *tec_bits;
        union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
 
        if (!ctlreg0.lap || !is_low_address(gra))
                return 0;
+       return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
+}
 
-       memset(pgm, 0, sizeof(*pgm));
-       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       tec_bits->fsi = FSI_STORE;
-       tec_bits->as = psw_bits(*psw).as;
-       tec_bits->addr = gra >> PAGE_SHIFT;
-       pgm->code = PGM_PROTECTION;
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
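+ * @dat_protection: set if a protection bit is set in any walked guest table entry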
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+                                 unsigned long *pgt, int *dat_protection,
+                                 int *fake)
+{
+       struct gmap *parent;
+       union asce asce;
+       union vaddress vaddr;
+       unsigned long ptr;
+       int rc;
+
+       *fake = 0;
+       *dat_protection = 0;
+       parent = sg->parent;
+       vaddr.addr = saddr;
+       asce.val = sg->orig_asce;
+       ptr = asce.origin * 4096;
+       if (asce.r) {
+               *fake = 1;
+               asce.dt = ASCE_TYPE_REGION1;
+       }
+       switch (asce.dt) {
+       case ASCE_TYPE_REGION1:
+               if (vaddr.rfx01 > asce.tl && !asce.r)
+                       return PGM_REGION_FIRST_TRANS;
+               break;
+       case ASCE_TYPE_REGION2:
+               if (vaddr.rfx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.rsx01 > asce.tl)
+                       return PGM_REGION_SECOND_TRANS;
+               break;
+       case ASCE_TYPE_REGION3:
+               if (vaddr.rfx || vaddr.rsx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.rtx01 > asce.tl)
+                       return PGM_REGION_THIRD_TRANS;
+               break;
+       case ASCE_TYPE_SEGMENT:
+               if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.sx01 > asce.tl)
+                       return PGM_SEGMENT_TRANSLATION;
+               break;
+       }
+
+       switch (asce.dt) {
+       case ASCE_TYPE_REGION1: {
+               union region1_table_entry rfte;
 
-       return pgm->code;
+               if (*fake) {
+                       /* offset in 16EB guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.rfx << 53UL);
+                       rfte.val = ptr;
+                       goto shadow_r2t;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+               if (rc)
+                       return rc;
+               if (rfte.i)
+                       return PGM_REGION_FIRST_TRANS;
+               if (rfte.tt != TABLE_TYPE_REGION1)
+                       return PGM_TRANSLATION_SPEC;
+               if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+                       return PGM_REGION_SECOND_TRANS;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rfte.p;
+               ptr = rfte.rto << 12UL;
+shadow_r2t:
+               rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_REGION2: {
+               union region2_table_entry rste;
+
+               if (*fake) {
+                       /* offset in 8PB guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.rsx << 42UL);
+                       rste.val = ptr;
+                       goto shadow_r3t;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+               if (rc)
+                       return rc;
+               if (rste.i)
+                       return PGM_REGION_SECOND_TRANS;
+               if (rste.tt != TABLE_TYPE_REGION2)
+                       return PGM_TRANSLATION_SPEC;
+               if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+                       return PGM_REGION_THIRD_TRANS;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rste.p;
+               ptr = rste.rto << 12UL;
+shadow_r3t:
+               rste.p |= *dat_protection;
+               rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_REGION3: {
+               union region3_table_entry rtte;
+
+               if (*fake) {
+                       /* offset in 4TB guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.rtx << 31UL);
+                       rtte.val = ptr;
+                       goto shadow_sgt;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+               if (rc)
+                       return rc;
+               if (rtte.i)
+                       return PGM_REGION_THIRD_TRANS;
+               if (rtte.tt != TABLE_TYPE_REGION3)
+                       return PGM_TRANSLATION_SPEC;
+               if (rtte.cr && asce.p && sg->edat_level >= 2)
+                       return PGM_TRANSLATION_SPEC;
+               if (rtte.fc && sg->edat_level >= 2) {
+                       *dat_protection |= rtte.fc0.p;
+                       *fake = 1;
+                       ptr = rtte.fc1.rfaa << 31UL;
+                       rtte.val = ptr;
+                       goto shadow_sgt;
+               }
+               if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rtte.fc0.p;
+               ptr = rtte.fc0.sto << 12UL;
+shadow_sgt:
+               rtte.fc0.p |= *dat_protection;
+               rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_SEGMENT: {
+               union segment_table_entry ste;
+
+               if (*fake) {
+                       /* offset in 2G guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+                       ste.val = ptr;
+                       goto shadow_pgt;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+               if (rc)
+                       return rc;
+               if (ste.i)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (ste.tt != TABLE_TYPE_SEGMENT)
+                       return PGM_TRANSLATION_SPEC;
+               if (ste.cs && asce.p)
+                       return PGM_TRANSLATION_SPEC;
+               *dat_protection |= ste.fc0.p;
+               if (ste.fc && sg->edat_level >= 1) {
+                       *fake = 1;
+                       ptr = ste.fc1.sfaa << 20UL;
+                       ste.val = ptr;
+                       goto shadow_pgt;
+               }
+               ptr = ste.fc0.pto << 11UL;
+shadow_pgt:
+               ste.fc0.p |= *dat_protection;
+               rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+               if (rc)
+                       return rc;
+       }
+       }
+       /* Return the parent address of the page table */
+       *pgt = ptr;
+       return 0;
+}
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ *         - > 0 (pgm exception code) on exceptions while faulting
+ *         - -EAGAIN if the caller can retry immediately
+ *         - -EFAULT when accessing invalid guest addresses
+ *         - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+                         unsigned long saddr)
+{
+       union vaddress vaddr;
+       union page_table_entry pte;
+       unsigned long pgt;
+       int dat_protection, fake;
+       int rc;
+
+       down_read(&sg->mm->mmap_sem);
+       /*
+        * We don't want any guest-2 tables to change - so the parent
+        * tables/pointers we read stay valid - unshadowing is however
+        * always possible - only guest_table_lock protects us.
+        */
+       ipte_lock(vcpu);
+
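+       /* reuse an existing shadow page table for saddr or create the shadow tables */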
+       rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+       if (rc)
+               rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+                                           &fake);
+
+       vaddr.addr = saddr;
+       if (fake) {
+               /* offset in 1MB guest memory block */
+               pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+               goto shadow_page;
+       }
+       if (!rc)
+               rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+       if (!rc && pte.i)
+               rc = PGM_PAGE_TRANSLATION;
+       if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+               rc = PGM_TRANSLATION_SPEC;
+shadow_page:
+       pte.p |= dat_protection;
+       if (!rc)
+               rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+       ipte_unlock(vcpu);
+       up_read(&sg->mm->mmap_sem);
+       return rc;
 }
index df0a79d..8756569 100644 (file)
@@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+                         unsigned long saddr);
+
 #endif /* __KVM_S390_GACCESS_H */
index e8c6843..31a0533 100644 (file)
@@ -439,6 +439,23 @@ exit_required:
 #define guest_per_enabled(vcpu) \
                             (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
 
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
+{
+       const u8 ilen = kvm_s390_get_ilen(vcpu);
+       struct kvm_s390_pgm_info pgm_info = {
+               .code = PGM_PER,
+               .per_code = PER_EVENT_IFETCH >> 24,
+               .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
+       };
+
+       /*
+        * The PSW points to the next instruction, therefore the intercepted
+        * instruction generated a PER i-fetch event. PER address therefore
+        * points at the previous PSW address (could be an EXECUTE function).
+        */
+       return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+}
+
 static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 {
        u32 perc = vcpu->arch.sie_block->perc << 24;
@@ -465,7 +482,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
                guest_perc &= ~PER_EVENT_IFETCH;
 
        /* All other PER events will be given to the guest */
-       /* TODO: Check alterated address/address space */
+       /* TODO: Check altered address/address space */
 
        vcpu->arch.sie_block->perc = guest_perc >> 24;
 
index 2521571..dfd0ca2 100644 (file)
@@ -351,8 +351,26 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
        return -EOPNOTSUPP;
 }
 
+static int handle_operexc(struct kvm_vcpu *vcpu)
+{
+       vcpu->stat.exit_operation_exception++;
+       trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+                                     vcpu->arch.sie_block->ipb);
+
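+       /* STHYI (opcode 0xb256) is emulated in the kernel if facility 74 is enabled */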
+       if (vcpu->arch.sie_block->ipa == 0xb256 &&
+           test_kvm_facility(vcpu->kvm, 74))
+               return handle_sthyi(vcpu);
+
+       if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
+               return -EOPNOTSUPP;
+
+       return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 {
+       int rc, per_rc = 0;
+
        if (kvm_is_ucontrol(vcpu->kvm))
                return -EOPNOTSUPP;
 
@@ -361,7 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
        case 0x18:
                return handle_noop(vcpu);
        case 0x04:
-               return handle_instruction(vcpu);
+               rc = handle_instruction(vcpu);
+               break;
        case 0x08:
                return handle_prog(vcpu);
        case 0x14:
@@ -372,9 +391,19 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
                return handle_validity(vcpu);
        case 0x28:
                return handle_stop(vcpu);
+       case 0x2c:
+               rc = handle_operexc(vcpu);
+               break;
        case 0x38:
-               return handle_partial_execution(vcpu);
+               rc = handle_partial_execution(vcpu);
+               break;
        default:
                return -EOPNOTSUPP;
        }
+
+       /* process PER, even if the instruction is processed in user space */
+       if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+           (!rc || rc == -EOPNOTSUPP))
+               per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+       return per_rc ? per_rc : rc;
 }
index 5a80af7..24524c0 100644 (file)
@@ -28,9 +28,6 @@
 #include "gaccess.h"
 #include "trace-s390.h"
 
-#define IOINT_SCHID_MASK 0x0000ffff
-#define IOINT_SSID_MASK 0x00030000
-#define IOINT_CSSID_MASK 0x03fc0000
 #define PFAULT_INIT 0x0600
 #define PFAULT_DONE 0x0680
 #define VIRTIO_PARAM 0x0d00
@@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
                                        struct kvm_s390_interrupt_info,
                                        list);
        if (inti) {
-               VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type);
+               if (inti->type & KVM_S390_INT_IO_AI_MASK)
+                       VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)");
+               else
+                       VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x",
+                       inti->io.subchannel_id >> 8,
+                       inti->io.subchannel_id >> 1 & 0x3,
+                       inti->io.subchannel_nr);
+
                vcpu->stat.deliver_io_int++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
                                inti->type,
@@ -991,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
                swake_up(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
        }
+       /*
+        * The VCPU might not be sleeping but is executing the VSIE. Let's
+        * kick it, so it leaves the SIE to process the request.
+        */
+       kvm_s390_vsie_kick(vcpu);
 }
 
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
@@ -1415,6 +1424,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
        }
        fi->counters[FIRQ_CNTR_IO] += 1;
 
+       if (inti->type & KVM_S390_INT_IO_AI_MASK)
+               VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)");
+       else
+               VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x",
+                       inti->io.subchannel_id >> 8,
+                       inti->io.subchannel_id >> 1 & 0x3,
+                       inti->io.subchannel_nr);
        isc = int_word_to_isc(inti->io.io_int_word);
        list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
        list_add_tail(&inti->list, list);
@@ -1531,13 +1547,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
                inti->mchk.mcic = s390int->parm64;
                break;
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               if (inti->type & KVM_S390_INT_IO_AI_MASK)
-                       VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
-               else
-                       VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
-                                s390int->type & IOINT_CSSID_MASK,
-                                s390int->type & IOINT_SSID_MASK,
-                                s390int->type & IOINT_SCHID_MASK);
                inti->io.subchannel_id = s390int->parm >> 16;
                inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
                inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -2237,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
        return ret;
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
 {
        int ret;
index 6f5c344..3f3ae48 100644 (file)
 #include <linux/init.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/mman.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
+#include <linux/bitmap.h>
 #include <asm/asm-offsets.h>
 #include <asm/lowcore.h>
 #include <asm/stp.h>
@@ -35,6 +37,8 @@
 #include <asm/switch_to.h>
 #include <asm/isc.h>
 #include <asm/sclp.h>
+#include <asm/cpacf.h>
+#include <asm/timex.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 
@@ -64,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "exit_pei", VCPU_STAT(exit_pei) },
        { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
        { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+       { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
        { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
        { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@ -94,6 +99,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "instruction_stsi", VCPU_STAT(instruction_stsi) },
        { "instruction_stfl", VCPU_STAT(instruction_stfl) },
        { "instruction_tprot", VCPU_STAT(instruction_tprot) },
+       { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+       { "instruction_sie", VCPU_STAT(instruction_sie) },
        { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
        { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
        { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -119,6 +126,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
 
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[16] = {
        0xffe6000000000000UL,
@@ -131,7 +143,13 @@ unsigned long kvm_s390_fac_list_mask_size(void)
        return ARRAY_SIZE(kvm_s390_fac_list_mask);
 }
 
+/* available cpu features supported by kvm */
+static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+/* available subfunctions indicated via query / "test bit" */
+static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
+
 static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
 debug_info_t *kvm_s390_dbf;
 
 /* Section: not file related */
@@ -141,7 +159,8 @@ int kvm_arch_hardware_enable(void)
        return 0;
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end);
 
 /*
  * This callback is executed during stop_machine(). All CPUs are therefore
@@ -163,6 +182,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
                        vcpu->arch.sie_block->epoch -= *delta;
                        if (vcpu->arch.cputm_enabled)
                                vcpu->arch.cputm_start += *delta;
+                       if (vcpu->arch.vsie_block)
+                               vcpu->arch.vsie_block->epoch -= *delta;
                }
        }
        return NOTIFY_OK;
@@ -175,7 +196,9 @@ static struct notifier_block kvm_clock_notifier = {
 int kvm_arch_hardware_setup(void)
 {
        gmap_notifier.notifier_call = kvm_gmap_notifier;
-       gmap_register_ipte_notifier(&gmap_notifier);
+       gmap_register_pte_notifier(&gmap_notifier);
+       vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+       gmap_register_pte_notifier(&vsie_gmap_notifier);
        atomic_notifier_chain_register(&s390_epoch_delta_notifier,
                                       &kvm_clock_notifier);
        return 0;
@@ -183,11 +206,109 @@ int kvm_arch_hardware_setup(void)
 
 void kvm_arch_hardware_unsetup(void)
 {
-       gmap_unregister_ipte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&vsie_gmap_notifier);
        atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
                                         &kvm_clock_notifier);
 }
 
+static void allow_cpu_feat(unsigned long nr)
+{
+       set_bit_inv(nr, kvm_s390_available_cpu_feat);
+}
+
+static inline int plo_test_bit(unsigned char nr)
+{
+       register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+       int cc = 3; /* subfunction not available */
+
+       asm volatile(
+               /* Parameter registers are ignored for "test bit" */
+               "       plo     0,0,0,0(0)\n"
+               "       ipm     %0\n"
+               "       srl     %0,28\n"
+               : "=d" (cc)
+               : "d" (r0)
+               : "cc");
+       return cc == 0;
+}
+
+static void kvm_s390_cpu_feat_init(void)
+{
+       int i;
+
+       for (i = 0; i < 256; ++i) {
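+       /* probe which PLO subfunctions are available ("test bit" mode) */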
+               if (plo_test_bit(i))
+                       kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+       }
+
+       if (test_facility(28)) /* TOD-clock steering */
+               ptff(kvm_s390_available_subfunc.ptff,
+                    sizeof(kvm_s390_available_subfunc.ptff),
+                    PTFF_QAF);
+
+       if (test_facility(17)) { /* MSA */
+               __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+               __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+               __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+               __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+               __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+       }
+       if (test_facility(76)) /* MSA3 */
+               __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+       if (test_facility(77)) { /* MSA4 */
+               __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+               __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+               __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+               __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+       }
+       if (test_facility(57)) /* MSA5 */
+               __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+
+       if (MACHINE_HAS_ESOP)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+       /*
+        * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+        * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+        */
+       if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+           !test_facility(3) || !nested)
+               return;
+       allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+       if (sclp.has_64bscao)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+       if (sclp.has_siif)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+       if (sclp.has_gpere)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+       if (sclp.has_gsls)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+       if (sclp.has_ib)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+       if (sclp.has_cei)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+       if (sclp.has_ibs)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+       /*
+        * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+        * all skey handling functions read/set the skey from the PGSTE
+        * instead of the real storage key.
+        *
+        * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will cause
+        * pages to be detected as preserved although they are resident.
+        *
+        * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+        * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+        *
+        * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+        * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+        * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+        *
+        * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+        * cannot easily shadow the SCA because of the ipte lock.
+        */
+}
+
 int kvm_arch_init(void *opaque)
 {
        kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
@@ -199,6 +320,8 @@ int kvm_arch_init(void *opaque)
                return -ENOMEM;
        }
 
+       kvm_s390_cpu_feat_init();
+
        /* Register floating interrupt controller interface. */
        return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
 }
@@ -244,6 +367,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_USER_STSI:
        case KVM_CAP_S390_SKEYS:
        case KVM_CAP_S390_IRQ_STATE:
+       case KVM_CAP_S390_USER_INSTR0:
                r = 1;
                break;
        case KVM_CAP_S390_MEM_OP:
@@ -251,8 +375,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                break;
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
-               r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
-                                 : KVM_S390_BSCA_CPU_SLOTS;
+               r = KVM_S390_BSCA_CPU_SLOTS;
+               if (sclp.has_esca && sclp.has_64bscao)
+                       r = KVM_S390_ESCA_CPU_SLOTS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
                r = KVM_USER_MEM_SLOTS;
@@ -335,6 +460,16 @@ out:
        return r;
 }
 
+static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+{
+       unsigned int i;
+       struct kvm_vcpu *vcpu;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
+       }
+}
+
 static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 {
        int r;
@@ -355,7 +490,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
                break;
        case KVM_CAP_S390_VECTOR_REGISTERS:
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (MACHINE_HAS_VX) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 129);
@@ -370,7 +505,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
        case KVM_CAP_S390_RI:
                r = -EINVAL;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (test_facility(64)) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 64);
@@ -386,6 +521,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
                kvm->arch.user_stsi = 1;
                r = 0;
                break;
+       case KVM_CAP_S390_USER_INSTR0:
+               VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+               kvm->arch.user_instr0 = 1;
+               icpt_operexc_on_all_vcpus(kvm);
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@ -418,21 +559,23 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
        unsigned int idx;
        switch (attr->attr) {
        case KVM_S390_VM_MEM_ENABLE_CMMA:
-               /* enable CMMA only for z10 and later (EDAT_1) */
-               ret = -EINVAL;
-               if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+               ret = -ENXIO;
+               if (!sclp.has_cmma)
                        break;
 
                ret = -EBUSY;
                VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) == 0) {
+               if (!kvm->created_vcpus) {
                        kvm->arch.use_cmma = 1;
                        ret = 0;
                }
                mutex_unlock(&kvm->lock);
                break;
        case KVM_S390_VM_MEM_CLR_CMMA:
+               ret = -ENXIO;
+               if (!sclp.has_cmma)
+                       break;
                ret = -EINVAL;
                if (!kvm->arch.use_cmma)
                        break;
@@ -461,20 +604,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
                if (!new_limit)
                        return -EINVAL;
 
-               /* gmap_alloc takes last usable address */
+               /* gmap_create takes last usable address */
                if (new_limit != KVM_S390_NO_MEM_LIMIT)
                        new_limit -= 1;
 
                ret = -EBUSY;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) == 0) {
-                       /* gmap_alloc will round the limit up */
-                       struct gmap *new = gmap_alloc(current->mm, new_limit);
+               if (!kvm->created_vcpus) {
+                       /* gmap_create will round the limit up */
+                       struct gmap *new = gmap_create(current->mm, new_limit);
 
                        if (!new) {
                                ret = -ENOMEM;
                        } else {
-                               gmap_free(kvm->arch.gmap);
+                               gmap_remove(kvm->arch.gmap);
                                new->private = kvm;
                                kvm->arch.gmap = new;
                                ret = 0;
@@ -644,7 +787,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
        int ret = 0;
 
        mutex_lock(&kvm->lock);
-       if (atomic_read(&kvm->online_vcpus)) {
+       if (kvm->created_vcpus) {
                ret = -EBUSY;
                goto out;
        }
@@ -676,6 +819,39 @@ out:
        return ret;
 }
 
+static int kvm_s390_set_processor_feat(struct kvm *kvm,
+                                      struct kvm_device_attr *attr)
+{
+       struct kvm_s390_vm_cpu_feat data;
+       int ret = -EBUSY;
+
+       if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+               return -EFAULT;
+       if (!bitmap_subset((unsigned long *) data.feat,
+                          kvm_s390_available_cpu_feat,
+                          KVM_S390_VM_CPU_FEAT_NR_BITS))
+               return -EINVAL;
+
+       mutex_lock(&kvm->lock);
+       if (!atomic_read(&kvm->online_vcpus)) {
+               bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+                           KVM_S390_VM_CPU_FEAT_NR_BITS);
+               ret = 0;
+       }
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+                                         struct kvm_device_attr *attr)
+{
+       /*
+        * Once supported by kernel + hw, we have to store the subfunctions
+        * in kvm->arch and remember that user space configured them.
+        */
+       return -ENXIO;
+}
+
 static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 {
        int ret = -ENXIO;
@@ -684,6 +860,12 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
        case KVM_S390_VM_CPU_PROCESSOR:
                ret = kvm_s390_set_processor(kvm, attr);
                break;
+       case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               ret = kvm_s390_set_processor_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+               ret = kvm_s390_set_processor_subfunc(kvm, attr);
+               break;
        }
        return ret;
 }
@@ -732,6 +914,50 @@ out:
        return ret;
 }
 
+static int kvm_s390_get_processor_feat(struct kvm *kvm,
+                                      struct kvm_device_attr *attr)
+{
+       struct kvm_s390_vm_cpu_feat data;
+
+       bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
+       if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+               return -EFAULT;
+       return 0;
+}
+
+static int kvm_s390_get_machine_feat(struct kvm *kvm,
+                                    struct kvm_device_attr *attr)
+{
+       struct kvm_s390_vm_cpu_feat data;
+
+       bitmap_copy((unsigned long *) data.feat,
+                   kvm_s390_available_cpu_feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
+       if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+               return -EFAULT;
+       return 0;
+}
+
+static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+                                         struct kvm_device_attr *attr)
+{
+       /*
+        * Once we can actually configure subfunctions (kernel + hw support),
+        * we have to check if they were already set by user space, if so copy
+        * them from kvm->arch.
+        */
+       return -ENXIO;
+}
+
+static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+                                       struct kvm_device_attr *attr)
+{
+       if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
+           sizeof(struct kvm_s390_vm_cpu_subfunc)))
+               return -EFAULT;
+       return 0;
+}
 static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 {
        int ret = -ENXIO;
@@ -743,6 +969,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
        case KVM_S390_VM_CPU_MACHINE:
                ret = kvm_s390_get_machine(kvm, attr);
                break;
+       case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               ret = kvm_s390_get_processor_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_MACHINE_FEAT:
+               ret = kvm_s390_get_machine_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+               ret = kvm_s390_get_processor_subfunc(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+               ret = kvm_s390_get_machine_subfunc(kvm, attr);
+               break;
        }
        return ret;
 }
@@ -803,6 +1041,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
                switch (attr->attr) {
                case KVM_S390_VM_MEM_ENABLE_CMMA:
                case KVM_S390_VM_MEM_CLR_CMMA:
+                       ret = sclp.has_cmma ? 0 : -ENXIO;
+                       break;
                case KVM_S390_VM_MEM_LIMIT_SIZE:
                        ret = 0;
                        break;
@@ -826,8 +1066,13 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
                switch (attr->attr) {
                case KVM_S390_VM_CPU_PROCESSOR:
                case KVM_S390_VM_CPU_MACHINE:
+               case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               case KVM_S390_VM_CPU_MACHINE_FEAT:
+               case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
                        ret = 0;
                        break;
+               /* configuring subfunctions is not supported yet */
+               case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
                default:
                        ret = -ENXIO;
                        break;
@@ -858,7 +1103,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 {
        uint8_t *keys;
        uint64_t hva;
-       unsigned long curkey;
        int i, r = 0;
 
        if (args->flags != 0)
@@ -879,26 +1123,27 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
        if (!keys)
                return -ENOMEM;
 
+       down_read(&current->mm->mmap_sem);
        for (i = 0; i < args->count; i++) {
                hva = gfn_to_hva(kvm, args->start_gfn + i);
                if (kvm_is_error_hva(hva)) {
                        r = -EFAULT;
-                       goto out;
+                       break;
                }
 
-               curkey = get_guest_storage_key(current->mm, hva);
-               if (IS_ERR_VALUE(curkey)) {
-                       r = curkey;
-                       goto out;
-               }
-               keys[i] = curkey;
+               r = get_guest_storage_key(current->mm, hva, &keys[i]);
+               if (r)
+                       break;
+       }
+       up_read(&current->mm->mmap_sem);
+
+       if (!r) {
+               r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+                                sizeof(uint8_t) * args->count);
+               if (r)
+                       r = -EFAULT;
        }
 
-       r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
-                        sizeof(uint8_t) * args->count);
-       if (r)
-               r = -EFAULT;
-out:
        kvfree(keys);
        return r;
 }
@@ -935,24 +1180,25 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
        if (r)
                goto out;
 
+       down_read(&current->mm->mmap_sem);
        for (i = 0; i < args->count; i++) {
                hva = gfn_to_hva(kvm, args->start_gfn + i);
                if (kvm_is_error_hva(hva)) {
                        r = -EFAULT;
-                       goto out;
+                       break;
                }
 
                /* Lowest order bit is reserved */
                if (keys[i] & 0x01) {
                        r = -EINVAL;
-                       goto out;
+                       break;
                }
 
-               r = set_guest_storage_key(current->mm, hva,
-                                         (unsigned long)keys[i], 0);
+               r = set_guest_storage_key(current->mm, hva, keys[i], 0);
                if (r)
-                       goto out;
+                       break;
        }
+       up_read(&current->mm->mmap_sem);
 out:
        kvfree(keys);
        return r;
@@ -1129,6 +1375,7 @@ static void sca_dispose(struct kvm *kvm)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+       gfp_t alloc_flags = GFP_KERNEL;
        int i, rc;
        char debug_name[16];
        static unsigned long sca_offset;
@@ -1150,9 +1397,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        rc = -ENOMEM;
 
+       ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
+
        kvm->arch.use_esca = 0; /* start with basic SCA */
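+       /* without the 64-bit SCAO facility, the SCA must be allocated below 2 GB */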
+       if (!sclp.has_64bscao)
+               alloc_flags |= GFP_DMA;
        rwlock_init(&kvm->arch.sca_lock);
-       kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+       kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
        if (!kvm->arch.sca)
                goto out_err;
        spin_lock(&kvm_lock);
@@ -1189,6 +1440,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
 
+       set_kvm_facility(kvm->arch.model.fac_mask, 74);
+       set_kvm_facility(kvm->arch.model.fac_list, 74);
+
        kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
        kvm->arch.model.ibc = sclp.ibc & 0x0fff;
 
@@ -1212,7 +1466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
                else
                        kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
                                                    sclp.hamax + 1);
-               kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+               kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
                if (!kvm->arch.gmap)
                        goto out_err;
                kvm->arch.gmap->private = kvm;
@@ -1224,6 +1478,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.epoch = 0;
 
        spin_lock_init(&kvm->arch.start_stop_lock);
+       kvm_s390_vsie_init(kvm);
        KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
 
        return 0;
@@ -1245,7 +1500,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
                sca_del_vcpu(vcpu);
 
        if (kvm_is_ucontrol(vcpu->kvm))
-               gmap_free(vcpu->arch.gmap);
+               gmap_remove(vcpu->arch.gmap);
 
        if (vcpu->kvm->arch.use_cmma)
                kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1278,16 +1533,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        debug_unregister(kvm->arch.dbf);
        free_page((unsigned long)kvm->arch.sie_page2);
        if (!kvm_is_ucontrol(kvm))
-               gmap_free(kvm->arch.gmap);
+               gmap_remove(kvm->arch.gmap);
        kvm_s390_destroy_adapters(kvm);
        kvm_s390_clear_float_irqs(kvm);
+       kvm_s390_vsie_destroy(kvm);
        KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
 }
 
 /* Section: vcpu related */
 static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+       vcpu->arch.gmap = gmap_create(current->mm, -1UL);
        if (!vcpu->arch.gmap)
                return -ENOMEM;
        vcpu->arch.gmap->private = vcpu->kvm;
@@ -1396,7 +1652,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
 
        if (id < KVM_S390_BSCA_CPU_SLOTS)
                return true;
-       if (!sclp.has_esca)
+       if (!sclp.has_esca || !sclp.has_64bscao)
                return false;
 
        mutex_lock(&kvm->lock);
@@ -1537,7 +1793,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        save_access_regs(vcpu->arch.host_acrs);
        restore_access_regs(vcpu->run->s.regs.acrs);
-       gmap_enable(vcpu->arch.gmap);
+       gmap_enable(vcpu->arch.enabled_gmap);
        atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __start_cpu_timer_accounting(vcpu);
@@ -1550,7 +1806,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __stop_cpu_timer_accounting(vcpu);
        atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
-       gmap_disable(vcpu->arch.gmap);
+       vcpu->arch.enabled_gmap = gmap_get_enabled();
+       gmap_disable(vcpu->arch.enabled_gmap);
 
        /* Save guest register state */
        save_fpu_regs();
@@ -1599,7 +1856,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
                vcpu->arch.gmap = vcpu->kvm->arch.gmap;
                sca_add_vcpu(vcpu);
        }
-
+       if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+               vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+       /* make vcpu_load load the right gmap on the first trigger */
+       vcpu->arch.enabled_gmap = vcpu->arch.gmap;
 }
 
 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@ -1658,15 +1918,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
        kvm_s390_vcpu_setup_model(vcpu);
 
-       vcpu->arch.sie_block->ecb = 0x02;
+       /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+       if (MACHINE_HAS_ESOP)
+               vcpu->arch.sie_block->ecb |= 0x02;
        if (test_kvm_facility(vcpu->kvm, 9))
                vcpu->arch.sie_block->ecb |= 0x04;
-       if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+       if (test_kvm_facility(vcpu->kvm, 73))
                vcpu->arch.sie_block->ecb |= 0x10;
 
-       if (test_kvm_facility(vcpu->kvm, 8))
+       if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
                vcpu->arch.sie_block->ecb2 |= 0x08;
-       vcpu->arch.sie_block->eca   = 0xC1002000U;
+       vcpu->arch.sie_block->eca = 0x1002000U;
+       if (sclp.has_cei)
+               vcpu->arch.sie_block->eca |= 0x80000000U;
+       if (sclp.has_ib)
+               vcpu->arch.sie_block->eca |= 0x40000000U;
        if (sclp.has_siif)
                vcpu->arch.sie_block->eca |= 1;
        if (sclp.has_sigpif)
@@ -1716,6 +1982,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        vcpu->arch.sie_block = &sie_page->sie_block;
        vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
 
+       /* the real guest size will always be smaller than msl */
+       vcpu->arch.sie_block->mso = 0;
+       vcpu->arch.sie_block->msl = sclp.hamax;
+
        vcpu->arch.sie_block->icpua = id;
        spin_lock_init(&vcpu->arch.local_int.lock);
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -1784,16 +2054,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
        kvm_s390_vcpu_request(vcpu);
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end)
 {
-       int i;
        struct kvm *kvm = gmap->private;
        struct kvm_vcpu *vcpu;
+       unsigned long prefix;
+       int i;
 
+       if (gmap_is_shadow(gmap))
+               return;
+       if (start >= 1UL << 31)
+               /* We are only interested in prefix pages */
+               return;
        kvm_for_each_vcpu(i, vcpu, kvm) {
                /* match against both prefix pages */
-               if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
-                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+               prefix = kvm_s390_get_prefix(vcpu);
+               if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+                                  start, end);
                        kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
                }
        }
@@ -2002,6 +2281,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
        if (dbg->control & ~VALID_GUESTDBG_FLAGS)
                return -EINVAL;
+       if (!sclp.has_gpere)
+               return -EINVAL;
 
        if (dbg->control & KVM_GUESTDBG_ENABLE) {
                vcpu->guest_debug = dbg->control;
@@ -2070,16 +2351,16 @@ retry:
                return 0;
        /*
         * We use MMU_RELOAD just to re-arm the ipte notifier for the
-        * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+        * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
         * This ensures that the ipte instruction for this request has
         * already finished. We might race against a second unmapper that
         * wants to set the blocking bit. Lets just retry the request loop.
         */
        if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
                int rc;
-               rc = gmap_ipte_notify(vcpu->arch.gmap,
-                                     kvm_s390_get_prefix(vcpu),
-                                     PAGE_SIZE * 2);
+               rc = gmap_mprotect_notify(vcpu->arch.gmap,
+                                         kvm_s390_get_prefix(vcpu),
+                                         PAGE_SIZE * 2, PROT_WRITE);
                if (rc)
                        return rc;
                goto retry;
@@ -2108,6 +2389,11 @@ retry:
                goto retry;
        }
 
+       if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+               vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+               goto retry;
+       }
+
        /* nothing to do, just clear the request */
        clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 
@@ -2362,14 +2648,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                 * guest_enter and guest_exit should be no uaccess.
                 */
                local_irq_disable();
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                __disable_cpu_timer_accounting(vcpu);
                local_irq_enable();
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs);
                local_irq_disable();
                __enable_cpu_timer_accounting(vcpu);
-               __kvm_guest_exit();
+               guest_exit_irqoff();
                local_irq_enable();
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
@@ -2598,6 +2884,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
 
 static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
 {
+       if (!sclp.has_ibs)
+               return;
        kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
        kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
 }
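
Side note (illustration only, not part of the patch): the kvm_gmap_notifier hunk above replaces the exact prefix-address comparison with an interval-overlap test against the two consecutive guest prefix pages. A minimal stand-alone C sketch of that predicate, assuming 4 KiB pages and purely hypothetical addresses:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* True if the notified range [start, end] touches either of the two
 * pages starting at prefix (mirrors the check in kvm_gmap_notifier). */
static bool hits_prefix(unsigned long prefix, unsigned long start,
                        unsigned long end)
{
        return prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1;
}

int main(void)
{
        /* Unmapping [0x5000, 0x5fff] hits a prefix at 0x4000 (its second page). */
        printf("%d\n", hits_prefix(0x4000, 0x5000, 0x5fff)); /* prints 1 */
        /* Unmapping [0x8000, 0x8fff] leaves that prefix alone. */
        printf("%d\n", hits_prefix(0x4000, 0x8000, 0x8fff)); /* prints 0 */
        return 0;
}
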
index 8621ab0..b843286 100644 (file)
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 {
-       return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+       return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
 }
 
 static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
        return 0;
 }
 
+static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr)
+{
+       WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS);
+       return test_bit_inv(nr, kvm->arch.cpu_feat);
+}
+
 /* are cpu states controlled by user space */
 static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
 {
@@ -232,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
 }
 static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
 {
+       /* don't inject PER events if we re-execute the instruction */
+       vcpu->arch.sie_block->icptstatus &= ~0x02;
        kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
 }
 
@@ -246,10 +254,21 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
 
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+                                unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 
+/* implemented in sthyi.c */
+int handle_sthyi(struct kvm_vcpu *vcpu);
+
 /* implemented in kvm-s390.c */
 void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
@@ -360,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
                            struct kvm_guest_debug *dbg);
 void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
 void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
 void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
 
 /* support for Basic/Extended SCA handling */
index 95916fa..4616038 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/io.h>
 #include <asm/ptrace.h>
 #include <asm/compat.h>
+#include <asm/sclp.h>
 #include "gaccess.h"
 #include "kvm-s390.h"
 #include "trace.h"
@@ -152,30 +153,166 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
        int rc = 0;
+
+       trace_kvm_s390_skey_related_inst(vcpu);
        if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
                return rc;
 
        rc = s390_enable_skey();
-       VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest");
-       trace_kvm_s390_skey_related_inst(vcpu);
-       vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+       VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
+       if (!rc)
+               vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
        return rc;
 }
 
-
-static int handle_skey(struct kvm_vcpu *vcpu)
+static int try_handle_skey(struct kvm_vcpu *vcpu)
 {
-       int rc = __skey_check_enable(vcpu);
+       int rc;
 
+       vcpu->stat.instruction_storage_key++;
+       rc = __skey_check_enable(vcpu);
        if (rc)
                return rc;
-       vcpu->stat.instruction_storage_key++;
-
+       if (sclp.has_skey) {
+               /* with storage-key facility, SIE interprets it for us */
+               kvm_s390_retry_instr(vcpu);
+               VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+               return -EAGAIN;
+       }
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+       return 0;
+}
 
-       kvm_s390_retry_instr(vcpu);
-       VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+static int handle_iske(struct kvm_vcpu *vcpu)
+{
+       unsigned long addr;
+       unsigned char key;
+       int reg1, reg2;
+       int rc;
+
+       rc = try_handle_skey(vcpu);
+       if (rc)
+               return rc != -EAGAIN ? rc : 0;
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+       addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       addr = kvm_s390_logical_to_effective(vcpu, addr);
+       addr = kvm_s390_real_to_abs(vcpu, addr);
+       addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+       if (kvm_is_error_hva(addr))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       down_read(&current->mm->mmap_sem);
+       rc = get_guest_storage_key(current->mm, addr, &key);
+       up_read(&current->mm->mmap_sem);
+       if (rc)
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       vcpu->run->s.regs.gprs[reg1] &= ~0xff;
+       vcpu->run->s.regs.gprs[reg1] |= key;
+       return 0;
+}
+
+static int handle_rrbe(struct kvm_vcpu *vcpu)
+{
+       unsigned long addr;
+       int reg1, reg2;
+       int rc;
+
+       rc = try_handle_skey(vcpu);
+       if (rc)
+               return rc != -EAGAIN ? rc : 0;
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+       addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       addr = kvm_s390_logical_to_effective(vcpu, addr);
+       addr = kvm_s390_real_to_abs(vcpu, addr);
+       addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+       if (kvm_is_error_hva(addr))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       down_read(&current->mm->mmap_sem);
+       rc = reset_guest_reference_bit(current->mm, addr);
+       up_read(&current->mm->mmap_sem);
+       if (rc < 0)
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       kvm_s390_set_psw_cc(vcpu, rc);
+       return 0;
+}
+
+#define SSKE_NQ 0x8
+#define SSKE_MR 0x4
+#define SSKE_MC 0x2
+#define SSKE_MB 0x1
+static int handle_sske(struct kvm_vcpu *vcpu)
+{
+       unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
+       unsigned long start, end;
+       unsigned char key, oldkey;
+       int reg1, reg2;
+       int rc;
+
+       rc = try_handle_skey(vcpu);
+       if (rc)
+               return rc != -EAGAIN ? rc : 0;
+
+       if (!test_kvm_facility(vcpu->kvm, 8))
+               m3 &= ~SSKE_MB;
+       if (!test_kvm_facility(vcpu->kvm, 10))
+               m3 &= ~(SSKE_MC | SSKE_MR);
+       if (!test_kvm_facility(vcpu->kvm, 14))
+               m3 &= ~SSKE_NQ;
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+       key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+       start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       start = kvm_s390_logical_to_effective(vcpu, start);
+       if (m3 & SSKE_MB) {
+               /* start already designates an absolute address */
+               end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
+       } else {
+               start = kvm_s390_real_to_abs(vcpu, start);
+               end = start + PAGE_SIZE;
+       }
+
+       while (start != end) {
+               unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+
+               if (kvm_is_error_hva(addr))
+                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+               down_read(&current->mm->mmap_sem);
+               rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+                                               m3 & SSKE_NQ, m3 & SSKE_MR,
+                                               m3 & SSKE_MC);
+               up_read(&current->mm->mmap_sem);
+               if (rc < 0)
+                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+               start += PAGE_SIZE;
+       }
+
+       if (m3 & (SSKE_MC | SSKE_MR)) {
+               if (m3 & SSKE_MB) {
+                       /* skey in reg1 is unpredictable */
+                       kvm_s390_set_psw_cc(vcpu, 3);
+               } else {
+                       kvm_s390_set_psw_cc(vcpu, rc);
+                       vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
+                       vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+               }
+       }
+       if (m3 & SSKE_MB) {
+               if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT)
+                       vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK;
+               else
+                       vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL;
+               end = kvm_s390_logical_to_effective(vcpu, end);
+               vcpu->run->s.regs.gprs[reg2] |= end;
+       }
        return 0;
 }
 
@@ -582,10 +719,11 @@ static const intercept_handler_t b2_handlers[256] = {
        [0x10] = handle_set_prefix,
        [0x11] = handle_store_prefix,
        [0x12] = handle_store_cpu_address,
+       [0x14] = kvm_s390_handle_vsie,
        [0x21] = handle_ipte_interlock,
-       [0x29] = handle_skey,
-       [0x2a] = handle_skey,
-       [0x2b] = handle_skey,
+       [0x29] = handle_iske,
+       [0x2a] = handle_rrbe,
+       [0x2b] = handle_sske,
        [0x2c] = handle_test_block,
        [0x30] = handle_io_inst,
        [0x31] = handle_io_inst,
@@ -654,8 +792,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
 
 static int handle_pfmf(struct kvm_vcpu *vcpu)
 {
+       bool mr = false, mc = false, nq;
        int reg1, reg2;
        unsigned long start, end;
+       unsigned char key;
 
        vcpu->stat.instruction_pfmf++;
 
@@ -675,15 +815,27 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
            !test_kvm_facility(vcpu->kvm, 14))
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       /* No support for conditional-SSKE */
-       if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
-               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       /* Only provide conditional-SSKE support if enabled for the guest */
+       if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK &&
+           test_kvm_facility(vcpu->kvm, 10)) {
+               mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR;
+               mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC;
+       }
 
+       nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
+       key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
        start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
        start = kvm_s390_logical_to_effective(vcpu, start);
 
+       if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+               if (kvm_s390_check_low_addr_prot_real(vcpu, start))
+                       return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+       }
+
        switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
        case 0x00000000:
+               /* only 4k frames specify a real address */
+               start = kvm_s390_real_to_abs(vcpu, start);
                end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
                break;
        case 0x00001000:
@@ -701,20 +853,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
        }
 
-       if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
-               if (kvm_s390_check_low_addr_prot_real(vcpu, start))
-                       return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
-       }
-
-       while (start < end) {
-               unsigned long useraddr, abs_addr;
+       while (start != end) {
+               unsigned long useraddr;
 
                /* Translate guest address to host address */
-               if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
-                       abs_addr = kvm_s390_real_to_abs(vcpu, start);
-               else
-                       abs_addr = start;
-               useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+               useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
                if (kvm_is_error_hva(useraddr))
                        return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
@@ -728,16 +871,25 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 
                        if (rc)
                                return rc;
-                       if (set_guest_storage_key(current->mm, useraddr,
-                                       vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
-                                       vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
+                       down_read(&current->mm->mmap_sem);
+                       rc = cond_set_guest_storage_key(current->mm, useraddr,
+                                                       key, NULL, nq, mr, mc);
+                       up_read(&current->mm->mmap_sem);
+                       if (rc < 0)
                                return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                }
 
                start += PAGE_SIZE;
        }
-       if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
-               vcpu->run->s.regs.gprs[reg2] = end;
+       if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
+               if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) {
+                       vcpu->run->s.regs.gprs[reg2] = end;
+               } else {
+                       vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL;
+                       end = kvm_s390_logical_to_effective(vcpu, end);
+                       vcpu->run->s.regs.gprs[reg2] |= end;
+               }
+       }
        return 0;
 }
 
@@ -1033,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int handle_ptff(struct kvm_vcpu *vcpu)
+{
+       /* we don't emulate any control instructions yet */
+       kvm_s390_set_psw_cc(vcpu, 3);
+       return 0;
+}
+
 static const intercept_handler_t x01_handlers[256] = {
+       [0x04] = handle_ptff,
        [0x07] = handle_sckpf,
 };
 
index 28ea0ca..1a252f5 100644 (file)
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
        const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
        u16 p_asn, s_asn;
        psw_t *psw;
-       u32 flags;
+       bool idle;
 
-       flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+       idle = is_vcpu_idle(vcpu);
        psw = &dst_vcpu->arch.sie_block->gpsw;
        p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff;  /* Primary ASN */
        s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff;  /* Secondary ASN */
 
        /* Inject the emergency signal? */
-       if (!(flags & CPUSTAT_STOPPED)
+       if (!is_vcpu_stopped(vcpu)
            || (psw->mask & psw_int_mask) != psw_int_mask
-           || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
-           || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+           || (idle && psw->addr != 0)
+           || (!idle && (asn == p_asn || asn == s_asn))) {
                return __inject_sigp_emergency(vcpu, dst_vcpu);
        } else {
                *reg &= 0xffffffff00000000UL;
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
new file mode 100644 (file)
index 0000000..bd98b7d
--- /dev/null
@@ -0,0 +1,471 @@
+/*
+ * store hypervisor information instruction emulation functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include <asm/kvm_host.h>
+#include <asm/asm-offsets.h>
+#include <asm/sclp.h>
+#include <asm/diag.h>
+#include <asm/sysinfo.h>
+#include <asm/ebcdic.h>
+
+#include "kvm-s390.h"
+#include "gaccess.h"
+#include "trace.h"
+
+#define DED_WEIGHT 0xffff
+/*
+ * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string
+ * as they are justified with spaces.
+ */
+#define CP  0xc3d7404040404040UL
+#define IFL 0xc9c6d34040404040UL
+
+enum hdr_flags {
+       HDR_NOT_LPAR   = 0x10,
+       HDR_STACK_INCM = 0x20,
+       HDR_STSI_UNAV  = 0x40,
+       HDR_PERF_UNAV  = 0x80,
+};
+
+enum mac_validity {
+       MAC_NAME_VLD = 0x20,
+       MAC_ID_VLD   = 0x40,
+       MAC_CNT_VLD  = 0x80,
+};
+
+enum par_flag {
+       PAR_MT_EN = 0x80,
+};
+
+enum par_validity {
+       PAR_GRP_VLD  = 0x08,
+       PAR_ID_VLD   = 0x10,
+       PAR_ABS_VLD  = 0x20,
+       PAR_WGHT_VLD = 0x40,
+       PAR_PCNT_VLD  = 0x80,
+};
+
+struct hdr_sctn {
+       u8 infhflg1;
+       u8 infhflg2; /* reserved */
+       u8 infhval1; /* reserved */
+       u8 infhval2; /* reserved */
+       u8 reserved[3];
+       u8 infhygct;
+       u16 infhtotl;
+       u16 infhdln;
+       u16 infmoff;
+       u16 infmlen;
+       u16 infpoff;
+       u16 infplen;
+       u16 infhoff1;
+       u16 infhlen1;
+       u16 infgoff1;
+       u16 infglen1;
+       u16 infhoff2;
+       u16 infhlen2;
+       u16 infgoff2;
+       u16 infglen2;
+       u16 infhoff3;
+       u16 infhlen3;
+       u16 infgoff3;
+       u16 infglen3;
+       u8 reserved2[4];
+} __packed;
+
+struct mac_sctn {
+       u8 infmflg1; /* reserved */
+       u8 infmflg2; /* reserved */
+       u8 infmval1;
+       u8 infmval2; /* reserved */
+       u16 infmscps;
+       u16 infmdcps;
+       u16 infmsifl;
+       u16 infmdifl;
+       char infmname[8];
+       char infmtype[4];
+       char infmmanu[16];
+       char infmseq[16];
+       char infmpman[4];
+       u8 reserved[4];
+} __packed;
+
+struct par_sctn {
+       u8 infpflg1;
+       u8 infpflg2; /* reserved */
+       u8 infpval1;
+       u8 infpval2; /* reserved */
+       u16 infppnum;
+       u16 infpscps;
+       u16 infpdcps;
+       u16 infpsifl;
+       u16 infpdifl;
+       u16 reserved;
+       char infppnam[8];
+       u32 infpwbcp;
+       u32 infpabcp;
+       u32 infpwbif;
+       u32 infpabif;
+       char infplgnm[8];
+       u32 infplgcp;
+       u32 infplgif;
+} __packed;
+
+struct sthyi_sctns {
+       struct hdr_sctn hdr;
+       struct mac_sctn mac;
+       struct par_sctn par;
+} __packed;
+
+struct cpu_inf {
+       u64 lpar_cap;
+       u64 lpar_grp_cap;
+       u64 lpar_weight;
+       u64 all_weight;
+       int cpu_num_ded;
+       int cpu_num_shd;
+};
+
+struct lpar_cpu_inf {
+       struct cpu_inf cp;
+       struct cpu_inf ifl;
+};
+
+static inline u64 cpu_id(u8 ctidx, void *diag224_buf)
+{
+       return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN));
+}
+
+/*
+ * Scales the cpu capping from the lpar range to the one expected in
+ * sthyi data.
+ *
+ * diag204 reports a cap in hundredths of processor units.
+ * z/VM's range for one core is 0 - 0x10000.
+ */
+static u32 scale_cap(u32 in)
+{
+       return (0x10000 * in) / 100;
+}
+
+static void fill_hdr(struct sthyi_sctns *sctns)
+{
+       sctns->hdr.infhdln = sizeof(sctns->hdr);
+       sctns->hdr.infmoff = sizeof(sctns->hdr);
+       sctns->hdr.infmlen = sizeof(sctns->mac);
+       sctns->hdr.infplen = sizeof(sctns->par);
+       sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen;
+       sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen;
+}
+
+static void fill_stsi_mac(struct sthyi_sctns *sctns,
+                         struct sysinfo_1_1_1 *sysinfo)
+{
+       if (stsi(sysinfo, 1, 1, 1))
+               return;
+
+       sclp_ocf_cpc_name_copy(sctns->mac.infmname);
+
+       memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype));
+       memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu));
+       memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman));
+       memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq));
+
+       sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD;
+}
+
+static void fill_stsi_par(struct sthyi_sctns *sctns,
+                         struct sysinfo_2_2_2 *sysinfo)
+{
+       if (stsi(sysinfo, 2, 2, 2))
+               return;
+
+       sctns->par.infppnum = sysinfo->lpar_number;
+       memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam));
+
+       sctns->par.infpval1 |= PAR_ID_VLD;
+}
+
+static void fill_stsi(struct sthyi_sctns *sctns)
+{
+       void *sysinfo;
+
+       /* Errors are handled through the validity bits in the response. */
+       sysinfo = (void *)__get_free_page(GFP_KERNEL);
+       if (!sysinfo)
+               return;
+
+       fill_stsi_mac(sctns, sysinfo);
+       fill_stsi_par(sctns, sysinfo);
+
+       free_pages((unsigned long)sysinfo, 0);
+}
+
+static void fill_diag_mac(struct sthyi_sctns *sctns,
+                         struct diag204_x_phys_block *block,
+                         void *diag224_buf)
+{
+       int i;
+
+       for (i = 0; i < block->hdr.cpus; i++) {
+               switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+               case CP:
+                       if (block->cpus[i].weight == DED_WEIGHT)
+                               sctns->mac.infmdcps++;
+                       else
+                               sctns->mac.infmscps++;
+                       break;
+               case IFL:
+                       if (block->cpus[i].weight == DED_WEIGHT)
+                               sctns->mac.infmdifl++;
+                       else
+                               sctns->mac.infmsifl++;
+                       break;
+               }
+       }
+       sctns->mac.infmval1 |= MAC_CNT_VLD;
+}
+
+/* Returns a pointer to the next partition block. */
+static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
+                                                bool this_lpar,
+                                                void *diag224_buf,
+                                                struct diag204_x_part_block *block)
+{
+       int i, capped = 0, weight_cp = 0, weight_ifl = 0;
+       struct cpu_inf *cpu_inf;
+
+       for (i = 0; i < block->hdr.rcpus; i++) {
+               if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE))
+                       continue;
+
+               switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+               case CP:
+                       cpu_inf = &part_inf->cp;
+                       if (block->cpus[i].cur_weight < DED_WEIGHT)
+                               weight_cp |= block->cpus[i].cur_weight;
+                       break;
+               case IFL:
+                       cpu_inf = &part_inf->ifl;
+                       if (block->cpus[i].cur_weight < DED_WEIGHT)
+                               weight_ifl |= block->cpus[i].cur_weight;
+                       break;
+               default:
+                       continue;
+               }
+
+               if (!this_lpar)
+                       continue;
+
+               capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED;
+               cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap;
+               cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap;
+
+               if (block->cpus[i].weight == DED_WEIGHT)
+                       cpu_inf->cpu_num_ded += 1;
+               else
+                       cpu_inf->cpu_num_shd += 1;
+       }
+
+       if (this_lpar && capped) {
+               part_inf->cp.lpar_weight = weight_cp;
+               part_inf->ifl.lpar_weight = weight_ifl;
+       }
+       part_inf->cp.all_weight += weight_cp;
+       part_inf->ifl.all_weight += weight_ifl;
+       return (struct diag204_x_part_block *)&block->cpus[i];
+}
+
+static void fill_diag(struct sthyi_sctns *sctns)
+{
+       int i, r, pages;
+       bool this_lpar;
+       void *diag204_buf;
+       void *diag224_buf = NULL;
+       struct diag204_x_info_blk_hdr *ti_hdr;
+       struct diag204_x_part_block *part_block;
+       struct diag204_x_phys_block *phys_block;
+       struct lpar_cpu_inf lpar_inf = {};
+
+       /* Errors are handled through the validity bits in the response. */
+       pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+                       (unsigned long)DIAG204_INFO_EXT, 0, NULL);
+       if (pages <= 0)
+               return;
+
+       diag204_buf = vmalloc(PAGE_SIZE * pages);
+       if (!diag204_buf)
+               return;
+
+       r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
+                   (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
+       if (r < 0)
+               goto out;
+
+       diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+       if (!diag224_buf || diag224(diag224_buf))
+               goto out;
+
+       ti_hdr = diag204_buf;
+       part_block = diag204_buf + sizeof(*ti_hdr);
+
+       for (i = 0; i < ti_hdr->npar; i++) {
+               /*
+                * For the calling lpar we also need to get the cpu
+                * caps and weights. The time information block header
+                * specifies the offset to the partition block of the
+                * caller lpar, so we know when we process its data.
+                */
+               this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part;
+               part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf,
+                                         part_block);
+       }
+
+       phys_block = (struct diag204_x_phys_block *)part_block;
+       part_block = diag204_buf + ti_hdr->this_part;
+       if (part_block->hdr.mtid)
+               sctns->par.infpflg1 = PAR_MT_EN;
+
+       sctns->par.infpval1 |= PAR_GRP_VLD;
+       sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap);
+       sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap);
+       memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name,
+              sizeof(sctns->par.infplgnm));
+
+       sctns->par.infpscps = lpar_inf.cp.cpu_num_shd;
+       sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded;
+       sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd;
+       sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded;
+       sctns->par.infpval1 |= PAR_PCNT_VLD;
+
+       sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap);
+       sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap);
+       sctns->par.infpval1 |= PAR_ABS_VLD;
+
+       /*
+        * Everything below needs global performance data to be
+        * meaningful.
+        */
+       if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) {
+               sctns->hdr.infhflg1 |= HDR_PERF_UNAV;
+               goto out;
+       }
+
+       fill_diag_mac(sctns, phys_block, diag224_buf);
+
+       if (lpar_inf.cp.lpar_weight) {
+               sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 *
+                       lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight;
+       }
+
+       if (lpar_inf.ifl.lpar_weight) {
+               sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 *
+                       lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight;
+       }
+       sctns->par.infpval1 |= PAR_WGHT_VLD;
+
+out:
+       kfree(diag224_buf);
+       vfree(diag204_buf);
+}
+
+static int sthyi(u64 vaddr)
+{
+       register u64 code asm("0") = 0;
+       register u64 addr asm("2") = vaddr;
+       int cc;
+
+       asm volatile(
+               ".insn   rre,0xB2560000,%[code],%[addr]\n"
+               "ipm     %[cc]\n"
+               "srl     %[cc],28\n"
+               : [cc] "=d" (cc)
+               : [code] "d" (code), [addr] "a" (addr)
+               : "memory", "cc");
+       return cc;
+}
+
+int handle_sthyi(struct kvm_vcpu *vcpu)
+{
+       int reg1, reg2, r = 0;
+       u64 code, addr, cc = 0;
+       struct sthyi_sctns *sctns = NULL;
+
+       /*
+        * STHYI requires extensive locking in the higher hypervisors
+        * and is very computational/memory expensive. Therefore we
+        * ratelimit the executions per VM.
+        */
+       if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) {
+               kvm_s390_retry_instr(vcpu);
+               return 0;
+       }
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+       code = vcpu->run->s.regs.gprs[reg1];
+       addr = vcpu->run->s.regs.gprs[reg2];
+
+       vcpu->stat.instruction_sthyi++;
+       VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
+       trace_kvm_s390_handle_sthyi(vcpu, code, addr);
+
+       if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       if (code & 0xffff) {
+               cc = 3;
+               goto out;
+       }
+
+       /*
+        * If the page has not yet been faulted in, we want to do that
+        * now and not after all the expensive calculations.
+        */
+       r = write_guest(vcpu, addr, reg2, &cc, 1);
+       if (r)
+               return kvm_s390_inject_prog_cond(vcpu, r);
+
+       sctns = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!sctns)
+               return -ENOMEM;
+
+       /*
+        * If we are a guest, we don't want to emulate an emulated
+        * instruction. We ask the hypervisor to provide the data.
+        */
+       if (test_facility(74)) {
+               cc = sthyi((u64)sctns);
+               goto out;
+       }
+
+       fill_hdr(sctns);
+       fill_stsi(sctns);
+       fill_diag(sctns);
+
+out:
+       if (!cc) {
+               r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+               if (r) {
+                       free_page((unsigned long)sctns);
+                       return kvm_s390_inject_prog_cond(vcpu, r);
+               }
+       }
+
+       free_page((unsigned long)sctns);
+       vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0;
+       kvm_s390_set_psw_cc(vcpu, cc);
+       return r;
+}
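
Side note (illustration only, not part of the patch): scale_cap() in the new sthyi.c above converts diag204 capping values, reported in hundredths of a processor, into the 0..0x10000 per-core range expected in the STHYI response. A stand-alone sketch with hypothetical inputs:

#include <stdio.h>

/* Mirrors scale_cap() above: diag204 hundredths of a CPU -> 0..0x10000 per core. */
static unsigned int scale_cap(unsigned int in)
{
        return (0x10000 * in) / 100;
}

int main(void)
{
        printf("%#x\n", scale_cap(100)); /* one full core -> 0x10000 */
        printf("%#x\n", scale_cap(50));  /* half a core   -> 0x8000  */
        printf("%#x\n", scale_cap(250)); /* 2.5 cores     -> 0x28000 */
        return 0;
}
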
index 916834d..4fc9d4e 100644 (file)
@@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst,
            TP_fast_assign(
                    VCPU_ASSIGN_COMMON
                    ),
-           VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu")
+           VCPU_TP_PRINTK("%s", "storage key related instruction")
        );
 
 TRACE_EVENT(kvm_s390_major_guest_pfault,
@@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog,
                    __entry->code = code;
                    ),
 
-           VCPU_TP_PRINTK("intercepted program interruption %04x",
-                          __entry->code)
+           VCPU_TP_PRINTK("intercepted program interruption %04x (%s)",
+                          __entry->code,
+                          __print_symbolic(__entry->code,
+                                           icpt_prog_codes))
        );
 
 /*
@@ -412,6 +414,47 @@ TRACE_EVENT(kvm_s390_handle_stsi,
                           __entry->addr)
        );
 
+TRACE_EVENT(kvm_s390_handle_operexc,
+           TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+           TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(__u64, instruction)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->instruction = ((__u64)ipa << 48) |
+                   ((__u64)ipb << 16);
+                   ),
+
+           VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)",
+                          __entry->instruction,
+                          __print_symbolic(icpt_insn_decoder(__entry->instruction),
+                                           icpt_insn_codes))
+       );
+
+TRACE_EVENT(kvm_s390_handle_sthyi,
+           TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr),
+           TP_ARGS(VCPU_ARGS_COMMON, code, addr),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(u64, code)
+                   __field(u64, addr)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->code = code;
+                   __entry->addr = addr;
+                   ),
+
+           VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx",
+                          __entry->code, __entry->addr)
+       );
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
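
Side note (illustration only, not part of the patch): the new kvm_s390_handle_operexc trace event above packs the 2-byte ipa and 4-byte ipb intercept fields into a single 64-bit instruction image (ipa in bits 63-48, ipb in bits 47-16). A small stand-alone sketch with hypothetical field values:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the TP_fast_assign above: instruction = (ipa << 48) | (ipb << 16). */
static uint64_t pack_insn(uint16_t ipa, uint32_t ipb)
{
        return ((uint64_t)ipa << 48) | ((uint64_t)ipb << 16);
}

int main(void)
{
        /* e.g. ipa 0xb256 with a hypothetical ipb of 0x12340000 */
        printf("%016llx\n", (unsigned long long)pack_insn(0xb256, 0x12340000));
        /* prints b256123400000000 */
        return 0;
}
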
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
new file mode 100644 (file)
index 0000000..c106488
--- /dev/null
@@ -0,0 +1,1091 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include <asm/dis.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+struct vsie_page {
+       struct kvm_s390_sie_block scb_s;        /* 0x0000 */
+       /* the pinned original scb */
+       struct kvm_s390_sie_block *scb_o;       /* 0x0200 */
+       /* the shadow gmap in use by the vsie_page */
+       struct gmap *gmap;                      /* 0x0208 */
+       /* address of the last reported fault to guest2 */
+       unsigned long fault_addr;               /* 0x0210 */
+       __u8 reserved[0x0700 - 0x0218];         /* 0x0218 */
+       struct kvm_s390_crypto_cb crycb;        /* 0x0700 */
+       __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
+} __packed;
+
+/* trigger a validity icpt for the given scb */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+                            __u16 reason_code)
+{
+       scb->ipa = 0x1000;
+       scb->ipb = ((__u32) reason_code) << 16;
+       scb->icptcode = ICPT_VALIDITY;
+       return 1;
+}
+
+/* mark the prefix as unmapped, this will block the VSIE */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+       atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* mark the prefix as unmapped and wait until the VSIE has been left */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+       prefix_unmapped(vsie_page);
+       if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+               atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+       while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+               cpu_relax();
+}
+
+/* mark the prefix as mapped, this will allow the VSIE to run */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+       atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* test if the prefix is mapped into the gmap shadow */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+       return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
+}
+
+/* copy the updated intervention request bits into the shadow scb */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+       const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+       int cpuflags;
+
+       cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
+       atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
+       atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
+}
+
+/* shadow (filter and validate) the cpuflags  */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
+
+       /* we don't allow ESA/390 guests */
+       if (!(cpuflags & CPUSTAT_ZARCH))
+               return set_validity_icpt(scb_s, 0x0001U);
+
+       if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
+               return set_validity_icpt(scb_s, 0x0001U);
+       else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+               return set_validity_icpt(scb_s, 0x0007U);
+
+       /* intervention requests will be set later */
+       newflags = CPUSTAT_ZARCH;
+       if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
+               newflags |= CPUSTAT_GED;
+       if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
+               if (cpuflags & CPUSTAT_GED)
+                       return set_validity_icpt(scb_s, 0x0001U);
+               newflags |= CPUSTAT_GED2;
+       }
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+               newflags |= cpuflags & CPUSTAT_P;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+               newflags |= cpuflags & CPUSTAT_SM;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
+               newflags |= cpuflags & CPUSTAT_IBS;
+
+       atomic_set(&scb_s->cpuflags, newflags);
+       return 0;
+}
+
+/*
+ * Create a shadow copy of the crycb block and setup key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2.
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+       unsigned long *b1, *b2;
+       u8 ecb3_flags;
+
+       scb_s->crycbd = 0;
+       if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+               return 0;
+       /* format-1 is supported with message-security-assist extension 3 */
+       if (!test_kvm_facility(vcpu->kvm, 76))
+               return 0;
+       /* we may only allow it if enabled for guest 2 */
+       ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+                    (ECB3_AES | ECB3_DEA);
+       if (!ecb3_flags)
+               return 0;
+
+       if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+               return set_validity_icpt(scb_s, 0x003CU);
+       else if (!crycb_addr)
+               return set_validity_icpt(scb_s, 0x0039U);
+
+       /* copy only the wrapping keys */
+       if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56))
+               return set_validity_icpt(scb_s, 0x0035U);
+
+       scb_s->ecb3 |= ecb3_flags;
+       scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+                       CRYCB_FORMAT2;
+
+       /* xor both blocks in one run */
+       b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+       b2 = (unsigned long *)
+                           vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+       /* as 56%8 == 0, bitmap_xor won't overwrite any data */
+       bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+       return 0;
+}
+
+/* shadow (round up/down) the ibc to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
+
+       scb_s->ibc = 0;
+       /* ibc installed in g2 and requested for g3 */
+       if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
+               scb_s->ibc = scb_o->ibc & 0x0fffU;
+               /* take care of the minimum ibc level of the machine */
+               if (scb_s->ibc < min_ibc)
+                       scb_s->ibc = min_ibc;
+               /* take care of the maximum ibc level set for the guest */
+               if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+                       scb_s->ibc = vcpu->kvm->arch.model.ibc;
+       }
+}
+
+/* unshadow the scb, copying parameters back to the real scb */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+       /* interception */
+       scb_o->icptcode = scb_s->icptcode;
+       scb_o->icptstatus = scb_s->icptstatus;
+       scb_o->ipa = scb_s->ipa;
+       scb_o->ipb = scb_s->ipb;
+       scb_o->gbea = scb_s->gbea;
+
+       /* timer */
+       scb_o->cputm = scb_s->cputm;
+       scb_o->ckc = scb_s->ckc;
+       scb_o->todpr = scb_s->todpr;
+
+       /* guest state */
+       scb_o->gpsw = scb_s->gpsw;
+       scb_o->gg14 = scb_s->gg14;
+       scb_o->gg15 = scb_s->gg15;
+       memcpy(scb_o->gcr, scb_s->gcr, 128);
+       scb_o->pp = scb_s->pp;
+
+       /* interrupt intercept */
+       switch (scb_s->icptcode) {
+       case ICPT_PROGI:
+       case ICPT_INSTPROGI:
+       case ICPT_EXTINT:
+               memcpy((void *)((u64)scb_o + 0xc0),
+                      (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+               break;
+       case ICPT_PARTEXEC:
+               /* MVPG only */
+               memcpy((void *)((u64)scb_o + 0xc0),
+                      (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+               break;
+       }
+
+       if (scb_s->ihcpu != 0xffffU)
+               scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Setup the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       bool had_tx = scb_s->ecb & 0x10U;
+       unsigned long new_mso = 0;
+       int rc;
+
+       /* make sure we don't have any leftovers when reusing the scb */
+       scb_s->icptcode = 0;
+       scb_s->eca = 0;
+       scb_s->ecb = 0;
+       scb_s->ecb2 = 0;
+       scb_s->ecb3 = 0;
+       scb_s->ecd = 0;
+       scb_s->fac = 0;
+
+       rc = prepare_cpuflags(vcpu, vsie_page);
+       if (rc)
+               goto out;
+
+       /* timer */
+       scb_s->cputm = scb_o->cputm;
+       scb_s->ckc = scb_o->ckc;
+       scb_s->todpr = scb_o->todpr;
+       scb_s->epoch = scb_o->epoch;
+
+       /* guest state */
+       scb_s->gpsw = scb_o->gpsw;
+       scb_s->gg14 = scb_o->gg14;
+       scb_s->gg15 = scb_o->gg15;
+       memcpy(scb_s->gcr, scb_o->gcr, 128);
+       scb_s->pp = scb_o->pp;
+
+       /* interception / execution handling */
+       scb_s->gbea = scb_o->gbea;
+       scb_s->lctl = scb_o->lctl;
+       scb_s->svcc = scb_o->svcc;
+       scb_s->ictl = scb_o->ictl;
+       /*
+        * SKEY handling functions can't deal with false setting of PTE invalid
+        * bits. Therefore we cannot provide interpretation and would later
+        * have to provide own emulation handlers.
+        * have to provide our own emulation handlers.
+       scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+       scb_s->icpua = scb_o->icpua;
+
+       if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+               new_mso = scb_o->mso & 0xfffffffffff00000UL;
+       /* if the hva of the prefix changes, we have to remap the prefix */
+       if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+               prefix_unmapped(vsie_page);
+        /* SIE will do mso/msl validity and exception checks for us */
+       scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+       scb_s->mso = new_mso;
+       scb_s->prefix = scb_o->prefix;
+
+       /* We definitely have to flush the tlb if this scb never ran */
+       if (scb_s->ihcpu != 0xffffU)
+               scb_s->ihcpu = scb_o->ihcpu;
+
+       /* MVPG and Protection Exception Interpretation are always available */
+       scb_s->eca |= scb_o->eca & 0x01002000U;
+       /* Host-protection-interruption introduced with ESOP */
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+               scb_s->ecb |= scb_o->ecb & 0x02U;
+       /* transactional execution */
+       if (test_kvm_facility(vcpu->kvm, 73)) {
+               /* remap the prefix if tx is toggled on */
+               if ((scb_o->ecb & 0x10U) && !had_tx)
+                       prefix_unmapped(vsie_page);
+               scb_s->ecb |= scb_o->ecb & 0x10U;
+       }
+       /* SIMD */
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               scb_s->eca |= scb_o->eca & 0x00020000U;
+               scb_s->ecd |= scb_o->ecd & 0x20000000U;
+       }
+       /* Run-time-Instrumentation */
+       if (test_kvm_facility(vcpu->kvm, 64))
+               scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+               scb_s->eca |= scb_o->eca & 0x00000001U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+               scb_s->eca |= scb_o->eca & 0x40000000U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+               scb_s->eca |= scb_o->eca & 0x80000000U;
+
+       prepare_ibc(vcpu, vsie_page);
+       rc = shadow_crycb(vcpu, vsie_page);
+out:
+       if (rc)
+               unshadow_scb(vcpu, vsie_page);
+       return rc;
+}
+
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+                                unsigned long end)
+{
+       struct kvm *kvm = gmap->private;
+       struct vsie_page *cur;
+       unsigned long prefix;
+       struct page *page;
+       int i;
+
+       if (!gmap_is_shadow(gmap))
+               return;
+       if (start >= 1UL << 31)
+               /* We are only interested in prefix pages */
+               return;
+
+       /*
+        * Only new shadow blocks are added to the list during runtime,
+        * therefore we can safely reference them all the time.
+        */
+       for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+               page = READ_ONCE(kvm->arch.vsie.pages[i]);
+               if (!page)
+                       continue;
+               cur = page_to_virt(page);
+               if (READ_ONCE(cur->gmap) != gmap)
+                       continue;
+               prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
+               /* with mso/msl, the prefix lies at an offset */
+               prefix += cur->scb_s.mso;
+               if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+                       prefix_unmapped_sync(cur);
+       }
+}
+
+/*
+ * Map the first prefix page and if tx is enabled also the second prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * Returns: - 0 if successfully mapped or already mapped
+ *          - > 0 if control has to be given to guest 2
+ *          - -EAGAIN if the caller can retry immediately
+ *          - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+       int rc;
+
+       if (prefix_is_mapped(vsie_page))
+               return 0;
+
+       /* mark it as mapped so we can catch any concurrent unmappers */
+       prefix_mapped(vsie_page);
+
+       /* with mso/msl, the prefix lies at offset *mso* */
+       prefix += scb_s->mso;
+
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+       if (!rc && (scb_s->ecb & 0x10U))
+               rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+                                          prefix + PAGE_SIZE);
+       /*
+        * We don't have to mprotect, we will be called for all unshadows.
+        * SIE will detect if protection applies and trigger a validity.
+        */
+       if (rc)
+               prefix_unmapped(vsie_page);
+       if (rc > 0 || rc == -EFAULT)
+               rc = set_validity_icpt(scb_s, 0x0037U);
+       return rc;
+}
+
+/*
+ * Pin the guest page given by gpa and set hpa to the pinned host address.
+ * Will always be pinned writable.
+ *
+ * Returns: - 0 on success
+ *          - -EINVAL if the gpa is not valid guest storage
+ *          - -ENOMEM if out of memory
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+       struct page *page;
+       hva_t hva;
+       int rc;
+
+       hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+       if (kvm_is_error_hva(hva))
+               return -EINVAL;
+       rc = get_user_pages_fast(hva, 1, 1, &page);
+       if (rc < 0)
+               return rc;
+       else if (rc != 1)
+               return -ENOMEM;
+       *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+       return 0;
+}
+
+/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+       struct page *page;
+
+       page = virt_to_page(hpa);
+       set_page_dirty_lock(page);
+       put_page(page);
+       /* mark the page always as dirty for migration */
+       mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       hpa_t hpa;
+       gpa_t gpa;
+
+       hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+       if (hpa) {
+               gpa = scb_o->scaol & ~0xfUL;
+               if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+                       gpa |= (u64) scb_o->scaoh << 32;
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               scb_s->scaol = 0;
+               scb_s->scaoh = 0;
+       }
+
+       hpa = scb_s->itdba;
+       if (hpa) {
+               gpa = scb_o->itdba & ~0xffUL;
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               scb_s->itdba = 0;
+       }
+
+       hpa = scb_s->gvrd;
+       if (hpa) {
+               gpa = scb_o->gvrd & ~0x1ffUL;
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               scb_s->gvrd = 0;
+       }
+
+       hpa = scb_s->riccbd;
+       if (hpa) {
+               gpa = scb_o->riccbd & ~0x3fUL;
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+               scb_s->riccbd = 0;
+       }
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bit long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * Returns: - 0 if all blocks were pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       hpa_t hpa;
+       gpa_t gpa;
+       int rc = 0;
+
+       gpa = scb_o->scaol & ~0xfUL;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+               gpa |= (u64) scb_o->scaoh << 32;
+       if (gpa) {
+               if (!(gpa & ~0x1fffUL))
+                       rc = set_validity_icpt(scb_s, 0x0038U);
+               else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+                       rc = set_validity_icpt(scb_s, 0x0011U);
+               else if ((gpa & PAGE_MASK) !=
+                        ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+                       rc = set_validity_icpt(scb_s, 0x003bU);
+               if (!rc) {
+                       rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+                       if (rc == -EINVAL)
+                               rc = set_validity_icpt(scb_s, 0x0034U);
+               }
+               if (rc)
+                       goto unpin;
+               scb_s->scaoh = (u32)((u64)hpa >> 32);
+               scb_s->scaol = (u32)(u64)hpa;
+       }
+
+       gpa = scb_o->itdba & ~0xffUL;
+       if (gpa && (scb_s->ecb & 0x10U)) {
+               if (!(gpa & ~0x1fffU)) {
+                       rc = set_validity_icpt(scb_s, 0x0080U);
+                       goto unpin;
+               }
+               /* 256 bytes cannot cross page boundaries */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x0080U);
+               if (rc)
+                       goto unpin;
+               scb_s->itdba = hpa;
+       }
+
+       gpa = scb_o->gvrd & ~0x1ffUL;
+       if (gpa && (scb_s->eca & 0x00020000U) &&
+           !(scb_s->ecd & 0x20000000U)) {
+               if (!(gpa & ~0x1fffUL)) {
+                       rc = set_validity_icpt(scb_s, 0x1310U);
+                       goto unpin;
+               }
+               /*
+                * 512-byte vector registers cannot cross page boundaries;
+                * if this block gets bigger, we have to shadow it.
+                */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x1310U);
+               if (rc)
+                       goto unpin;
+               scb_s->gvrd = hpa;
+       }
+
+       gpa = scb_o->riccbd & ~0x3fUL;
+       if (gpa && (scb_s->ecb3 & 0x01U)) {
+               if (!(gpa & ~0x1fffUL)) {
+                       rc = set_validity_icpt(scb_s, 0x0043U);
+                       goto unpin;
+               }
+               /* 64 bytes cannot cross page boundaries */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x0043U);
+               /* Validity 0x0044 will be checked by SIE */
+               if (rc)
+                       goto unpin;
+               scb_s->riccbd = hpa;
+       }
+       return 0;
+unpin:
+       unpin_blocks(vcpu, vsie_page);
+       return rc;
+}
+
+/* unpin the scb provided by guest 2, marking it as dirty */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+                     gpa_t gpa)
+{
+       hpa_t hpa = (hpa_t) vsie_page->scb_o;
+
+       if (hpa)
+               unpin_guest_page(vcpu->kvm, gpa, hpa);
+       vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
+ *
+ * Returns: - 0 if the scb was pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+                  gpa_t gpa)
+{
+       hpa_t hpa;
+       int rc;
+
+       rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+       if (rc == -EINVAL) {
+               rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+               if (!rc)
+                       rc = 1;
+       }
+       if (!rc)
+               vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+       return rc;
+}
+
+/*
+ * Inject a fault into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred during injection.
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+                       bool write_flag)
+{
+       struct kvm_s390_pgm_info pgm = {
+               .code = code,
+               .trans_exc_code =
+                       /* 0-51: virtual address */
+                       (vaddr & 0xfffffffffffff000UL) |
+                       /* 52-53: store / fetch */
+                       (((unsigned int) !write_flag) + 1) << 10,
+                       /* 62-63: asce id (always primary == 0) */
+               .exc_access_id = 0, /* always primary */
+               .op_access_id = 0, /* not MVPG */
+       };
+       int rc;
+
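+       /* bit 61: set for protection exceptions */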
+       if (code == PGM_PROTECTION)
+               pgm.trans_exc_code |= 0x4UL;
+
+       rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+       return rc ? rc : 1;
+}
+
+/*
+ * Handle a fault during vsie execution on a gmap shadow.
+ *
+ * Returns: - 0 if the fault was resolved
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       int rc;
+
+       if (current->thread.gmap_int_code == PGM_PROTECTION)
+               /* we can directly forward all protection exceptions */
+               return inject_fault(vcpu, PGM_PROTECTION,
+                                   current->thread.gmap_addr, 1);
+
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+                                  current->thread.gmap_addr);
+       if (rc > 0) {
+               rc = inject_fault(vcpu, rc,
+                                 current->thread.gmap_addr,
+                                 current->thread.gmap_write_flag);
+               if (rc >= 0)
+                       vsie_page->fault_addr = current->thread.gmap_addr;
+       }
+       return rc;
+}
+
+/*
+ * Retry the previous fault that required guest 2 intervention. This avoids
+ * one superfluous SIE re-entry and direct exit.
+ *
+ * Will ignore any errors. The next SIE fault will do proper fault handling.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+                             struct vsie_page *vsie_page)
+{
+       if (vsie_page->fault_addr)
+               kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+                                     vsie_page->fault_addr);
+       vsie_page->fault_addr = 0;
+}
+
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+       vsie_page->scb_s.icptcode = 0;
+}
+
+/* rewind the psw and clear the vsie icpt, so we can retry execution */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       int ilen = insn_length(scb_s->ipa >> 8);
+
+       /* take care of EXECUTE instructions */
+       if (scb_s->icptstatus & 1) {
+               ilen = (scb_s->icptstatus >> 4) & 0x6;
+               if (!ilen)
+                       ilen = 4;
+       }
+       scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+       clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the guest 2 provided facility list.
+ * Retry instruction execution if enabled for and provided by guest 2.
+ *
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ *          - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
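+       /* fac is the guest 2 facility list origin (31-bit, 8 byte aligned) */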
+       if (fac && test_kvm_facility(vcpu->kvm, 7)) {
+               retry_vsie_icpt(vsie_page);
+               if (read_guest_real(vcpu, fac, &vsie_page->fac,
+                                   sizeof(vsie_page->fac)))
+                       return set_validity_icpt(scb_s, 0x1090U);
+               scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+       }
+       return 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * Returns: - 0 everything went fine
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       int rc;
+
+       handle_last_fault(vcpu, vsie_page);
+
+       if (need_resched())
+               schedule();
+       if (test_cpu_flag(CIF_MCCK_PENDING))
+               s390_handle_mcck();
+
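+       /* drop kvm->srcu while running guest 3; it is re-taken after the exit */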
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+       local_irq_disable();
+       guest_enter_irqoff();
+       local_irq_enable();
+
+       rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+       local_irq_disable();
+       guest_exit_irqoff();
+       local_irq_enable();
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       if (rc > 0)
+               rc = 0; /* we could still have an icpt */
+       else if (rc == -EFAULT)
+               return handle_fault(vcpu, vsie_page);
+
+       switch (scb_s->icptcode) {
+       case ICPT_INST:
+               if (scb_s->ipa == 0xb2b0)
+                       rc = handle_stfle(vcpu, vsie_page);
+               break;
+       case ICPT_STOP:
+               /* stop not requested by g2 - must have been a kick */
+               if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+                       clear_vsie_icpt(vsie_page);
+               break;
+       case ICPT_VALIDITY:
+               if ((scb_s->ipa & 0xf000) != 0xf000)
+                       scb_s->ipa += 0x1000;
+               break;
+       }
+       return rc;
+}
+
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+       if (vsie_page->gmap)
+               gmap_put(vsie_page->gmap);
+       WRITE_ONCE(vsie_page->gmap, NULL);
+       prefix_unmapped(vsie_page);
+}
+
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+                              struct vsie_page *vsie_page)
+{
+       unsigned long asce;
+       union ctlreg0 cr0;
+       struct gmap *gmap;
+       int edat;
+
+       asce = vcpu->arch.sie_block->gcr[1];
+       cr0.val = vcpu->arch.sie_block->gcr[0];
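+       /* edat level: 0, 1 (facility 8) or 2 (facilities 8 and 78) */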
+       edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+       edat += edat && test_kvm_facility(vcpu->kvm, 78);
+
+       /*
+        * ASCE or EDAT could have changed since last icpt, or the gmap
+        * we're holding has been unshadowed. If the gmap is still valid,
+        * we can safely reuse it.
+        */
+       if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+               return 0;
+
+       /* release the old shadow - if any, and mark the prefix as unmapped */
+       release_gmap_shadow(vsie_page);
+       gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+       if (IS_ERR(gmap))
+               return PTR_ERR(gmap);
+       gmap->private = vcpu->kvm;
+       WRITE_ONCE(vsie_page->gmap, gmap);
+       return 0;
+}
+
+/*
+ * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+                               struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+       WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+       /*
+        * External calls have to lead to a kick of the vcpu and
+        * therefore the vsie -> Simulate Wait state.
+        */
+       atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+       /*
+        * We have to adjust the g3 epoch by the g2 epoch. The epoch will
+        * automatically be adjusted on tod clock changes via kvm_sync_clock.
+        */
+       preempt_disable();
+       scb_s->epoch += vcpu->kvm->arch.epoch;
+       preempt_enable();
+}
+
+/*
+ * Unregister a shadow scb from a VCPU.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+       atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+       WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * Returns: - 0 if no errors occurred
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       int rc = 0;
+
+       while (1) {
+               rc = acquire_gmap_shadow(vcpu, vsie_page);
+               if (!rc)
+                       rc = map_prefix(vcpu, vsie_page);
+               if (!rc) {
+                       gmap_enable(vsie_page->gmap);
+                       update_intervention_requests(vsie_page);
+                       rc = do_vsie_run(vcpu, vsie_page);
+                       gmap_enable(vcpu->arch.gmap);
+               }
+               atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
+
+               if (rc == -EAGAIN)
+                       rc = 0;
+               if (rc || scb_s->icptcode || signal_pending(current) ||
+                   kvm_s390_vcpu_has_irq(vcpu, 0))
+                       break;
+       }
+
+       if (rc == -EFAULT) {
+               /*
+                * Addressing exceptions are always presented as intercepts.
+                * As addressing exceptions are suppressing and our guest 3 PSW
+                * points at the responsible instruction, we have to
+                * forward the PSW and set the ilc. If we can't read guest 3
+                * instruction, we can use an arbitrary ilc. Let's always use
+                * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+                * memory. (we could also fake the shadow so the hardware
+                * handles it).
+                */
+               scb_s->icptcode = ICPT_PROGI;
+               scb_s->iprcc = PGM_ADDRESSING;
+               scb_s->pgmilc = 4;
+               scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+       }
+       return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ *          - NULL if the same scb address is already used by another VCPU
+ *          - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+       struct vsie_page *vsie_page;
+       struct page *page;
+       int nr_vcpus;
+
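+       /* scb addresses are 512 byte aligned, so addr >> 9 is a unique index */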
+       rcu_read_lock();
+       page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+       rcu_read_unlock();
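+       /* a cached but unused page has refcount 1; reaching 2 means we own it */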
+       if (page) {
+               if (page_ref_inc_return(page) == 2)
+                       return page_to_virt(page);
+               page_ref_dec(page);
+       }
+
+       /*
+        * We want at least #online_vcpus shadows, so every VCPU can execute
+        * the VSIE in parallel.
+        */
+       nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+       mutex_lock(&kvm->arch.vsie.mutex);
+       if (kvm->arch.vsie.page_count < nr_vcpus) {
+               page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+               if (!page) {
+                       mutex_unlock(&kvm->arch.vsie.mutex);
+                       return ERR_PTR(-ENOMEM);
+               }
+               page_ref_inc(page);
+               kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+               kvm->arch.vsie.page_count++;
+       } else {
+               /* reuse an existing entry that belongs to nobody */
+               while (true) {
+                       page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+                       if (page_ref_inc_return(page) == 2)
+                               break;
+                       page_ref_dec(page);
+                       kvm->arch.vsie.next++;
+                       kvm->arch.vsie.next %= nr_vcpus;
+               }
+               radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+       }
+       page->index = addr;
+       /* double use of the same address */
+       if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+               page_ref_dec(page);
+               mutex_unlock(&kvm->arch.vsie.mutex);
+               return NULL;
+       }
+       mutex_unlock(&kvm->arch.vsie.mutex);
+
+       vsie_page = page_to_virt(page);
+       memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+       release_gmap_shadow(vsie_page);
+       vsie_page->fault_addr = 0;
+       vsie_page->scb_s.ihcpu = 0xffffU;
+       return vsie_page;
+}
+
+/* put a vsie page acquired via get_vsie_page */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+       struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
+
+       page_ref_dec(page);
+}
+
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+       struct vsie_page *vsie_page;
+       unsigned long scb_addr;
+       int rc;
+
+       vcpu->stat.instruction_sie++;
+       if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+               return -EOPNOTSUPP;
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+       BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+       scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+       /* 512 byte alignment */
+       if (unlikely(scb_addr & 0x1ffUL))
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+               return 0;
+
+       vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+       if (IS_ERR(vsie_page))
+               return PTR_ERR(vsie_page);
+       else if (!vsie_page)
+               /* double use of sie control block - simply do nothing */
+               return 0;
+
+       rc = pin_scb(vcpu, vsie_page, scb_addr);
+       if (rc)
+               goto out_put;
+       rc = shadow_scb(vcpu, vsie_page);
+       if (rc)
+               goto out_unpin_scb;
+       rc = pin_blocks(vcpu, vsie_page);
+       if (rc)
+               goto out_unshadow;
+       register_shadow_scb(vcpu, vsie_page);
+       rc = vsie_run(vcpu, vsie_page);
+       unregister_shadow_scb(vcpu);
+       unpin_blocks(vcpu, vsie_page);
+out_unshadow:
+       unshadow_scb(vcpu, vsie_page);
+out_unpin_scb:
+       unpin_scb(vcpu, vsie_page, scb_addr);
+out_put:
+       put_vsie_page(vcpu->kvm, vsie_page);
+
+       return rc < 0 ? rc : 0;
+}
+
+/* Init the vsie data structures. To be called when a vm is initialized. */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+       mutex_init(&kvm->arch.vsie.mutex);
+       INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+}
+
+/* Destroy the vsie data structures. To be called when a vm is destroyed. */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+       struct vsie_page *vsie_page;
+       struct page *page;
+       int i;
+
+       mutex_lock(&kvm->arch.vsie.mutex);
+       for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+               page = kvm->arch.vsie.pages[i];
+               kvm->arch.vsie.pages[i] = NULL;
+               vsie_page = page_to_virt(page);
+               release_gmap_shadow(vsie_page);
+               /* free the radix tree entry */
+               radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+               __free_page(page);
+       }
+       kvm->arch.vsie.page_count = 0;
+       mutex_unlock(&kvm->arch.vsie.mutex);
+}
+
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
+
+       /*
+        * Even if the VCPU lets go of the shadow sie block reference, it is
+        * still valid in the cache. So we can safely kick it.
+        */
+       if (scb) {
+               atomic_or(PROG_BLOCK_SIE, &scb->prog20);
+               if (scb->prog0c & PROG_IN_SIE)
+                       atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
+       }
+}
index 25783dc..a58bca6 100644 (file)
@@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
                (struct gmap *) S390_lowcore.gmap : NULL;
        if (gmap) {
                current->thread.gmap_addr = address;
+               current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+               current->thread.gmap_int_code = regs->int_code & 0xffff;
                address = __gmap_translate(gmap, address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
index 063c721..2ce6bb3 100644 (file)
 #include <asm/gmap.h>
 #include <asm/tlb.h>
 
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
 /**
- * gmap_alloc - allocate a guest address space
+ * gmap_alloc - allocate and initialize a guest address space
  * @mm: pointer to the parent mm_struct
  * @limit: maximum address of the gmap address space
  *
  * Returns a guest address space structure.
  */
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static struct gmap *gmap_alloc(unsigned long limit)
 {
        struct gmap *gmap;
        struct page *page;
@@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
+       INIT_LIST_HEAD(&gmap->children);
+       INIT_LIST_HEAD(&gmap->pt_list);
        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+       INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
        spin_lock_init(&gmap->guest_table_lock);
-       gmap->mm = mm;
+       spin_lock_init(&gmap->shadow_lock);
+       atomic_set(&gmap->ref_count, 1);
        page = alloc_pages(GFP_KERNEL, 2);
        if (!page)
                goto out_free;
@@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
        gmap->asce = atype | _ASCE_TABLE_LENGTH |
                _ASCE_USER_BITS | __pa(table);
        gmap->asce_end = limit;
-       down_write(&mm->mmap_sem);
-       list_add(&gmap->list, &mm->context.gmap_list);
-       up_write(&mm->mmap_sem);
        return gmap;
 
 out_free:
@@ -80,7 +83,28 @@ out_free:
 out:
        return NULL;
 }
-EXPORT_SYMBOL_GPL(gmap_alloc);
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+       struct gmap *gmap;
+
+       gmap = gmap_alloc(limit);
+       if (!gmap)
+               return NULL;
+       gmap->mm = mm;
+       spin_lock(&mm->context.gmap_lock);
+       list_add_rcu(&gmap->list, &mm->context.gmap_list);
+       spin_unlock(&mm->context.gmap_lock);
+       return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
 
 static void gmap_flush_tlb(struct gmap *gmap)
 {
@@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
        } while (nr > 0);
 }
 
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+       struct gmap_rmap *rmap, *rnext, *head;
+       struct radix_tree_iter iter;
+       unsigned long indices[16];
+       unsigned long index;
+       void **slot;
+       int i, nr;
+
+       /* A radix tree is freed by deleting all of its entries */
+       index = 0;
+       do {
+               nr = 0;
+               radix_tree_for_each_slot(slot, root, &iter, index) {
+                       indices[nr] = iter.index;
+                       if (++nr == 16)
+                               break;
+               }
+               for (i = 0; i < nr; i++) {
+                       index = indices[i];
+                       head = radix_tree_delete(root, index);
+                       gmap_for_each_rmap_safe(rmap, rnext, head)
+                               kfree(rmap);
+               }
+       } while (nr > 0);
+}
+
 /**
  * gmap_free - free a guest address space
  * @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
  */
-void gmap_free(struct gmap *gmap)
+static void gmap_free(struct gmap *gmap)
 {
        struct page *page, *next;
 
-       /* Flush tlb. */
-       if (MACHINE_HAS_IDTE)
-               __tlb_flush_idte(gmap->asce);
-       else
-               __tlb_flush_global();
-
+       /* Flush tlb of all gmaps (if not already done for shadows) */
+       if (!(gmap_is_shadow(gmap) && gmap->removed))
+               gmap_flush_tlb(gmap);
        /* Free all segment & region tables. */
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
                __free_pages(page, 2);
        gmap_radix_tree_free(&gmap->guest_to_host);
        gmap_radix_tree_free(&gmap->host_to_guest);
-       down_write(&gmap->mm->mmap_sem);
-       list_del(&gmap->list);
-       up_write(&gmap->mm->mmap_sem);
+
+       /* Free additional data for a shadow gmap */
+       if (gmap_is_shadow(gmap)) {
+               /* Free all page tables. */
+               list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+                       page_table_free_pgste(page);
+               gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+               /* Release reference to the parent */
+               gmap_put(gmap->parent);
+       }
+
        kfree(gmap);
 }
-EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+       atomic_inc(&gmap->ref_count);
+       return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * If the reference counter reaches zero the guest address space is freed.
+ */
+void gmap_put(struct gmap *gmap)
+{
+       if (atomic_dec_return(&gmap->ref_count) == 0)
+               gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_remove(struct gmap *gmap)
+{
+       struct gmap *sg, *next;
+
+       /* Remove all shadow gmaps linked to this gmap */
+       if (!list_empty(&gmap->children)) {
+               spin_lock(&gmap->shadow_lock);
+               list_for_each_entry_safe(sg, next, &gmap->children, list) {
+                       list_del(&sg->list);
+                       gmap_put(sg);
+               }
+               spin_unlock(&gmap->shadow_lock);
+       }
+       /* Remove gmap from the per-mm list */
+       spin_lock(&gmap->mm->context.gmap_lock);
+       list_del_rcu(&gmap->list);
+       spin_unlock(&gmap->mm->context.gmap_lock);
+       synchronize_rcu();
+       /* Put reference */
+       gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
 
 /**
  * gmap_enable - switch primary space to the guest address space
@@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap)
 }
 EXPORT_SYMBOL_GPL(gmap_disable);
 
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns a pointer to the currently enabled gmap. NULL if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+       return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
 /*
  * gmap_alloc_table is assumed to be called with mmap_sem held
  */
@@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                return -ENOMEM;
        new = (unsigned long *) page_to_phys(page);
        crst_table_init(new, init);
-       spin_lock(&gmap->mm->page_table_lock);
+       spin_lock(&gmap->guest_table_lock);
        if (*table & _REGION_ENTRY_INVALID) {
                list_add(&page->lru, &gmap->crst_list);
                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                page->index = gaddr;
                page = NULL;
        }
-       spin_unlock(&gmap->mm->page_table_lock);
+       spin_unlock(&gmap->guest_table_lock);
        if (page)
                __free_pages(page, 2);
        return 0;
@@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
        unsigned long *entry;
        int flush = 0;
 
+       BUG_ON(gmap_is_shadow(gmap));
        spin_lock(&gmap->guest_table_lock);
        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
        if (entry) {
@@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
        unsigned long off;
        int flush;
 
+       BUG_ON(gmap_is_shadow(gmap));
        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
@@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
        unsigned long off;
        int flush;
 
+       BUG_ON(gmap_is_shadow(gmap));
        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len < from || to + len < to ||
@@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
  * This function does not establish potentially missing page table entries.
  * The mmap_sem of the mm that belongs to the address space must be held
  * when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
  */
 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 {
@@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 
        vmaddr = (unsigned long)
                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+       /* Note: guest_to_host is empty for a shadow gmap */
        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
 }
 EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
        struct gmap *gmap;
        int flush;
 
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
                if (flush)
                        gmap_flush_tlb(gmap);
        }
+       rcu_read_unlock();
 }
 
 /**
@@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
        pmd_t *pmd;
        int rc;
 
+       BUG_ON(gmap_is_shadow(gmap));
        /* Create higher level tables in the gmap page table */
        table = gmap->table;
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -552,116 +682,1412 @@ static LIST_HEAD(gmap_notifier_list);
 static DEFINE_SPINLOCK(gmap_notifier_lock);
 
 /**
- * gmap_register_ipte_notifier - register a pte invalidation callback
+ * gmap_register_pte_notifier - register a pte invalidation callback
  * @nb: pointer to the gmap notifier block
  */
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
 {
        spin_lock(&gmap_notifier_lock);
-       list_add(&nb->list, &gmap_notifier_list);
+       list_add_rcu(&nb->list, &gmap_notifier_list);
        spin_unlock(&gmap_notifier_lock);
 }
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
 
 /**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
  * @nb: pointer to the gmap notifier block
  */
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
 {
        spin_lock(&gmap_notifier_lock);
-       list_del_init(&nb->list);
+       list_del_rcu(&nb->list);
        spin_unlock(&gmap_notifier_lock);
+       synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+                              unsigned long end)
+{
+       struct gmap_notifier *nb;
+
+       list_for_each_entry(nb, &gmap_notifier_list, list)
+               nb->notifier_call(gmap, start, end);
+}
+
+/**
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+                                            unsigned long gaddr, int level)
+{
+       unsigned long *table;
+
+       if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+               return NULL;
+       if (gmap_is_shadow(gmap) && gmap->removed)
+               return NULL;
+       if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+               return NULL;
+       table = gmap->table;
+       switch (gmap->asce & _ASCE_TYPE_MASK) {
+       case _ASCE_TYPE_REGION1:
+               table += (gaddr >> 53) & 0x7ff;
+               if (level == 4)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION2:
+               table += (gaddr >> 42) & 0x7ff;
+               if (level == 3)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION3:
+               table += (gaddr >> 31) & 0x7ff;
+               if (level == 2)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_SEGMENT:
+               table += (gaddr >> 20) & 0x7ff;
+               if (level == 1)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+               table += (gaddr >> 12) & 0xff;
+       }
+       return table;
+}
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ *                   and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+                              spinlock_t **ptl)
+{
+       unsigned long *table;
+
+       if (gmap_is_shadow(gmap))
+               spin_lock(&gmap->guest_table_lock);
+       /* Walk the gmap page table, lock and get pte pointer */
+       table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+       if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+               if (gmap_is_shadow(gmap))
+                       spin_unlock(&gmap->guest_table_lock);
+               return NULL;
+       }
+       if (gmap_is_shadow(gmap)) {
+               *ptl = &gmap->guest_table_lock;
+               return pte_offset_map((pmd_t *) table, gaddr);
+       }
+       return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+                            unsigned long vmaddr, int prot)
+{
+       struct mm_struct *mm = gmap->mm;
+       unsigned int fault_flags;
+       bool unlocked = false;
+
+       BUG_ON(gmap_is_shadow(gmap));
+       fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+       if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+               return -EFAULT;
+       if (unlocked)
+               /* lost mmap_sem, caller has to retry __gmap_translate */
+               return 0;
+       /* Connect the page tables */
+       return __gmap_link(gmap, gaddr, vmaddr);
 }
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
 
 /**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+       spin_unlock(ptl);
+}
+
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
  * @gmap: pointer to guest mapping meta data structure
  * @gaddr: virtual address in the guest address space
  * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
  *
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ *
+ * Note: Can also be called for shadow gmaps.
  */
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+                             unsigned long len, int prot, unsigned long bits)
 {
-       unsigned long addr;
+       unsigned long vmaddr;
        spinlock_t *ptl;
        pte_t *ptep;
-       bool unlocked;
-       int rc = 0;
+       int rc;
+
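+       /* if the pte cannot be protected yet, fix up the host page and retry */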
+       while (len) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+                       gmap_pte_op_end(ptl);
+               }
+               if (rc) {
+                       vmaddr = __gmap_translate(gmap, gaddr);
+                       if (IS_ERR_VALUE(vmaddr))
+                               return vmaddr;
+                       rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
+                       continue;
+               }
+               gaddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
+       }
+       return 0;
+}
+
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ *                        call the notifier if any pte changes again
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
+ */
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+                        unsigned long len, int prot)
+{
+       int rc;
 
-       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+               return -EINVAL;
+       if (!MACHINE_HAS_ESOP && prot == PROT_READ)
                return -EINVAL;
        down_read(&gmap->mm->mmap_sem);
-       while (len) {
-               unlocked = false;
-               /* Convert gmap address and connect the page tables */
-               addr = __gmap_translate(gmap, gaddr);
-               if (IS_ERR_VALUE(addr)) {
-                       rc = addr;
-                       break;
+       rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+       up_read(&gmap->mm->mmap_sem);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ *                   absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+       unsigned long address, vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep, pte;
+       int rc;
+
+       while (1) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       pte = *ptep;
+                       if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+                               address = pte_val(pte) & PAGE_MASK;
+                               address += gaddr & ~PAGE_MASK;
+                               *val = *(unsigned long *) address;
+                               pte_val(*ptep) |= _PAGE_YOUNG;
+                               /* Do *NOT* clear the _PAGE_INVALID bit! */
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
                }
-               /* Get the page mapped */
-               if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-                                    &unlocked)) {
-                       rc = -EFAULT;
+               if (!rc)
+                       break;
+               vmaddr = __gmap_translate(gmap, gaddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
                        break;
                }
-               /* While trying to map mmap_sem got unlocked. Let us retry */
-               if (unlocked)
-                       continue;
-               rc = __gmap_link(gmap, gaddr, addr);
+               rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
                if (rc)
                        break;
-               /* Walk the process page table, lock and get pte pointer */
-               ptep = get_locked_pte(gmap->mm, addr, &ptl);
-               VM_BUG_ON(!ptep);
-               /* Set notification bit in the pgste of the pte */
-               if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
-                       ptep_set_notify(gmap->mm, addr, ptep);
-                       gaddr += PAGE_SIZE;
-                       len -= PAGE_SIZE;
-               }
-               pte_unmap_unlock(ptep, ptl);
        }
-       up_read(&gmap->mm->mmap_sem);
        return rc;
 }
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+EXPORT_SYMBOL_GPL(gmap_read_table);
 
 /**
- * ptep_notify - call all invalidation callbacks for a specific pte.
- * @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
- * @pte: pointer to the page table entry
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
  *
- * This function is assumed to be called with the page table lock held
- * for the pte to notify.
+ * Called with the sg->guest_table_lock
  */
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+                                   struct gmap_rmap *rmap)
 {
-       unsigned long offset, gaddr;
-       unsigned long *table;
-       struct gmap_notifier *nb;
-       struct gmap *gmap;
+       void **slot;
 
-       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
-       offset = offset * (4096 / sizeof(pte_t));
-       spin_lock(&gmap_notifier_lock);
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-               table = radix_tree_lookup(&gmap->host_to_guest,
-                                         vmaddr >> PMD_SHIFT);
-               if (!table)
+       BUG_ON(!gmap_is_shadow(sg));
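+       /* chain the new rmap in front of any existing rmaps for this host page */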
+       slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+       if (slot) {
+               rmap->next = radix_tree_deref_slot_protected(slot,
+                                                       &sg->guest_table_lock);
+               radix_tree_replace_slot(slot, rmap);
+       } else {
+               rmap->next = NULL;
+               radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+                                 rmap);
+       }
+}
+
+/**
+ * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ * @prot: indicates access rights: none, read-only or read-write
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+                            unsigned long paddr, unsigned long len, int prot)
+{
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       while (len) {
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr))
+                       return vmaddr;
+               rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+               if (!rmap)
+                       return -ENOMEM;
+               rmap->raddr = raddr;
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc) {
+                       kfree(rmap);
+                       return rc;
+               }
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (ptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+                                            PGSTE_VSIE_BIT);
+                       if (!rc)
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                       spin_unlock(&sg->guest_table_lock);
+                       gmap_pte_op_end(ptl);
+               }
+               radix_tree_preload_end();
+               if (rc) {
+                       kfree(rmap);
+                       rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
                        continue;
-               gaddr = __gmap_segment_gaddr(table) + offset;
-               list_for_each_entry(nb, &gmap_notifier_list, list)
-                       nb->notifier_call(gmap, gaddr);
+               }
+               paddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
        }
-       spin_unlock(&gmap_notifier_lock);
+       return 0;
+}
+
+#define _SHADOW_RMAP_MASK      0x7
+#define _SHADOW_RMAP_REGION1   0x5
+#define _SHADOW_RMAP_REGION2   0x4
+#define _SHADOW_RMAP_REGION3   0x3
+#define _SHADOW_RMAP_SEGMENT   0x2
+#define _SHADOW_RMAP_PGTABLE   0x1
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,%0,%1,0,0"
+               : : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+       if (!table || *table & _PAGE_INVALID)
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+       ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *pgt)
+{
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       for (i = 0; i < 256; i++, raddr += 1UL << 12)
+               pgt[i] = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long sto, *ste, *pgt;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+       if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+       sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+       gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+       pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+       *ste = _SEGMENT_ENTRY_EMPTY;
+       __gmap_unshadow_pgt(sg, raddr, pgt);
+       /* Free page table */
+       page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *sgt)
+{
+       unsigned long asce, *pgt;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+               if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+                       continue;
+               pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+               sgt[i] = _SEGMENT_ENTRY_EMPTY;
+               __gmap_unshadow_pgt(sg, raddr, pgt);
+               /* Free page table */
+               page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               page_table_free_pgste(page);
+       }
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r3o, *r3e, *sgt;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+       if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+       r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+       gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+       sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+       *r3e = _REGION3_ENTRY_EMPTY;
+       __gmap_unshadow_sgt(sg, raddr, sgt);
+       /* Free segment table */
+       page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r3t)
+{
+       unsigned long asce, *sgt;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+               if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+               r3t[i] = _REGION3_ENTRY_EMPTY;
+               __gmap_unshadow_sgt(sg, raddr, sgt);
+               /* Free segment table */
+               page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r2o, *r2e, *r3t;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+       if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+       r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+       gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+       r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+       *r2e = _REGION2_ENTRY_EMPTY;
+       __gmap_unshadow_r3t(sg, raddr, r3t);
+       /* Free region 3 table */
+       page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r2t)
+{
+       unsigned long asce, *r3t;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+               if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+               r2t[i] = _REGION2_ENTRY_EMPTY;
+               __gmap_unshadow_r3t(sg, raddr, r3t);
+               /* Free region 3 table */
+               page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r1o, *r1e, *r2t;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+       if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+       r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+       gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+       r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+       *r1e = _REGION1_ENTRY_EMPTY;
+       __gmap_unshadow_r2t(sg, raddr, r2t);
+       /* Free region 2 table */
+       page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r1t)
+{
+       unsigned long asce, *r2t;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+               if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+               __gmap_unshadow_r2t(sg, raddr, r2t);
+               /* Clear entry and flush translation r1t -> r2t */
+               gmap_idte_one(asce, raddr);
+               r1t[i] = _REGION1_ENTRY_EMPTY;
+               /* Free region 2 table */
+               page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       if (sg->removed)
+               return;
+       sg->removed = 1;
+       gmap_call_notifier(sg, 0, -1UL);
+       gmap_flush_tlb(sg);
+       table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+       switch (sg->asce & _ASCE_TYPE_MASK) {
+       case _ASCE_TYPE_REGION1:
+               __gmap_unshadow_r1t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION2:
+               __gmap_unshadow_r2t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION3:
+               __gmap_unshadow_r3t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_SEGMENT:
+               __gmap_unshadow_sgt(sg, 0, table);
+               break;
+       }
+}
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+                                    int edat_level)
+{
+       struct gmap *sg;
+
+       list_for_each_entry(sg, &parent->children, list) {
+               if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+                   sg->removed)
+                       continue;
+               if (!sg->initialized)
+                       return ERR_PTR(-EAGAIN);
+               atomic_inc(&sg->ref_count);
+               return sg;
+       }
+       return NULL;
+}
+
+/**
+ * gmap_shadow_valid - check if a shadow guest address space matches the
+ *                     given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties; the caller can continue using it. Returns 0 otherwise; the
+ * caller has to request a new shadow gmap in this case.
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+       if (sg->removed)
+               return 0;
+       return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred to by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+                        int edat_level)
+{
+       struct gmap *sg, *new;
+       unsigned long limit;
+       int rc;
+
+       BUG_ON(gmap_is_shadow(parent));
+       spin_lock(&parent->shadow_lock);
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       spin_unlock(&parent->shadow_lock);
+       if (sg)
+               return sg;
+       /* Create a new shadow gmap */
+       limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+       if (asce & _ASCE_REAL_SPACE)
+               limit = -1UL;
+       new = gmap_alloc(limit);
+       if (!new)
+               return ERR_PTR(-ENOMEM);
+       new->mm = parent->mm;
+       new->parent = gmap_get(parent);
+       new->orig_asce = asce;
+       new->edat_level = edat_level;
+       new->initialized = false;
+       spin_lock(&parent->shadow_lock);
+       /* Recheck if another CPU created the same shadow */
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       if (sg) {
+               spin_unlock(&parent->shadow_lock);
+               gmap_free(new);
+               return sg;
+       }
+       if (asce & _ASCE_REAL_SPACE) {
+               /* only allow one real-space gmap shadow */
+               list_for_each_entry(sg, &parent->children, list) {
+                       if (sg->orig_asce & _ASCE_REAL_SPACE) {
+                               spin_lock(&sg->guest_table_lock);
+                               gmap_unshadow(sg);
+                               spin_unlock(&sg->guest_table_lock);
+                               list_del(&sg->list);
+                               gmap_put(sg);
+                               break;
+                       }
+               }
+       }
+       atomic_set(&new->ref_count, 2);
+       list_add(&new->list, &parent->children);
+       if (asce & _ASCE_REAL_SPACE) {
+               /* nothing to protect, return right away */
+               new->initialized = true;
+               spin_unlock(&parent->shadow_lock);
+               return new;
+       }
+       spin_unlock(&parent->shadow_lock);
+       /* protect after insertion, so it will get properly invalidated */
+       down_read(&parent->mm->mmap_sem);
+       rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+                               ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+                               PROT_READ, PGSTE_VSIE_BIT);
+       up_read(&parent->mm->mmap_sem);
+       spin_lock(&parent->shadow_lock);
+       new->initialized = true;
+       if (rc) {
+               list_del(&new->list);
+               gmap_free(new);
+               new = ERR_PTR(rc);
+       }
+       spin_unlock(&parent->shadow_lock);
+       return new;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
+
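The intended calling pattern follows from the comments above: keep using a shadow that still passes gmap_shadow_valid(), otherwise request one with gmap_shadow() and treat ERR_PTR(-EAGAIN) as "another CPU is still initializing the same shadow, try again later". A minimal, hypothetical caller sketch (not part of this patch; the function name and arguments are placeholders):

    /* Illustrative sketch only: not part of this patch. */
    static struct gmap *get_or_create_shadow(struct gmap *parent, struct gmap *cur,
                                             unsigned long asce, int edat_level)
    {
            struct gmap *sg;

            /* keep using the current shadow if it still matches */
            if (cur && gmap_shadow_valid(cur, asce, edat_level))
                    return cur;
            /* gmap_shadow() takes parent->mm->mmap_sem itself */
            sg = gmap_shadow(parent, asce, edat_level);
            /* ERR_PTR(-EAGAIN): another CPU is creating it, retry later */
            return sg;
    }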
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its descendants.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r2t, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region second table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r2t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r2t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region second table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r2t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r2t read-only in parent gmap page table */
+       raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+       origin = r2t & _REGION_ENTRY_ORIGIN;
+       offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 4);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r2t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r2t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r3t, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region third table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r3t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r3t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region third table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r3t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r3t read-only in parent gmap page table */
+       raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+       origin = r3t & _REGION_ENTRY_ORIGIN;
+       offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 3);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r3t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r3t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_sgt, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+       /* Allocate a shadow segment table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = sgt & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_sgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow segment table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= sgt & _REGION_ENTRY_PROTECT;
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make sgt read-only in parent gmap page table */
+       raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+       origin = sgt & _REGION_ENTRY_ORIGIN;
+       offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 2);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_sgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_sgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_pgt_lookup - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow guest address space
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+                          unsigned long *pgt, int *dat_protection,
+                          int *fake)
+{
+       unsigned long *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+               /* Shadow page tables are full pages (pte+pgste) */
+               page = pfn_to_page(*table >> PAGE_SHIFT);
+               *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+               *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+               *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+               rc = 0;
+       } else  {
+               rc = -EAGAIN;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
+
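gmap_shadow_pgt_lookup() only succeeds once the shadow hierarchy exists down to the segment level; on -EAGAIN the caller has to instantiate the missing levels and retry. A rough, hypothetical sketch of that flow (the walk of the guest's own DAT tables, which supplies the origins passed to the gmap_shadow_*() creators, is omitted):

    /* Illustrative sketch only: not part of this patch. */
    static int shadow_fault_once(struct gmap *sg, unsigned long saddr, pte_t pte)
    {
            unsigned long pgt;
            int dat_protection, fake;
            int rc;

            rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
            if (rc) {
                    /*
                     * -EAGAIN: a shadow table level is missing.  A real caller
                     * would create it here with gmap_shadow_r2t()/_r3t()/_sgt()/
                     * _pgt() and then retry the lookup.
                     */
                    return rc;
            }
            /* last level: map the parent pte into the shadow page table */
            return gmap_shadow_page(sg, saddr, pte);
    }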
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+                   int fake)
+{
+       unsigned long raddr, origin;
+       unsigned long *s_pgt, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+       /* Allocate a shadow page table */
+       page = page_table_alloc_pgste(sg->mm);
+       if (!page)
+               return -ENOMEM;
+       page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_pgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow page table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+                (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+       list_add(&page->lru, &sg->pt_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_SEGMENT_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make pgt read-only in parent gmap page table (not the pgste) */
+       raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+       origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+       rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 1);
+               if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+                             (unsigned long) s_pgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_SEGMENT_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_pgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       page_table_free_pgste(page);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pte: pte in parent gmap address space to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+{
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr, paddr;
+       spinlock_t *ptl;
+       pte_t *sptep, *tptep;
+       int prot;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+
+       rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+       if (!rmap)
+               return -ENOMEM;
+       rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+       while (1) {
+               paddr = pte_val(pte) & PAGE_MASK;
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
+                       break;
+               }
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc)
+                       break;
+               rc = -EAGAIN;
+               sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (sptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       /* Get page table pointer */
+                       tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+                       if (!tptep) {
+                               spin_unlock(&sg->guest_table_lock);
+                               gmap_pte_op_end(ptl);
+                               radix_tree_preload_end();
+                               break;
+                       }
+                       rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+                       if (rc > 0) {
+                               /* Success and a new mapping */
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                               rmap = NULL;
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
+                       spin_unlock(&sg->guest_table_lock);
+               }
+               radix_tree_preload_end();
+               if (!rc)
+                       break;
+               rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+               if (rc)
+                       break;
+       }
+       kfree(rmap);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: host virtual address covered by the changed parent pte
+ * @offset: offset of the affected page within its 1 MB segment, in bytes
+ * @pte: pointer to the changed page table entry
+ *
+ * Called with sg->parent->shadow_lock.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+                              unsigned long offset, pte_t *pte)
+{
+       struct gmap_rmap *rmap, *rnext, *head;
+       unsigned long gaddr, start, end, bits, raddr;
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->parent->guest_table_lock);
+       table = radix_tree_lookup(&sg->parent->host_to_guest,
+                                 vmaddr >> PMD_SHIFT);
+       gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+       spin_unlock(&sg->parent->guest_table_lock);
+       if (!table)
+               return;
+
+       spin_lock(&sg->guest_table_lock);
+       if (sg->removed) {
+               spin_unlock(&sg->guest_table_lock);
+               return;
+       }
+       /* Check for top level table */
+       start = sg->orig_asce & _ASCE_ORIGIN;
+       end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+       if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+           gaddr < end) {
+               /* The complete shadow table has to go */
+               gmap_unshadow(sg);
+               spin_unlock(&sg->guest_table_lock);
+               list_del(&sg->list);
+               gmap_put(sg);
+               return;
+       }
+       /* Remove the page table tree starting from one specific entry */
+       head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+       gmap_for_each_rmap_safe(rmap, rnext, head) {
+               bits = rmap->raddr & _SHADOW_RMAP_MASK;
+               raddr = rmap->raddr ^ bits;
+               switch (bits) {
+               case _SHADOW_RMAP_REGION1:
+                       gmap_unshadow_r2t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION2:
+                       gmap_unshadow_r3t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION3:
+                       gmap_unshadow_sgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_SEGMENT:
+                       gmap_unshadow_pgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_PGTABLE:
+                       gmap_unshadow_page(sg, raddr);
+                       break;
+               }
+               kfree(rmap);
+       }
+       spin_unlock(&sg->guest_table_lock);
+}
+
+/**
+ * ptep_notify - call all invalidation callbacks for a specific pte.
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ * @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
+ *
+ * This function is assumed to be called with the page table lock held
+ * for the pte to notify.
+ */
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+                pte_t *pte, unsigned long bits)
+{
+       unsigned long offset, gaddr;
+       unsigned long *table;
+       struct gmap *gmap, *sg, *next;
+
+       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
+       offset = offset * (4096 / sizeof(pte_t));
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+               if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+                       spin_lock(&gmap->shadow_lock);
+                       list_for_each_entry_safe(sg, next,
+                                                &gmap->children, list)
+                               gmap_shadow_notify(sg, vmaddr, offset, pte);
+                       spin_unlock(&gmap->shadow_lock);
+               }
+               if (!(bits & PGSTE_IN_BIT))
+                       continue;
+               spin_lock(&gmap->guest_table_lock);
+               table = radix_tree_lookup(&gmap->host_to_guest,
+                                         vmaddr >> PMD_SHIFT);
+               if (table)
+                       gaddr = __gmap_segment_gaddr(table) + offset;
+               spin_unlock(&gmap->guest_table_lock);
+               if (table)
+                       gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
+       }
+       rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ptep_notify);
 
index e2565d2..995f785 100644 (file)
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
        return new;
 }
 
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+       struct page *page;
+       unsigned long *table;
+
+       page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+       if (page) {
+               table = (unsigned long *) page_to_phys(page);
+               clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+               clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+       }
+       return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+       __free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
 /*
  * page table entry allocation/free routines.
  */
@@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
        /* Try to get a fragment of a 4K page as a 2K page table */
        if (!mm_alloc_pgste(mm)) {
                table = NULL;
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                if (!list_empty(&mm->context.pgtable_list)) {
                        page = list_first_entry(&mm->context.pgtable_list,
                                                struct page, lru);
@@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                                list_del(&page->lru);
                        }
                }
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                if (table)
                        return table;
        }
@@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                /* Return the first 2K fragment of the page */
                atomic_set(&page->_mapcount, 1);
                clear_table(table, _PAGE_INVALID, PAGE_SIZE);
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
        }
        return table;
 }
@@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
        if (!mm_alloc_pgste(mm)) {
                /* Free 2K page table fragment of a 4K page */
                bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
                if (mask & 3)
                        list_add(&page->lru, &mm->context.pgtable_list);
                else
                        list_del(&page->lru);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                if (mask != 0)
                        return;
        }
@@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
                return;
        }
        bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-       spin_lock_bh(&mm->context.list_lock);
+       spin_lock_bh(&mm->context.pgtable_lock);
        mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
        if (mask & 3)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        else
                list_del(&page->lru);
-       spin_unlock_bh(&mm->context.list_lock);
+       spin_unlock_bh(&mm->context.pgtable_lock);
        table = (unsigned long *) (__pa(table) | (1U << bit));
        tlb_remove_table(tlb, table);
 }
index b98d1a1..5f09201 100644 (file)
@@ -174,14 +174,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
        return pgste;
 }
 
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
-                                       unsigned long addr,
-                                       pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+                                      unsigned long addr,
+                                      pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
-       if (pgste_val(pgste) & PGSTE_IN_BIT) {
-               pgste_val(pgste) &= ~PGSTE_IN_BIT;
-               ptep_notify(mm, addr, ptep);
+       unsigned long bits;
+
+       bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+       if (bits) {
+               pgste_val(pgste) ^= bits;
+               ptep_notify(mm, addr, ptep, bits);
        }
 #endif
        return pgste;
@@ -194,7 +197,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
 
        if (mm_has_pgste(mm)) {
                pgste = pgste_get_lock(ptep);
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
        }
        return pgste;
 }
@@ -459,6 +462,90 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
        preempt_enable();
 }
 
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+                   pte_t *ptep, int prot, unsigned long bit)
+{
+       pte_t entry;
+       pgste_t pgste;
+       int pte_i, pte_p;
+
+       pgste = pgste_get_lock(ptep);
+       entry = *ptep;
+       /* Check pte entry after all locks have been acquired */
+       pte_i = pte_val(entry) & _PAGE_INVALID;
+       pte_p = pte_val(entry) & _PAGE_PROTECT;
+       if ((pte_i && (prot != PROT_NONE)) ||
+           (pte_p && (prot & PROT_WRITE))) {
+               pgste_set_unlock(ptep, pgste);
+               return -EAGAIN;
+       }
+       /* Change access rights and set pgste bit */
+       if (prot == PROT_NONE && !pte_i) {
+               ptep_flush_direct(mm, addr, ptep);
+               pgste = pgste_update_all(entry, pgste, mm);
+               pte_val(entry) |= _PAGE_INVALID;
+       }
+       if (prot == PROT_READ && !pte_p) {
+               ptep_flush_direct(mm, addr, ptep);
+               pte_val(entry) &= ~_PAGE_INVALID;
+               pte_val(entry) |= _PAGE_PROTECT;
+       }
+       pgste_val(pgste) |= bit;
+       pgste = pgste_set_pte(ptep, pgste, entry);
+       pgste_set_unlock(ptep, pgste);
+       return 0;
+}
+
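ptep_force_prot() is the pte-level primitive behind the gmap protection helpers used earlier in this patch: the caller walks to the locked pte, downgrades the access rights and arms a pgste notification bit in one step. A hedged sketch of such a caller, mirroring what gmap_protect_range() does in gmap.c (not part of this patch):

    /* Illustrative sketch only: not part of this patch. */
    static int protect_one_guest_page(struct gmap *gmap, unsigned long gaddr,
                                      int prot, unsigned long bits)
    {
            spinlock_t *ptl;
            pte_t *ptep;
            int rc = -EAGAIN;

            ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
            if (ptep) {
                    /* bits is e.g. PGSTE_IN_BIT or PGSTE_VSIE_BIT */
                    rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
                    gmap_pte_op_end(ptl);
            }
            /* -EAGAIN: pte not usable yet, resolve the fault and retry */
            return rc;
    }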
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte)
+{
+       pgste_t spgste, tpgste;
+       pte_t spte, tpte;
+       int rc = -EAGAIN;
+
+       if (!(pte_val(*tptep) & _PAGE_INVALID))
+               return 0;       /* already shadowed */
+       spgste = pgste_get_lock(sptep);
+       spte = *sptep;
+       if (!(pte_val(spte) & _PAGE_INVALID) &&
+           !((pte_val(spte) & _PAGE_PROTECT) &&
+             !(pte_val(pte) & _PAGE_PROTECT))) {
+               pgste_val(spgste) |= PGSTE_VSIE_BIT;
+               tpgste = pgste_get_lock(tptep);
+               pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+                               (pte_val(pte) & _PAGE_PROTECT);
+               /* don't touch the storage key - it belongs to parent pgste */
+               tpgste = pgste_set_pte(tptep, tpgste, tpte);
+               pgste_set_unlock(tptep, tpgste);
+               rc = 1;
+       }
+       pgste_set_unlock(sptep, spgste);
+       return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+       pgste_t pgste;
+
+       pgste = pgste_get_lock(ptep);
+       /* notifier is called by the caller */
+       ptep_flush_direct(mm, saddr, ptep);
+       /* don't touch the storage key - it belongs to parent pgste */
+       pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+       pgste_set_unlock(ptep, pgste);
+}
+
 static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 {
        if (!non_swap_entry(entry))
@@ -532,7 +619,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
        pgste_val(pgste) &= ~PGSTE_UC_BIT;
        pte = *ptep;
        if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
                __ptep_ipte(addr, ptep);
                if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
                        pte_val(pte) |= _PAGE_PROTECT;
@@ -555,12 +642,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
        pgste_t old, new;
        pte_t *ptep;
 
-       down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
-       if (unlikely(!ptep)) {
-               up_read(&mm->mmap_sem);
+       if (unlikely(!ptep))
                return -EFAULT;
-       }
 
        new = old = pgste_get_lock(ptep);
        pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
@@ -587,45 +671,100 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 
        pgste_set_unlock(ptep, new);
        pte_unmap_unlock(ptep, ptl);
-       up_read(&mm->mmap_sem);
        return 0;
 }
 EXPORT_SYMBOL(set_guest_storage_key);
 
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+/**
+ * Conditionally set a guest storage key (handling csske).
+ * oldkey will be updated when either mr or mc is set and a pointer is given.
+ *
+ * Returns 0 if a guest storage key update wasn't necessary, 1 if the guest
+ * storage key was updated and -EFAULT on access errors.
+ */
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                              unsigned char key, unsigned char *oldkey,
+                              bool nq, bool mr, bool mc)
+{
+       unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+       int rc;
+
+       /* we can drop the pgste lock between getting and setting the key */
+       if (mr | mc) {
+               rc = get_guest_storage_key(mm, addr, &tmp);
+               if (rc)
+                       return rc;
+               if (oldkey)
+                       *oldkey = tmp;
+               if (!mr)
+                       mask |= _PAGE_REFERENCED;
+               if (!mc)
+                       mask |= _PAGE_CHANGED;
+               if (!((tmp ^ key) & mask))
+                       return 0;
+       }
+       rc = set_guest_storage_key(mm, addr, key, nq);
+       return rc < 0 ? rc : 1;
+}
+EXPORT_SYMBOL(cond_set_guest_storage_key);
+
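cond_set_guest_storage_key() lets an SSKE-style handler that received the MR/MC conditional bits skip the key update when only ignored bits differ. A hypothetical handler fragment (instruction decode and guest-address translation omitted; not part of this patch):

    /* Illustrative sketch only: not part of this patch. */
    static int handle_sske_like(unsigned long hva, unsigned char key,
                                bool nq, bool mr, bool mc)
    {
            unsigned char oldkey;
            int rc;

            /* the storage key helpers no longer take mmap_sem themselves */
            down_read(&current->mm->mmap_sem);
            rc = cond_set_guest_storage_key(current->mm, hva, key, &oldkey,
                                            nq, mr, mc);
            up_read(&current->mm->mmap_sem);
            /* rc < 0: error, rc == 0: nothing to update, rc == 1: key written */
            return rc;
    }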
+/**
+ * Reset a guest reference bit (rrbe), returning the reference and changed bit.
+ *
+ * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+ */
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
 {
-       unsigned char key;
        spinlock_t *ptl;
-       pgste_t pgste;
+       pgste_t old, new;
        pte_t *ptep;
+       int cc = 0;
 
-       down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
-       if (unlikely(!ptep)) {
-               up_read(&mm->mmap_sem);
+       if (unlikely(!ptep))
                return -EFAULT;
-       }
-       pgste = pgste_get_lock(ptep);
 
-       if (pte_val(*ptep) & _PAGE_INVALID) {
-               key  = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
-               key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
-               key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
-               key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
-       } else {
-               key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+       new = old = pgste_get_lock(ptep);
+       /* Reset guest reference bit only */
+       pgste_val(new) &= ~PGSTE_GR_BIT;
 
-               /* Reflect guest's logical view, not physical */
-               if (pgste_val(pgste) & PGSTE_GR_BIT)
-                       key |= _PAGE_REFERENCED;
-               if (pgste_val(pgste) & PGSTE_GC_BIT)
-                       key |= _PAGE_CHANGED;
+       if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+               cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+               /* Merge real referenced bit into host-set */
+               pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
        }
+       /* Reflect guest's logical view, not physical */
+       cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+       /* Changing the guest storage key is considered a change of the page */
+       if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+               pgste_val(new) |= PGSTE_UC_BIT;
+
+       pgste_set_unlock(ptep, new);
+       pte_unmap_unlock(ptep, ptl);
+       return cc;
+}
+EXPORT_SYMBOL(reset_guest_reference_bit);
+
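reset_guest_reference_bit() is meant to back an RRBE handler; the value it returns already merges the guest's logical reference/change view from the pgste into the condition code. A hypothetical caller (translation and exception injection omitted; not part of this patch):

    /* Illustrative sketch only: not part of this patch. */
    static int handle_rrbe_like(struct mm_struct *mm, unsigned long hva, int *cc)
    {
            int rc;

            down_read(&mm->mmap_sem);
            rc = reset_guest_reference_bit(mm, hva);
            up_read(&mm->mmap_sem);
            if (rc < 0)
                    return rc;              /* -EFAULT */
            *cc = rc;                       /* condition code for the guest */
            return 0;
    }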
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                         unsigned char *key)
+{
+       spinlock_t *ptl;
+       pgste_t pgste;
+       pte_t *ptep;
 
+       ptep = get_locked_pte(mm, addr, &ptl);
+       if (unlikely(!ptep))
+               return -EFAULT;
+
+       pgste = pgste_get_lock(ptep);
+       *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+       if (!(pte_val(*ptep) & _PAGE_INVALID))
+               *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+       /* Reflect guest's logical view, not physical */
+       *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
        pgste_set_unlock(ptep, pgste);
        pte_unmap_unlock(ptep, ptl);
-       up_read(&mm->mmap_sem);
-       return key;
+       return 0;
 }
 EXPORT_SYMBOL(get_guest_storage_key);
 #endif
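Note that get_guest_storage_key() now reports the key through an output parameter and, like set_guest_storage_key(), no longer takes mmap_sem itself, so existing callers have to be converted roughly like this (hypothetical sketch, not part of this patch):

    /* Illustrative sketch only: not part of this patch. */
    static int read_guest_key(struct mm_struct *mm, unsigned long addr,
                              unsigned char *key)
    {
            int rc;

            down_read(&mm->mmap_sem);       /* now the caller's responsibility */
            rc = get_guest_storage_key(mm, addr, key);
            up_read(&mm->mmap_sem);
            return rc;                      /* 0 on success, -EFAULT on error */
    }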
index 022d160..2303635 100644 (file)
@@ -55,9 +55,6 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 }
 
 #define HAVE_ARCH_PCI_RESOURCE_TO_USER
-void pci_resource_to_user(const struct pci_dev *dev, int bar,
-                         const struct resource *rsrc,
-                         resource_size_t *start, resource_size_t *end);
 #endif /* __KERNEL__ */
 
 #endif /* __SPARC64_PCI_H */
index c2b202d..9c1878f 100644 (file)
@@ -986,16 +986,18 @@ void pci_resource_to_user(const struct pci_dev *pdev, int bar,
                          const struct resource *rp, resource_size_t *start,
                          resource_size_t *end)
 {
-       struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller;
-       unsigned long offset;
-
-       if (rp->flags & IORESOURCE_IO)
-               offset = pbm->io_space.start;
-       else
-               offset = pbm->mem_space.start;
+       struct pci_bus_region region;
 
-       *start = rp->start - offset;
-       *end = rp->end - offset;
+       /*
+        * "User" addresses are shown in /sys/devices/pci.../.../resource
+        * and /proc/bus/pci/devices and used as mmap offsets for
+        * /proc/bus/pci/BB/DD.F files (see proc_bus_pci_mmap()).
+        *
+        * On sparc, these are PCI bus addresses, i.e., raw BAR values.
+        */
+       pcibios_resource_to_bus(pdev->bus, &region, (struct resource *) rp);
+       *start = region.start;
+       *end = region.end;
 }
 
 void pcibios_set_master(struct pci_dev *dev)
index cc00134..58650d0 100644 (file)
@@ -9,6 +9,7 @@ config UML
        select GENERIC_CPU_DEVICES
        select GENERIC_IO
        select GENERIC_CLOCKEVENTS
+       select HAVE_GCC_PLUGINS
        select TTY # Needed for line.c
 
 config MMU
index e3abe6f..0ca46ed 100644 (file)
@@ -78,8 +78,8 @@ include $(ARCH_DIR)/Makefile-os-$(OS)
 
 KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/include \
                   -I$(srctree)/$(HOST_DIR)/include/uapi \
-                  -I$(HOST_DIR)/include/generated \
-                  -I$(HOST_DIR)/include/generated/uapi
+                  -I$(objtree)/$(HOST_DIR)/include/generated \
+                  -I$(objtree)/$(HOST_DIR)/include/generated/uapi
 
 # -Derrno=kernel_errno - This turns all kernel references to errno into
 # kernel_errno to separate them from the libc errno.  This allows -fno-common
index d45fa5f..62137d1 100644 (file)
@@ -265,10 +265,8 @@ static int __init pci_common_init(void)
 
        pci_fixup_irqs(pci_common_swizzle, pci_puv3_map_irq);
 
-       if (!pci_has_flag(PCI_PROBE_ONLY)) {
-               pci_bus_size_bridges(puv3_bus);
-               pci_bus_assign_resources(puv3_bus);
-       }
+       pci_bus_size_bridges(puv3_bus);
+       pci_bus_assign_resources(puv3_bus);
        pci_bus_add_devices(puv3_bus);
        return 0;
 }
@@ -279,9 +277,6 @@ char * __init pcibios_setup(char *str)
        if (!strcmp(str, "debug")) {
                debug_pci = 1;
                return NULL;
-       } else if (!strcmp(str, "firmware")) {
-               pci_add_flags(PCI_PROBE_ONLY);
-               return NULL;
        }
        return str;
 }
index 2fa5585..3a9add5 100644 (file)
@@ -111,6 +111,7 @@ config X86
        select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACER
+       select HAVE_GCC_PLUGINS
        select HAVE_GENERIC_DMA_COHERENT        if X86_32
        select HAVE_HW_BREAKPOINT
        select HAVE_IDE
index be8e688..12ea8f8 100644 (file)
@@ -96,7 +96,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
        $(call if_changed,zoffset)
 
 
-AFLAGS_header.o += -I$(obj)
+AFLAGS_header.o += -I$(objtree)/$(obj)
 $(obj)/header.o: $(obj)/zoffset.h
 
 LDFLAGS_setup.elf      := -T
index 6ba89a1..d540966 100644 (file)
@@ -75,7 +75,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
        -fno-omit-frame-pointer -foptimize-sibling-calls \
        -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
 
-$(vobjs): KBUILD_CFLAGS += $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
 
 #
 # vDSO code runs in userspace and -pg doesn't help with profiling anyway.
@@ -145,6 +145,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
 KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
 KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
 KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
index 69e6286..33ae3a4 100644 (file)
@@ -35,8 +35,9 @@
 #include <asm/asm.h>
 #include <asm/kvm_page_track.h>
 
-#define KVM_MAX_VCPUS 255
-#define KVM_SOFT_MAX_VCPUS 160
+#define KVM_MAX_VCPUS 288
+#define KVM_SOFT_MAX_VCPUS 240
+#define KVM_MAX_VCPU_ID 1023
 #define KVM_USER_MEM_SLOTS 509
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 3
@@ -599,6 +600,7 @@ struct kvm_vcpu_arch {
        u64 mcg_cap;
        u64 mcg_status;
        u64 mcg_ctl;
+       u64 mcg_ext_ctl;
        u64 *mce_banks;
 
        /* Cache MMIO info */
@@ -682,9 +684,12 @@ struct kvm_arch_memory_slot {
 struct kvm_apic_map {
        struct rcu_head rcu;
        u8 mode;
-       struct kvm_lapic *phys_map[256];
-       /* first index is cluster id second is cpu id in a cluster */
-       struct kvm_lapic *logical_map[16][16];
+       u32 max_apic_id;
+       union {
+               struct kvm_lapic *xapic_flat_map[8];
+               struct kvm_lapic *xapic_cluster_map[16][4];
+       };
+       struct kvm_lapic *phys_map[];
 };
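With max_apic_id and the flexible phys_map[] array the map can be sized at allocation time from the largest APIC ID in use; presumably something along these lines (hypothetical sketch, the real allocation lives in the lapic code and may differ):

    /* Illustrative sketch only: not part of this patch. */
    static struct kvm_apic_map *alloc_apic_map(u32 max_id)
    {
            struct kvm_apic_map *map;

            map = kzalloc(sizeof(*map) +
                          sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
                          GFP_KERNEL);
            if (map)
                    map->max_apic_id = max_id;
            return map;
    }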
 
 /* Hyper-V emulation context */
@@ -779,6 +784,9 @@ struct kvm_arch {
        u32 ldr_mode;
        struct page *avic_logical_id_table_page;
        struct page *avic_physical_id_table_page;
+
+       bool x2apic_format;
+       bool x2apic_broadcast_quirk_disabled;
 };
 
 struct kvm_vm_stat {
@@ -1006,6 +1014,11 @@ struct kvm_x86_ops {
        int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
                              uint32_t guest_irq, bool set);
        void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+
+       int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+       void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+
+       void (*setup_mce)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -1026,7 +1039,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask);
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1077,6 +1090,10 @@ extern u32  kvm_max_guest_tsc_khz;
 extern u8   kvm_tsc_scaling_ratio_frac_bits;
 /* maximum allowed value of TSC scaling ratio */
 extern u64  kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64  kvm_default_tsc_scaling_ratio;
+
+extern u64 kvm_mce_cap_supported;
 
 enum emulation_result {
        EMULATE_DONE,         /* no further processing */
@@ -1352,7 +1369,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
 bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
                             struct kvm_vcpu **dest_vcpu);
 
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
                     struct kvm_lapic_irq *irq);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
index d0fe23e..14824fc 100644 (file)
@@ -193,7 +193,6 @@ struct __attribute__ ((__packed__)) vmcb {
        struct vmcb_save_area save;
 };
 
-#define SVM_CPUID_FEATURE_SHIFT 2
 #define SVM_CPUID_FUNC 0x8000000a
 
 #define SVM_VM_CR_SVM_DISABLE 4
index cce9ee6..0116b2e 100644 (file)
@@ -83,23 +83,19 @@ static inline void cpu_emergency_vmxoff(void)
  */
 static inline int cpu_has_svm(const char **msg)
 {
-       uint32_t eax, ebx, ecx, edx;
-
        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
                if (msg)
                        *msg = "not amd";
                return 0;
        }
 
-       cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-       if (eax < SVM_CPUID_FUNC) {
+       if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) {
                if (msg)
                        *msg = "can't execute cpuid_8000000a";
                return 0;
        }
 
-       cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-       if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+       if (!boot_cpu_has(X86_FEATURE_SVM)) {
                if (msg)
                        *msg = "svm not available";
                return 0;
index 639a6e3..ab8e32f 100644 (file)
@@ -32,7 +32,6 @@ config KVM
        select HAVE_KVM_IRQ_BYPASS
        select HAVE_KVM_IRQ_ROUTING
        select HAVE_KVM_EVENTFD
-       select KVM_APIC_ARCHITECTURE
        select KVM_ASYNC_PF
        select USER_RETURN_NOTIFIER
        select KVM_MMIO
index a4bf5b4..5fb6c62 100644 (file)
@@ -645,7 +645,6 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
        .write    = speaker_ioport_write,
 };
 
-/* Caller must hold slots_lock */
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
        struct kvm_pit *pit;
@@ -690,6 +689,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
        kvm_pit_set_reinject(pit, true);
 
+       mutex_lock(&kvm->slots_lock);
        kvm_iodevice_init(&pit->dev, &pit_dev_ops);
        ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
                                      KVM_PIT_MEM_LENGTH, &pit->dev);
@@ -704,12 +704,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
                if (ret < 0)
                        goto fail_register_speaker;
        }
+       mutex_unlock(&kvm->slots_lock);
 
        return pit;
 
 fail_register_speaker:
        kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
 fail_register_pit:
+       mutex_unlock(&kvm->slots_lock);
        kvm_pit_set_reinject(pit, false);
        kthread_stop(pit->worker_task);
 fail_kthread:
index 95e0e64..b181426 100644 (file)
@@ -28,9 +28,7 @@
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
 #include <linux/stat.h>
-#include <linux/dmar.h>
 #include <linux/iommu.h>
-#include <linux/intel-iommu.h>
 #include "assigned-dev.h"
 
 static bool allow_unsafe_assigned_interrupts;
index dfb4c64..25810b1 100644 (file)
@@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
        return r;
 }
 
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
                     struct kvm_lapic_irq *irq)
 {
-       trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+       trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ?
+                                            (u64)e->msi.address_hi << 32 : 0),
+                             e->msi.data);
 
        irq->dest_id = (e->msi.address_lo &
                        MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+       if (kvm->arch.x2apic_format)
+               irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
        irq->vector = (e->msi.data &
                        MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
        irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
@@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
 }
 EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
 
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+               struct kvm_kernel_irq_routing_entry *e)
+{
+       return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
                struct kvm *kvm, int irq_source_id, int level, bool line_status)
 {
        struct kvm_lapic_irq irq;
 
+       if (kvm_msi_route_invalid(kvm, e))
+               return -EINVAL;
+
        if (!level)
                return -1;
 
-       kvm_set_msi_irq(e, &irq);
+       kvm_set_msi_irq(kvm, e, &irq);
 
        return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
 }
@@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
        if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
                return -EWOULDBLOCK;
 
-       kvm_set_msi_irq(e, &irq);
+       if (kvm_msi_route_invalid(kvm, e))
+               return -EINVAL;
+
+       kvm_set_msi_irq(kvm, e, &irq);
 
        if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
                return r;
@@ -248,7 +264,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
        return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
 {
        int r = -EINVAL;
@@ -285,6 +302,9 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
                e->msi.address_lo = ue->u.msi.address_lo;
                e->msi.address_hi = ue->u.msi.address_hi;
                e->msi.data = ue->u.msi.data;
+
+               if (kvm_msi_route_invalid(kvm, e))
+                       goto out;
                break;
        case KVM_IRQ_ROUTING_HV_SINT:
                e->set = kvm_hv_set_sint;
@@ -388,21 +408,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
                               kvm->arch.nr_reserved_ioapic_pins);
        for (i = 0; i < nr_ioapic_pins; ++i) {
                hlist_for_each_entry(entry, &table->map[i], link) {
-                       u32 dest_id, dest_mode;
-                       bool level;
+                       struct kvm_lapic_irq irq;
 
                        if (entry->type != KVM_IRQ_ROUTING_MSI)
                                continue;
-                       dest_id = (entry->msi.address_lo >> 12) & 0xff;
-                       dest_mode = (entry->msi.address_lo >> 2) & 0x1;
-                       level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
-                       if (level && kvm_apic_match_dest(vcpu, NULL, 0,
-                                               dest_id, dest_mode)) {
-                               u32 vector = entry->msi.data & 0xff;
-
-                               __set_bit(vector,
-                                         ioapic_handled_vectors);
-                       }
+
+                       kvm_set_msi_irq(vcpu->kvm, entry, &irq);
+
+                       if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+                                               irq.dest_id, irq.dest_mode))
+                               __set_bit(irq.vector, ioapic_handled_vectors);
                }
        }
        srcu_read_unlock(&kvm->irq_srcu, idx);
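
Aside: a minimal sketch of the extended MSI destination handling added above. With KVM_CAP_X2APIC_API enabled (kvm->arch.x2apic_format), the low byte of the destination still comes from address_lo, the upper 24 bits are carried in address_hi, and a route whose address_hi has any of its low 8 bits set is rejected. The mask/shift constants and helper names below are illustrative placeholders, not the kernel's msidef.h definitions.

        /* Illustrative sketch only -- mirrors the routing logic above. */
        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define SKETCH_DEST_ID_SHIFT 12            /* stand-in for MSI_ADDR_DEST_ID_SHIFT */
        #define SKETCH_DEST_ID_MASK  0x000ff000u   /* stand-in for MSI_ADDR_DEST_ID_MASK  */

        static bool sketch_msi_route_invalid(bool x2apic_format, uint32_t address_hi)
        {
                /* In the extended format the low byte of address_hi must stay clear. */
                return x2apic_format && (address_hi & 0xff);
        }

        static uint32_t sketch_msi_dest_id(bool x2apic_format,
                                           uint32_t address_lo, uint32_t address_hi)
        {
                uint32_t dest = (address_lo & SKETCH_DEST_ID_MASK) >> SKETCH_DEST_ID_SHIFT;

                if (x2apic_format)
                        dest |= address_hi & 0xffffff00u;   /* extended destination ID */
                return dest;
        }

        int main(void)
        {
                /* Destination 0x12345678 split across the two address words. */
                uint32_t lo = 0x78u << SKETCH_DEST_ID_SHIFT;
                uint32_t hi = 0x12345600u;

                if (!sketch_msi_route_invalid(true, hi))
                        printf("dest_id = 0x%x\n",
                               (unsigned)sketch_msi_dest_id(true, lo, hi));
                return 0;
        }
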
index 57549ed..730cf17 100644 (file)
@@ -115,26 +115,43 @@ static inline int apic_enabled(struct kvm_lapic *apic)
        (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
         APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
 
-/* The logical map is definitely wrong if we have multiple
- * modes at the same time.  (Physical map is always right.)
- */
-static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
-{
-       return !(map->mode & (map->mode - 1));
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+               u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+       switch (map->mode) {
+       case KVM_APIC_MODE_X2APIC: {
+               u32 offset = (dest_id >> 16) * 16;
+               u32 max_apic_id = map->max_apic_id;
+
+               if (offset <= max_apic_id) {
+                       u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+                       *cluster = &map->phys_map[offset];
+                       *mask = dest_id & (0xffff >> (16 - cluster_size));
+               } else {
+                       *mask = 0;
+               }
+
+               return true;
+               }
+       case KVM_APIC_MODE_XAPIC_FLAT:
+               *cluster = map->xapic_flat_map;
+               *mask = dest_id & 0xff;
+               return true;
+       case KVM_APIC_MODE_XAPIC_CLUSTER:
+               *cluster = map->xapic_cluster_map[dest_id >> 4];
+               *mask = dest_id & 0xf;
+               return true;
+       default:
+               /* Not optimized. */
+               return false;
+       }
 }
 
-static inline void
-apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+static void kvm_apic_map_free(struct rcu_head *rcu)
 {
-       unsigned lid_bits;
+       struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
 
-       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER !=  4);
-       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT    !=  8);
-       BUILD_BUG_ON(KVM_APIC_MODE_X2APIC        != 16);
-       lid_bits = map->mode;
-
-       *cid = dest_id >> lid_bits;
-       *lid = dest_id & ((1 << lid_bits) - 1);
+       kvfree(map);
 }
 
 static void recalculate_apic_map(struct kvm *kvm)
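
Aside: kvm_apic_map_get_logical_dest() above replaces the old cid/lid split with a (cluster pointer, 16-bit mask) pair. Flat xAPIC uses the 8-bit LDR as the mask directly, cluster xAPIC uses LDR bits 7:4 as the cluster index and bits 3:0 as the mask, and x2APIC clusters of 16 line up with physical IDs, so the physical map can be indexed at cluster * 16. A simplified user-space sketch (the struct layout below is illustrative, not the kernel's kvm_apic_map):

        /* Illustrative sketch only -- simplified from the lookup above. */
        #include <stdbool.h>
        #include <stdint.h>

        enum sketch_mode { SKETCH_XAPIC_FLAT, SKETCH_XAPIC_CLUSTER, SKETCH_X2APIC };

        struct sketch_map {
                enum sketch_mode mode;
                uint32_t max_apic_id;
                void *phys_map[256];            /* placeholder for struct kvm_lapic * */
                void *flat_map[8];
                void *cluster_map[16][4];
        };

        /* Returns false when the map mode is mixed/unknown and a slow scan is needed. */
        static bool sketch_get_logical_dest(struct sketch_map *map, uint32_t dest,
                                            void ***cluster, uint16_t *mask)
        {
                switch (map->mode) {
                case SKETCH_X2APIC: {
                        uint32_t offset = (dest >> 16) * 16;

                        if (offset <= map->max_apic_id) {
                                uint32_t size = map->max_apic_id - offset + 1;

                                if (size > 16)
                                        size = 16;
                                *cluster = &map->phys_map[offset];
                                *mask = dest & (0xffff >> (16 - size));
                        } else {
                                *mask = 0;
                        }
                        return true;
                }
                case SKETCH_XAPIC_FLAT:
                        *cluster = map->flat_map;
                        *mask = dest & 0xff;
                        return true;
                case SKETCH_XAPIC_CLUSTER:
                        *cluster = map->cluster_map[dest >> 4];
                        *mask = dest & 0xf;
                        return true;
                default:
                        return false;
                }
        }
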
@@ -142,17 +159,26 @@ static void recalculate_apic_map(struct kvm *kvm)
        struct kvm_apic_map *new, *old = NULL;
        struct kvm_vcpu *vcpu;
        int i;
-
-       new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+       u32 max_id = 255;
 
        mutex_lock(&kvm->arch.apic_map_lock);
 
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               if (kvm_apic_present(vcpu))
+                       max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+
+       new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
+                          sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
+
        if (!new)
                goto out;
 
+       new->max_apic_id = max_id;
+
        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvm_lapic *apic = vcpu->arch.apic;
-               u16 cid, lid;
+               struct kvm_lapic **cluster;
+               u16 mask;
                u32 ldr, aid;
 
                if (!kvm_apic_present(vcpu))
@@ -161,7 +187,7 @@ static void recalculate_apic_map(struct kvm *kvm)
                aid = kvm_apic_id(apic);
                ldr = kvm_lapic_get_reg(apic, APIC_LDR);
 
-               if (aid < ARRAY_SIZE(new->phys_map))
+               if (aid <= new->max_apic_id)
                        new->phys_map[aid] = apic;
 
                if (apic_x2apic_mode(apic)) {
@@ -174,13 +200,11 @@ static void recalculate_apic_map(struct kvm *kvm)
                                new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
                }
 
-               if (!kvm_apic_logical_map_valid(new))
+               if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
                        continue;
 
-               apic_logical_id(new, ldr, &cid, &lid);
-
-               if (lid && cid < ARRAY_SIZE(new->logical_map))
-                       new->logical_map[cid][ffs(lid) - 1] = apic;
+               if (mask)
+                       cluster[ffs(mask) - 1] = apic;
        }
 out:
        old = rcu_dereference_protected(kvm->arch.apic_map,
@@ -189,7 +213,7 @@ out:
        mutex_unlock(&kvm->arch.apic_map_lock);
 
        if (old)
-               kfree_rcu(old, rcu);
+               call_rcu(&old->rcu, kvm_apic_map_free);
 
        kvm_make_scan_ioapic_request(kvm);
 }
@@ -210,7 +234,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
        }
 }
 
-static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
 {
        kvm_lapic_set_reg(apic, APIC_ID, id << 24);
        recalculate_apic_map(apic->vcpu->kvm);
@@ -222,11 +246,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
        recalculate_apic_map(apic->vcpu->kvm);
 }
 
-static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
 {
        u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
 
-       kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+       kvm_lapic_set_reg(apic, APIC_ID, id);
        kvm_lapic_set_reg(apic, APIC_LDR, ldr);
        recalculate_apic_map(apic->vcpu->kvm);
 }
@@ -599,17 +623,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
        }
 }
 
-/* KVM APIC implementation has two quirks
- *  - dest always begins at 0 while xAPIC MDA has offset 24,
- *  - IOxAPIC messages have to be delivered (directly) to x2APIC.
+/* The KVM local APIC implementation has two quirks:
+ *
+ *  - the xAPIC MDA stores the destination at bits 24-31, while this
+ *    is not true of struct kvm_lapic_irq's dest_id field.  This is
+ *    just a quirk in the API and is not problematic.
+ *
+ *  - in-kernel IOAPIC messages have to be delivered directly to
+ *    x2APIC, because the kernel does not support interrupt remapping.
+ *    In order to support broadcast without interrupt remapping, x2APIC
+ *    rewrites the destination of non-IPI messages from APIC_BROADCAST
+ *    to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API.  This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
  */
-static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
-                                              struct kvm_lapic *target)
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+               struct kvm_lapic *source, struct kvm_lapic *target)
 {
        bool ipi = source != NULL;
        bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
 
-       if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+       if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+           !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
                return X2APIC_BROADCAST;
 
        return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
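
Aside: a compact sketch of kvm_apic_mda() above. An xAPIC MDA carries the 8-bit destination in bits 31:24 (SET_APIC_DEST_FIELD), an x2APIC MDA uses the 32-bit destination as-is, and unless the broadcast quirk has been disabled via KVM_CAP_X2APIC_API, a non-IPI 0xff destination aimed at an x2APIC target is rewritten to the x2APIC broadcast ID. The broadcast constants below are my reading of lapic.h and should be treated as assumptions.

        /* Illustrative sketch only. */
        #include <stdbool.h>
        #include <stdint.h>

        #define SKETCH_APIC_BROADCAST   0xffu          /* assumed APIC_BROADCAST   */
        #define SKETCH_X2APIC_BROADCAST 0xffffffffu    /* assumed X2APIC_BROADCAST */

        /* x2apic_mda: the IPI source, or otherwise the target APIC, is in x2APIC mode. */
        static uint32_t sketch_apic_mda(bool quirk_disabled, bool ipi,
                                        bool x2apic_mda, uint32_t dest_id)
        {
                if (!quirk_disabled && !ipi &&
                    dest_id == SKETCH_APIC_BROADCAST && x2apic_mda)
                        return SKETCH_X2APIC_BROADCAST;

                /* SET_APIC_DEST_FIELD() places an xAPIC destination in bits 31:24. */
                return x2apic_mda ? dest_id : dest_id << 24;
        }
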
@@ -619,7 +656,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int short_hand, unsigned int dest, int dest_mode)
 {
        struct kvm_lapic *target = vcpu->arch.apic;
-       u32 mda = kvm_apic_mda(dest, source, target);
+       u32 mda = kvm_apic_mda(vcpu, dest, source, target);
 
        apic_debug("target %p, source %p, dest 0x%x, "
                   "dest_mode 0x%x, short_hand 0x%x\n",
@@ -671,102 +708,126 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
        }
 }
 
-bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+               struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
 {
-       struct kvm_apic_map *map;
-       unsigned long bitmap = 1;
-       struct kvm_lapic **dst;
-       int i;
-       bool ret, x2apic_ipi;
+       if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+               if ((irq->dest_id == APIC_BROADCAST &&
+                               map->mode != KVM_APIC_MODE_X2APIC))
+                       return true;
+               if (irq->dest_id == X2APIC_BROADCAST)
+                       return true;
+       } else {
+               bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+               if (irq->dest_id == (x2apic_ipi ?
+                                    X2APIC_BROADCAST : APIC_BROADCAST))
+                       return true;
+       }
 
-       *r = -1;
+       return false;
+}
 
-       if (irq->shorthand == APIC_DEST_SELF) {
-               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
-               return true;
-       }
+/* Return true if the interrupt can be handled by using *bitmap as index mask
+ * for valid destinations in *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped.  In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+               struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+               struct kvm_apic_map *map, struct kvm_lapic ***dst,
+               unsigned long *bitmap)
+{
+       int i, lowest;
 
-       if (irq->shorthand)
+       if (irq->shorthand == APIC_DEST_SELF && src) {
+               *dst = src;
+               *bitmap = 1;
+               return true;
+       } else if (irq->shorthand)
                return false;
 
-       x2apic_ipi = src && apic_x2apic_mode(src);
-       if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+       if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
                return false;
 
-       ret = true;
-       rcu_read_lock();
-       map = rcu_dereference(kvm->arch.apic_map);
-
-       if (!map) {
-               ret = false;
-               goto out;
+       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+               if (irq->dest_id > map->max_apic_id) {
+                       *bitmap = 0;
+               } else {
+                       *dst = &map->phys_map[irq->dest_id];
+                       *bitmap = 1;
+               }
+               return true;
        }
 
-       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-               if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-                       goto out;
+       *bitmap = 0;
+       if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+                               (u16 *)bitmap))
+               return false;
 
-               dst = &map->phys_map[irq->dest_id];
-       } else {
-               u16 cid;
+       if (!kvm_lowest_prio_delivery(irq))
+               return true;
 
-               if (!kvm_apic_logical_map_valid(map)) {
-                       ret = false;
-                       goto out;
+       if (!kvm_vector_hashing_enabled()) {
+               lowest = -1;
+               for_each_set_bit(i, bitmap, 16) {
+                       if (!(*dst)[i])
+                               continue;
+                       if (lowest < 0)
+                               lowest = i;
+                       else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+                                               (*dst)[lowest]->vcpu) < 0)
+                               lowest = i;
                }
+       } else {
+               if (!*bitmap)
+                       return true;
 
-               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+               lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+                               bitmap, 16);
 
-               if (cid >= ARRAY_SIZE(map->logical_map))
-                       goto out;
+               if (!(*dst)[lowest]) {
+                       kvm_apic_disabled_lapic_found(kvm);
+                       *bitmap = 0;
+                       return true;
+               }
+       }
 
-               dst = map->logical_map[cid];
+       *bitmap = (lowest >= 0) ? 1 << lowest : 0;
 
-               if (!kvm_lowest_prio_delivery(irq))
-                       goto set_irq;
+       return true;
+}
 
-               if (!kvm_vector_hashing_enabled()) {
-                       int l = -1;
-                       for_each_set_bit(i, &bitmap, 16) {
-                               if (!dst[i])
-                                       continue;
-                               if (l < 0)
-                                       l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
-                                                       dst[l]->vcpu) < 0)
-                                       l = i;
-                       }
-                       bitmap = (l >= 0) ? 1 << l : 0;
-               } else {
-                       int idx;
-                       unsigned int dest_vcpus;
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+       struct kvm_apic_map *map;
+       unsigned long bitmap;
+       struct kvm_lapic **dst = NULL;
+       int i;
+       bool ret;
 
-                       dest_vcpus = hweight16(bitmap);
-                       if (dest_vcpus == 0)
-                               goto out;
+       *r = -1;
 
-                       idx = kvm_vector_to_index(irq->vector,
-                               dest_vcpus, &bitmap, 16);
+       if (irq->shorthand == APIC_DEST_SELF) {
+               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+               return true;
+       }
 
-                       if (!dst[idx]) {
-                               kvm_apic_disabled_lapic_found(kvm);
-                               goto out;
-                       }
+       rcu_read_lock();
+       map = rcu_dereference(kvm->arch.apic_map);
 
-                       bitmap = (idx >= 0) ? 1 << idx : 0;
+       ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+       if (ret)
+               for_each_set_bit(i, &bitmap, 16) {
+                       if (!dst[i])
+                               continue;
+                       if (*r < 0)
+                               *r = 0;
+                       *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
                }
-       }
 
-set_irq:
-       for_each_set_bit(i, &bitmap, 16) {
-               if (!dst[i])
-                       continue;
-               if (*r < 0)
-                       *r = 0;
-               *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
-       }
-out:
        rcu_read_unlock();
        return ret;
 }
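
Aside: once kvm_apic_map_get_dest_lapic() has produced a (dst, bitmap) pair, lowest-priority delivery narrows the bitmap to a single bit, either by scanning for the destination with the lowest arbitration priority or, when vector hashing is enabled, by handing the vector to kvm_vector_to_index(). My understanding of the latter is "take the Nth set bit, with N = vector modulo the number of destinations"; treat that rule as an assumption. A sketch of the hashing pick:

        /* Illustrative sketch only -- assumed behaviour of kvm_vector_to_index(). */
        #include <stdint.h>

        /* Pick the Nth set bit of a non-zero 16-bit mask, N = vector % popcount(mask). */
        static int sketch_vector_to_index(uint32_t vector, uint16_t mask)
        {
                int remaining, idx;

                if (!mask)
                        return -1;

                remaining = vector % __builtin_popcount(mask);
                for (idx = 0; idx < 16; idx++) {
                        if (!(mask & (1u << idx)))
                                continue;
                        if (remaining-- == 0)
                                return idx;
                }
                return -1;      /* not reached for a non-zero mask */
        }
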
@@ -789,8 +850,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu)
 {
        struct kvm_apic_map *map;
+       unsigned long bitmap;
+       struct kvm_lapic **dst = NULL;
        bool ret = false;
-       struct kvm_lapic *dst = NULL;
 
        if (irq->shorthand)
                return false;
@@ -798,69 +860,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.apic_map);
 
-       if (!map)
-               goto out;
-
-       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-               if (irq->dest_id == 0xFF)
-                       goto out;
-
-               if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-                       goto out;
-
-               dst = map->phys_map[irq->dest_id];
-               if (dst && kvm_apic_present(dst->vcpu))
-                       *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
-       } else {
-               u16 cid;
-               unsigned long bitmap = 1;
-               int i, r = 0;
-
-               if (!kvm_apic_logical_map_valid(map))
-                       goto out;
-
-               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
-
-               if (cid >= ARRAY_SIZE(map->logical_map))
-                       goto out;
-
-               if (kvm_vector_hashing_enabled() &&
-                               kvm_lowest_prio_delivery(irq)) {
-                       int idx;
-                       unsigned int dest_vcpus;
+       if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+                       hweight16(bitmap) == 1) {
+               unsigned long i = find_first_bit(&bitmap, 16);
 
-                       dest_vcpus = hweight16(bitmap);
-                       if (dest_vcpus == 0)
-                               goto out;
-
-                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
-                                                 &bitmap, 16);
-
-                       dst = map->logical_map[cid][idx];
-                       if (!dst) {
-                               kvm_apic_disabled_lapic_found(kvm);
-                               goto out;
-                       }
-
-                       *dest_vcpu = dst->vcpu;
-               } else {
-                       for_each_set_bit(i, &bitmap, 16) {
-                               dst = map->logical_map[cid][i];
-                               if (++r == 2)
-                                       goto out;
-                       }
-
-                       if (dst && kvm_apic_present(dst->vcpu))
-                               *dest_vcpu = dst->vcpu;
-                       else
-                               goto out;
+               if (dst[i]) {
+                       *dest_vcpu = dst[i]->vcpu;
+                       ret = true;
                }
        }
 
-       ret = true;
-out:
        rcu_read_unlock();
        return ret;
 }
@@ -1127,12 +1136,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
                return 0;
 
        switch (offset) {
-       case APIC_ID:
-               if (apic_x2apic_mode(apic))
-                       val = kvm_apic_id(apic);
-               else
-                       val = kvm_apic_id(apic) << 24;
-               break;
        case APIC_ARBPRI:
                apic_debug("Access APIC ARBPRI register which is for P6\n");
                break;
@@ -1314,6 +1317,108 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
                        nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
 }
 
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+       u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+       u64 ns = 0;
+       ktime_t expire;
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+       unsigned long flags;
+       ktime_t now;
+
+       if (unlikely(!tscdeadline || !this_tsc_khz))
+               return;
+
+       local_irq_save(flags);
+
+       now = apic->lapic_timer.timer.base->get_time();
+       guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+       if (likely(tscdeadline > guest_tsc)) {
+               ns = (tscdeadline - guest_tsc) * 1000000ULL;
+               do_div(ns, this_tsc_khz);
+               expire = ktime_add_ns(now, ns);
+               expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+               hrtimer_start(&apic->lapic_timer.timer,
+                               expire, HRTIMER_MODE_ABS_PINNED);
+       } else
+               apic_timer_expired(apic);
+
+       local_irq_restore(flags);
+}
+
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+{
+       kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+       apic->lapic_timer.hv_timer_in_use = false;
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+       WARN_ON(swait_active(&vcpu->wq));
+       cancel_hv_tscdeadline(apic);
+       apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+{
+       u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+       if (atomic_read(&apic->lapic_timer.pending) ||
+               kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+               if (apic->lapic_timer.hv_timer_in_use)
+                       cancel_hv_tscdeadline(apic);
+       } else {
+               apic->lapic_timer.hv_timer_in_use = true;
+               hrtimer_cancel(&apic->lapic_timer.timer);
+
+               /* In case the sw timer triggered in the window */
+               if (atomic_read(&apic->lapic_timer.pending))
+                       cancel_hv_tscdeadline(apic);
+       }
+       trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+                       apic->lapic_timer.hv_timer_in_use);
+       return apic->lapic_timer.hv_timer_in_use;
+}
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       WARN_ON(apic->lapic_timer.hv_timer_in_use);
+
+       if (apic_lvtt_tscdeadline(apic))
+               start_hv_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       /* Possibly the TSC deadline timer is not enabled yet */
+       if (!apic->lapic_timer.hv_timer_in_use)
+               return;
+
+       cancel_hv_tscdeadline(apic);
+
+       if (atomic_read(&apic->lapic_timer.pending))
+               return;
+
+       start_sw_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
        ktime_t now;
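
Aside: start_sw_tscdeadline() above turns the remaining guest-TSC distance into nanoseconds as ns = (tscdeadline - guest_tsc) * 1,000,000 / virtual_tsc_khz (the rate is in kHz, hence the factor of one million), then arms an hrtimer shortened by lapic_timer_advance_ns. A standalone sketch of just the arithmetic:

        /* Illustrative sketch only: the deadline-to-ns conversion used above. */
        #include <stdint.h>
        #include <stdio.h>

        static uint64_t sketch_deadline_to_ns(uint64_t tscdeadline, uint64_t guest_tsc,
                                              uint64_t tsc_khz)
        {
                if (tscdeadline <= guest_tsc || !tsc_khz)
                        return 0;       /* already expired, or no TSC rate known */

                /* cycles * 1,000,000 / kHz == cycles * 1e9 / Hz == nanoseconds */
                return (tscdeadline - guest_tsc) * 1000000ull / tsc_khz;
        }

        int main(void)
        {
                /* 2.5 GHz TSC, 5,000,000 cycles ahead -> 2,000,000 ns (2 ms). */
                printf("%llu ns\n",
                       (unsigned long long)sketch_deadline_to_ns(5000000, 0, 2500000));
                return 0;
        }
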
@@ -1360,32 +1465,8 @@ static void start_apic_timer(struct kvm_lapic *apic)
                           ktime_to_ns(ktime_add_ns(now,
                                        apic->lapic_timer.period)));
        } else if (apic_lvtt_tscdeadline(apic)) {
-               /* lapic timer in tsc deadline mode */
-               u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
-               u64 ns = 0;
-               ktime_t expire;
-               struct kvm_vcpu *vcpu = apic->vcpu;
-               unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
-               unsigned long flags;
-
-               if (unlikely(!tscdeadline || !this_tsc_khz))
-                       return;
-
-               local_irq_save(flags);
-
-               now = apic->lapic_timer.timer.base->get_time();
-               guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-               if (likely(tscdeadline > guest_tsc)) {
-                       ns = (tscdeadline - guest_tsc) * 1000000ULL;
-                       do_div(ns, this_tsc_khz);
-                       expire = ktime_add_ns(now, ns);
-                       expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
-                       hrtimer_start(&apic->lapic_timer.timer,
-                                     expire, HRTIMER_MODE_ABS_PINNED);
-               } else
-                       apic_timer_expired(apic);
-
-               local_irq_restore(flags);
+               if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+                       start_sw_tscdeadline(apic);
        }
 }
 
@@ -1413,7 +1494,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
        switch (reg) {
        case APIC_ID:           /* Local APIC ID */
                if (!apic_x2apic_mode(apic))
-                       kvm_apic_set_id(apic, val >> 24);
+                       kvm_apic_set_xapic_id(apic, val >> 24);
                else
                        ret = 1;
                break;
@@ -1674,9 +1755,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
        /* update jump label if enable bit changes */
        if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
-               if (value & MSR_IA32_APICBASE_ENABLE)
+               if (value & MSR_IA32_APICBASE_ENABLE) {
+                       kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
                        static_key_slow_dec_deferred(&apic_hw_disabled);
-               else
+               } else
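+                       kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);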
                        static_key_slow_inc(&apic_hw_disabled.key);
                recalculate_apic_map(vcpu->kvm);
        }
@@ -1716,8 +1798,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
        /* Stop the timer in case it's a reset to an active apic */
        hrtimer_cancel(&apic->lapic_timer.timer);
 
-       if (!init_event)
-               kvm_apic_set_id(apic, vcpu->vcpu_id);
+       if (!init_event) {
+               kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+                                        MSR_IA32_APICBASE_ENABLE);
+               kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+       }
        kvm_apic_set_version(apic->vcpu);
 
        for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -1856,9 +1941,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
         * thinking that APIC satet has changed.
         */
        vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
-       kvm_lapic_set_base(vcpu,
-                       APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
-
        static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
        kvm_lapic_reset(vcpu, false);
        kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@ -1938,17 +2020,48 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
        return vector;
 }
 
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
-               struct kvm_lapic_state *s)
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+               struct kvm_lapic_state *s, bool set)
+{
+       if (apic_x2apic_mode(vcpu->arch.apic)) {
+               u32 *id = (u32 *)(s->regs + APIC_ID);
+
+               if (vcpu->kvm->arch.x2apic_format) {
+                       if (*id != vcpu->vcpu_id)
+                               return -EINVAL;
+               } else {
+                       if (set)
+                               *id >>= 24;
+                       else
+                               *id <<= 24;
+               }
+       }
+
+       return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+       memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+       return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
+       int r;
+
 
        kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
        /* set SPIV separately to get count of SW disabled APICs right */
        apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+       r = kvm_apic_state_fixup(vcpu, s, true);
+       if (r)
+               return r;
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-       /* call kvm_apic_set_id() to put apic into apic_map */
-       kvm_apic_set_id(apic, kvm_apic_id(apic));
+
+       recalculate_apic_map(vcpu->kvm);
        kvm_apic_set_version(vcpu);
 
        apic_update_ppr(apic);
@@ -1974,6 +2087,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
                kvm_rtc_eoi_tracking_restore_one(vcpu);
 
        vcpu->arch.apic_arb_prio = 0;
+
+       return 0;
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
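
Aside: kvm_apic_state_fixup() above is what keeps the KVM_GET_LAPIC/KVM_SET_LAPIC layout stable now that the in-kernel APIC_ID register holds the full 32-bit x2APIC ID. Without KVM_CAP_X2APIC_API the ID is still exposed to userspace shifted into bits 31:24 and shifted back on set; with the capability enabled the full value is exposed and must equal the vcpu_id when userspace sets it. A condensed sketch of that conversion:

        /* Illustrative sketch only -- the get/set ID conversion above. */
        #include <stdbool.h>
        #include <stdint.h>

        /* Returns 0 on success, -1 if a full-format ID does not match vcpu_id. */
        static int sketch_fixup_x2apic_id(uint32_t *id, uint32_t vcpu_id,
                                          bool full_format, bool set)
        {
                if (full_format)
                        return (*id == vcpu_id) ? 0 : -1;

                if (set)                /* userspace -> kernel: drop the xAPIC shift */
                        *id >>= 24;
                else                    /* kernel -> userspace: re-apply it */
                        *id <<= 24;
                return 0;
        }
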
index 891c6da..f60d01c 100644 (file)
@@ -20,6 +20,7 @@ struct kvm_timer {
        u64 tscdeadline;
        u64 expired_tscdeadline;
        atomic_t pending;                       /* accumulated triggered timers */
+       bool hv_timer_in_use;
 };
 
 struct kvm_lapic {
@@ -80,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
-               struct kvm_lapic_state *s);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -199,9 +200,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
        return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 }
 
-static inline int kvm_apic_id(struct kvm_lapic *apic)
+static inline u32 kvm_apic_id(struct kvm_lapic *apic)
 {
-       return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+       /* To avoid a race between apic_base and following APIC_ID update when
+        * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
+        */
+       if (apic_x2apic_mode(apic))
+               return apic->vcpu->vcpu_id;
+
+       return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
 }
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
@@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu);
 int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
                        const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 #endif
index 745a5f4..3d4cc8c 100644 (file)
@@ -176,6 +176,7 @@ static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_present_mask;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -283,13 +284,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 }
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask)
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
 {
        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
+       shadow_present_mask = p_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -305,7 +307,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-       return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
+       return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -524,7 +526,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 }
 
 /* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changged.
+ * Update the state bits, it means the mapped pfn is not changed.
  *
  * Whenever we overwrite a writable spte with a read-only one we
  * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2246,10 +2248,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 {
        u64 spte;
 
-       BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
-                       VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+       BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
-       spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+       spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
               shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
 
        mmu_spte_set(sptep, spte);
@@ -2516,13 +2517,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
 {
-       u64 spte;
+       u64 spte = 0;
        int ret = 0;
 
        if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
                return 0;
 
-       spte = PT_PRESENT_MASK;
+       /*
+        * For the EPT case, shadow_present_mask is 0 if hardware
+        * supports exec-only page table entries.  In that case,
+        * ACC_USER_MASK and shadow_user_mask are used to represent
+        * read access.  See FNAME(gpte_access) in paging_tmpl.h.
+        */
+       spte |= shadow_present_mask;
        if (!speculative)
                spte |= shadow_accessed_mask;
 
@@ -3190,7 +3197,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                MMU_WARN_ON(VALID_PAGE(root));
                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
                        pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
-                       if (!is_present_gpte(pdptr)) {
+                       if (!(pdptr & PT_PRESENT_MASK)) {
                                vcpu->arch.mmu.pae_root[i] = 0;
                                continue;
                        }
@@ -3915,9 +3922,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
                                 *   clearer.
                                 */
                                smap = cr4_smap && u && !uf && !ff;
-                       } else
-                               /* Not really needed: no U/S accesses on ept  */
-                               u = 1;
+                       }
 
                        fault = (ff && !x) || (uf && !u) || (wf && !w) ||
                                (smapf && smap);
index 66b33b9..ddc56e9 100644 (file)
@@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
        return kvm_mmu_load(vcpu);
 }
 
-static inline int is_present_gpte(unsigned long pte)
-{
-       return pte & PT_PRESENT_MASK;
-}
-
 /*
  * Currently, we have two sorts of write-protection, a) the first one
  * write-protects guest page to sync the guest modification, b) another one is
index bc019f7..a011054 100644 (file)
@@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
 static inline int FNAME(is_present_gpte)(unsigned long pte)
 {
 #if PTTYPE != PTTYPE_EPT
-       return is_present_gpte(pte);
+       return pte & PT_PRESENT_MASK;
 #else
        return pte & 7;
 #endif
@@ -181,13 +181,19 @@ no_present:
        return true;
 }
 
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case
+ */
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
        unsigned access;
 #if PTTYPE == PTTYPE_EPT
        access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
                ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
-               ACC_USER_MASK;
+               ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
 #else
        BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
        BUILD_BUG_ON(ACC_EXEC_MASK != 1);
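
Aside: the mmu.c and paging_tmpl.h hunks above work together. When the CPU supports execute-only EPT entries, shadow_present_mask is 0 so set_spte() no longer forces bit 0 on, and a guest EPT PTE's read permission is carried through ACC_USER_MASK / shadow_user_mask instead. A compressed sketch of the guest-EPT-PTE to ACC_* translation (the bit values shown are how I read the VMX_EPT_* and ACC_* definitions; treat them as assumptions):

        /* Illustrative sketch only -- EPT gpte bits -> ACC_* access bits. */
        #include <stdint.h>

        #define SKETCH_EPT_READ   0x1u  /* assumed VMX_EPT_READABLE_MASK   */
        #define SKETCH_EPT_WRITE  0x2u  /* assumed VMX_EPT_WRITABLE_MASK   */
        #define SKETCH_EPT_EXEC   0x4u  /* assumed VMX_EPT_EXECUTABLE_MASK */

        #define SKETCH_ACC_EXEC   0x1u  /* assumed ACC_EXEC_MASK  */
        #define SKETCH_ACC_WRITE  0x2u  /* assumed ACC_WRITE_MASK */
        #define SKETCH_ACC_USER   0x4u  /* assumed ACC_USER_MASK, repurposed as "readable" */

        static unsigned int sketch_ept_gpte_access(uint64_t gpte)
        {
                return ((gpte & SKETCH_EPT_WRITE) ? SKETCH_ACC_WRITE : 0) |
                       ((gpte & SKETCH_EPT_EXEC)  ? SKETCH_ACC_EXEC  : 0) |
                       ((gpte & SKETCH_EPT_READ)  ? SKETCH_ACC_USER  : 0);
        }
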
index ab38af4..9d4a850 100644 (file)
@@ -93,7 +93,7 @@ static unsigned intel_find_fixed_event(int idx)
        return intel_arch_events[fixed_pmc_events[idx]].event_type;
 }
 
-/* check if a PMC is enabled by comparising it with globl_ctrl bits. */
+/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
 static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
 {
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
index 16ef31b..af523d8 100644 (file)
@@ -1577,7 +1577,7 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
        /*
-        * Any change of EFLAGS.VM is accompained by a reload of SS
+        * Any change of EFLAGS.VM is accompanied by a reload of SS
         * (caused by either a task switch or an inter-privilege IRET),
         * so we do not need to update the CPL here.
         */
@@ -4940,6 +4940,12 @@ out:
 static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
 {
        local_irq_enable();
+       /*
+        * We must have an instruction with interrupts enabled, so
+        * the timer interrupt isn't delayed by the interrupt shadow.
+        */
+       asm("nop");
+       local_irq_disable();
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
index 8de9250..0a6cc67 100644 (file)
@@ -1348,6 +1348,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
                  __entry->vec)
 );
 
+TRACE_EVENT(kvm_hv_timer_state,
+               TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+               TP_ARGS(vcpu_id, hv_timer_in_use),
+               TP_STRUCT__entry(
+                       __field(unsigned int, vcpu_id)
+                       __field(unsigned int, hv_timer_in_use)
+                       ),
+               TP_fast_assign(
+                       __entry->vcpu_id = vcpu_id;
+                       __entry->hv_timer_in_use = hv_timer_in_use;
+                       ),
+               TP_printk("vcpu_id %x hv_timer %x\n",
+                       __entry->vcpu_id,
+                       __entry->hv_timer_in_use)
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
index df07a0a..bc354f0 100644 (file)
@@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
 
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
+/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON                                           \
@@ -398,6 +405,12 @@ struct nested_vmx {
        /* The host-usable pointer to the above */
        struct page *current_vmcs12_page;
        struct vmcs12 *current_vmcs12;
+       /*
+        * Cache of the guest's VMCS, existing outside of guest memory.
+        * Loaded from guest memory during VMPTRLD. Flushed to guest
+        * memory during VMXOFF, VMCLEAR, VMPTRLD.
+        */
+       struct vmcs12 *cached_vmcs12;
        struct vmcs *current_shadow_vmcs;
        /*
         * Indicates if the shadow vmcs must be updated with the
@@ -421,7 +434,6 @@ struct nested_vmx {
        struct pi_desc *pi_desc;
        bool pi_pending;
        u16 posted_intr_nv;
-       u64 msr_ia32_feature_control;
 
        struct hrtimer preemption_timer;
        bool preemption_timer_expired;
@@ -597,11 +609,22 @@ struct vcpu_vmx {
 #define PML_ENTITY_NUM         512
        struct page *pml_pg;
 
+       /* apic deadline value in host tsc */
+       u64 hv_deadline_tsc;
+
        u64 current_tsc_ratio;
 
        bool guest_pkru_valid;
        u32 guest_pkru;
        u32 host_pkru;
+
+       /*
+        * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+        * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+        * in msr_ia32_feature_control_valid_bits.
+        */
+       u64 msr_ia32_feature_control;
+       u64 msr_ia32_feature_control_valid_bits;
 };
 
 enum segment_cache_field {
@@ -841,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
-       return to_vmx(vcpu)->nested.current_vmcs12;
+       return to_vmx(vcpu)->nested.cached_vmcs12;
 }
 
 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
@@ -1056,6 +1079,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86  - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+       u32 eax = cpuid_eax(0x00000001), i;
+
+       /* Clear the reserved bits */
+       eax &= ~(0x3U << 14 | 0xfU << 28);
+       for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+               if (eax == vmx_preemption_cpu_tfms[i])
+                       return true;
+
+       return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+       return vmcs_config.pin_based_exec_ctrl &
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline bool cpu_has_vmx_posted_intr(void)
 {
        return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -1603,6 +1678,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
        __vmcs_writel(field, __vmcs_readl(field) | mask);
 }
 
+static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+       vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+}
+
 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
 {
        vmcs_write32(VM_ENTRY_CONTROLS, val);
@@ -1631,6 +1711,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
        vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
 }
 
+static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+       vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+}
+
 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
 {
        vmcs_write32(VM_EXIT_CONTROLS, val);
@@ -2121,22 +2206,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
 
        if (!vmm_exclusive)
                kvm_cpu_vmxon(phys_addr);
-       else if (vmx->loaded_vmcs->cpu != cpu)
+       else if (!already_loaded)
                loaded_vmcs_clear(vmx->loaded_vmcs);
 
-       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
-               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
-               vmcs_load(vmx->loaded_vmcs->vmcs);
-       }
-
-       if (vmx->loaded_vmcs->cpu != cpu) {
-               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-               unsigned long sysenter_esp;
-
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       if (!already_loaded) {
                local_irq_disable();
                crash_disable_local_vmclear(cpu);
 
@@ -2151,6 +2228,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
                crash_enable_local_vmclear(cpu);
                local_irq_enable();
+       }
+
+       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+               vmcs_load(vmx->loaded_vmcs->vmcs);
+       }
+
+       if (!already_loaded) {
+               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+               unsigned long sysenter_esp;
+
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 
                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -2716,6 +2805,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
+               if (cpu_has_vmx_ept_execute_only())
+                       vmx->nested.nested_vmx_ept_caps |=
+                               VMX_EPT_EXECUTE_ONLY_BIT;
                vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
                /*
                 * For nested guests, we don't do anything specific
@@ -2864,6 +2956,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
        return 0;
 }
 
+static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+                                                uint64_t val)
+{
+       uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+
+       return !(val & ~valid_bits);
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -2905,10 +3005,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
-       case MSR_IA32_FEATURE_CONTROL:
-               if (!nested_vmx_allowed(vcpu))
+       case MSR_IA32_MCG_EXT_CTL:
+               if (!msr_info->host_initiated &&
+                   !(to_vmx(vcpu)->msr_ia32_feature_control &
+                     FEATURE_CONTROL_LMCE))
                        return 1;
-               msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+               msr_info->data = vcpu->arch.mcg_ext_ctl;
+               break;
+       case MSR_IA32_FEATURE_CONTROL:
+               msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
@@ -2998,12 +3103,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_TSC_ADJUST:
                ret = kvm_set_msr_common(vcpu, msr_info);
                break;
+       case MSR_IA32_MCG_EXT_CTL:
+               if ((!msr_info->host_initiated &&
+                    !(to_vmx(vcpu)->msr_ia32_feature_control &
+                      FEATURE_CONTROL_LMCE)) ||
+                   (data & ~MCG_EXT_CTL_LMCE_EN))
+                       return 1;
+               vcpu->arch.mcg_ext_ctl = data;
+               break;
        case MSR_IA32_FEATURE_CONTROL:
-               if (!nested_vmx_allowed(vcpu) ||
-                   (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+               if (!vmx_feature_control_msr_valid(vcpu, data) ||
+                   (to_vmx(vcpu)->msr_ia32_feature_control &
                     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
                        return 1;
-               vmx->nested.msr_ia32_feature_control = data;
+               vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);
                break;
@@ -3297,25 +3410,27 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                      vmx_capability.ept, vmx_capability.vpid);
        }
 
-       min = VM_EXIT_SAVE_DEBUG_CONTROLS;
+       min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-               VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
+               VM_EXIT_CLEAR_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
 
        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+                PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
 
+       if (cpu_has_broken_vmx_preemption_timer())
+               _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        if (!(_cpu_based_2nd_exec_control &
-               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
-               !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 
        min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3364,7 +3479,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 
        /*
         * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
-        * but due to arrata below it can't be used. Workaround is to use
+        * but due to errata below it can't be used. Workaround is to use
         * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
         *
         * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
@@ -4781,6 +4896,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 
        if (!kvm_vcpu_apicv_active(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+       /* Enable the preemption timer dynamically */
+       pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        return pin_based_exec_ctrl;
 }
 
@@ -4896,6 +5013,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       vmx->hv_deadline_tsc = -1;
 
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -6016,12 +6134,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(gpa, exit_qualification);
 
-       /* It is a write fault? */
-       error_code = exit_qualification & PFERR_WRITE_MASK;
+       /* it is a read fault? */
+       error_code = (exit_qualification << 2) & PFERR_USER_MASK;
+       /* it is a write fault? */
+       error_code |= exit_qualification & PFERR_WRITE_MASK;
        /* It is a fetch fault? */
        error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
        /* ept page table is present? */
-       error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
+       error_code |= (exit_qualification & 0x38) != 0;
 
        vcpu->arch.exit_qualification = exit_qualification;
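
Aside: the rewritten error-code assembly above maps EPT-violation exit-qualification bits onto page-fault error-code bits: a read access becomes PFERR_USER (repurposed here to mean "read"), a write stays PFERR_WRITE, an instruction fetch becomes PFERR_FETCH, and any of the R/W/X permission bits in qualification bits 5:3 marks the translation as present. The bit positions below reflect my reading of the SDM/KVM definitions and are assumptions rather than quoted constants.

        /* Illustrative sketch only -- exit qualification -> error code mapping. */
        #include <stdint.h>

        /* Assumed exit-qualification bits: 0 read, 1 write, 2 fetch, 5:3 RWX perms. */
        /* Assumed PFERR bit positions: present 0, write 1, user 2, fetch 4.          */
        static uint64_t sketch_ept_violation_error_code(uint64_t qual)
        {
                uint64_t ec;

                ec  = (qual << 2) & (1u << 2);          /* read   -> PFERR_USER  */
                ec |=  qual       & (1u << 1);          /* write  -> PFERR_WRITE */
                ec |= (qual << 2) & (1u << 4);          /* fetch  -> PFERR_FETCH */
                ec |= (qual & 0x38) != 0;               /* present: any of R/W/X granted */
                return ec;
        }
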
 
@@ -6355,9 +6475,6 @@ static __init int hardware_setup(void)
        for (msr = 0x800; msr <= 0x8ff; msr++)
                vmx_disable_intercept_msr_read_x2apic(msr);
 
-       /* According SDM, in x2apic mode, the whole id reg is used.  But in
-        * KVM, it only use the highest eight bits. Need to intercept it */
-       vmx_enable_intercept_msr_read_x2apic(0x802);
        /* TMCCT */
        vmx_enable_intercept_msr_read_x2apic(0x839);
        /* TPR */
@@ -6368,10 +6485,12 @@ static __init int hardware_setup(void)
        vmx_disable_intercept_msr_write_x2apic(0x83f);
 
        if (enable_ept) {
-               kvm_mmu_set_mask_ptes(0ull,
+               kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
                        (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
                        (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-                       0ull, VMX_EPT_EXECUTABLE_MASK);
+                       0ull, VMX_EPT_EXECUTABLE_MASK,
+                       cpu_has_vmx_ept_execute_only() ?
+                                     0ull : VMX_EPT_READABLE_MASK);
                ept_set_mmio_spte_mask();
                kvm_enable_tdp();
        } else
@@ -6393,8 +6512,21 @@ static __init int hardware_setup(void)
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
 
+       if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+               u64 vmx_msr;
+
+               rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+               cpu_preemption_timer_multi =
+                        vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+       } else {
+               kvm_x86_ops->set_hv_timer = NULL;
+               kvm_x86_ops->cancel_hv_timer = NULL;
+       }
+
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
+       kvm_mce_cap_supported |= MCG_LMCE_P;
+
        return alloc_kvm_area();
 
 out8:
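
Aside: hardware_setup() above caches the preemption-timer rate from MSR_IA32_VMX_MISC (VMX_MISC_PREEMPTION_TIMER_RATE_MASK covers the low bits; the value is the log2 of the TSC-to-timer-tick divider), and vmx_arm_hv_timer() near the end of this file converts the remaining host-TSC distance with a right shift by that value. A sketch of the conversion:

        /* Illustrative sketch only -- host TSC deadline -> preemption timer ticks. */
        #include <stdint.h>

        /* rate_shift is the value read from the low bits of MSR_IA32_VMX_MISC:
         * the preemption timer counts down once every 2^rate_shift TSC cycles. */
        static uint32_t sketch_preemption_timer_value(uint64_t deadline_tsc,
                                                      uint64_t now_tsc,
                                                      unsigned int rate_shift)
        {
                if (deadline_tsc <= now_tsc)
                        return 0;                       /* fire immediately */

                /* set_hv_timer guarantees the shifted delta fits in 32 bits. */
                return (uint32_t)((deadline_tsc - now_tsc) >> rate_shift);
        }
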
@@ -6862,16 +6994,22 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+       if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
+       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+       if (!vmx->nested.cached_vmcs12)
+               return -ENOMEM;
+
        if (enable_shadow_vmcs) {
                shadow_vmcs = alloc_vmcs();
-               if (!shadow_vmcs)
+               if (!shadow_vmcs) {
+                       kfree(vmx->nested.cached_vmcs12);
                        return -ENOMEM;
+               }
                /* mark vmcs as shadow */
                shadow_vmcs->revision_id |= (1u << 31);
                /* init shadow vmcs */
@@ -6942,6 +7080,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
                vmcs_write64(VMCS_LINK_POINTER, -1ull);
        }
        vmx->nested.posted_intr_nv = -1;
+
+       /* Flush VMCS12 to guest memory */
+       memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
+              VMCS12_SIZE);
+
        kunmap(vmx->nested.current_vmcs12_page);
        nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
@@ -6962,6 +7105,7 @@ static void free_nested(struct vcpu_vmx *vmx)
        nested_release_vmcs12(vmx);
        if (enable_shadow_vmcs)
                free_vmcs(vmx->nested.current_shadow_vmcs);
+       kfree(vmx->nested.cached_vmcs12);
        /* Unpin physical memory we referred to in current vmcs02 */
        if (vmx->nested.apic_access_page) {
                nested_release_page(vmx->nested.apic_access_page);
@@ -7365,6 +7509,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                vmx->nested.current_vmptr = vmptr;
                vmx->nested.current_vmcs12 = new_vmcs12;
                vmx->nested.current_vmcs12_page = page;
+               /*
+                * Load VMCS12 from guest memory since it is not already
+                * cached.
+                */
+               memcpy(vmx->nested.cached_vmcs12,
+                      vmx->nested.current_vmcs12, VMCS12_SIZE);
+
                if (enable_shadow_vmcs) {
                        vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
                                      SECONDARY_EXEC_SHADOW_VMCS);
@@ -7560,6 +7711,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       kvm_lapic_expired_hv_timer(vcpu);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7610,6 +7767,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
+       [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -7918,6 +8076,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * the XSS exit bitmap in vmcs12.
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return false;
        default:
                return true;
        }
@@ -8303,7 +8463,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
         * the next L2->L1 exit.
         */
        if (!is_guest_mode(vcpu) ||
-           !nested_cpu_has2(vmx->nested.current_vmcs12,
+           !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
 }
@@ -8436,7 +8596,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                        "push %[sp]\n\t"
 #endif
                        "pushf\n\t"
-                       "orl $0x200, (%%" _ASM_SP ")\n\t"
                        __ASM_SIZE(push) " $%c[cs]\n\t"
                        "call *%[entry]\n\t"
                        :
@@ -8449,8 +8608,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
-       } else
-               local_irq_enable();
+       }
 }
 
 static bool vmx_has_high_real_mode_segbase(void)
@@ -8601,6 +8759,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                                        msrs[i].host);
 }
 
+void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl;
+       u32 delta_tsc;
+
+       if (vmx->hv_deadline_tsc == -1)
+               return;
+
+       tscl = rdtsc();
+       if (vmx->hv_deadline_tsc > tscl)
+               /* sure to be 32 bit only because checked on set_hv_timer */
+               delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+                       cpu_preemption_timer_multi);
+       else
+               delta_tsc = 0;
+
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8650,6 +8828,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
+       vmx_arm_hv_timer(vcpu);
+
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -8940,6 +9120,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        vmx->nested.current_vmptr = -1ull;
        vmx->nested.current_vmcs12 = NULL;
 
+       vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
        return &vmx->vcpu;
 
 free_vmcs:
@@ -9080,6 +9262,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
        if (cpu_has_secondary_exec_ctrls())
                vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+       if (nested_vmx_allowed(vcpu))
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+                       FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+       else
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+                       ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9636,9 +9825,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
        exec_control = vmcs12->pin_based_vm_exec_control;
-       exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+       /* Preemption timer setting is only taken from vmcs01.  */
        exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       if (vmx->hv_deadline_tsc == -1)
+               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 
+       /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
                /*
                 * Note that we use L0's vector here and in
@@ -10556,8 +10750,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                                       vmcs12->vm_exit_intr_error_code,
                                       KVM_ISA_VMX);
 
-       vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
-       vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
+       vm_entry_controls_reset_shadow(vmx);
+       vm_exit_controls_reset_shadow(vmx);
        vmx_segment_cache_clear(vmx);
 
        /* if no vmcs02 cache requested, remove the one we used */
@@ -10566,8 +10760,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        load_vmcs12_host_state(vcpu, vmcs12);
 
-       /* Update TSC_OFFSET if TSC was changed while L2 ran */
+       /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+       if (vmx->hv_deadline_tsc == -1)
+               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                               PIN_BASED_VMX_PREEMPTION_TIMER);
+       else
+               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                             PIN_BASED_VMX_PREEMPTION_TIMER);
 
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
@@ -10647,6 +10847,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
        return X86EMUL_CONTINUE;
 }
 
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+                                 u64 divisor, u64 *result)
+{
+       u64 low = a << shift, high = a >> (64 - shift);
+
+       /* To avoid the overflow on divq */
+       if (high >= divisor)
+               return 1;
+
+       /* Low hold the result, high hold rem which is discarded */
+       asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+           "rm" (divisor), "0" (low), "1" (high));
+       *result = low;
+
+       return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl = rdtsc();
+       u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+       u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+
+       /* Convert to host delta tsc if tsc scaling is enabled */
+       if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+                       u64_shl_div_u64(delta_tsc,
+                               kvm_tsc_scaling_ratio_frac_bits,
+                               vcpu->arch.tsc_scaling_ratio,
+                               &delta_tsc))
+               return -ERANGE;
+
+       /*
+        * If the delta tsc can't fit in the 32 bit after the multi shift,
+        * we can't use the preemption timer.
+        * It's possible that it fits on later vmentries, but checking
+        * on every vmentry is costly so we just use an hrtimer.
+        */
+       if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+               return -ERANGE;
+
+       vmx->hv_deadline_tsc = tscl + delta_tsc;
+       vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+       return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       vmx->hv_deadline_tsc = -1;
+       vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
        if (ple_gap)
@@ -10691,7 +10949,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
  *   this case, return 1, otherwise, return 0.
  *
  */
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
        unsigned int dest;
@@ -10758,7 +11016,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+       if (pi_pre_block(vcpu))
+               return 1;
+
+       if (kvm_lapic_hv_timer_in_use(vcpu))
+               kvm_lapic_switch_to_sw_timer(vcpu);
+
+       return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
 {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
        struct pi_desc old, new;
@@ -10800,6 +11069,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
        }
 }
 
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+       if (kvm_x86_ops->set_hv_timer)
+               kvm_lapic_switch_to_hv_timer(vcpu);
+
+       pi_post_block(vcpu);
+}
+
 /*
  * vmx_update_pi_irte - set IRTE for Posted-Interrupts
  *
@@ -10844,7 +11121,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                 * We will support full lowest-priority interrupt later.
                 */
 
-               kvm_set_msi_irq(e, &irq);
+               kvm_set_msi_irq(kvm, e, &irq);
                if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
                        /*
                         * Make sure the IRTE is in remapped mode if
@@ -10889,6 +11166,16 @@ out:
        return ret;
 }
 
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+                       FEATURE_CONTROL_LMCE;
+       else
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+                       ~FEATURE_CONTROL_LMCE;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
@@ -11013,6 +11300,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .pmu_ops = &intel_pmu_ops,
 
        .update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+       .set_hv_timer = vmx_set_hv_timer,
+       .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+       .setup_mce = vmx_setup_mce,
 };
 
 static int __init vmx_init(void)
index 9c496c7..19f9f9e 100644 (file)
@@ -71,7 +71,8 @@
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
 #define emul_to_vcpu(ctxt) \
        container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@ -90,8 +91,12 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@ -114,7 +119,8 @@ u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
 u64  __read_mostly kvm_max_tsc_scaling_ratio;
 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -538,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
                goto out;
        }
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-               if (is_present_gpte(pdpte[i]) &&
+               if ((pdpte[i] & PT_PRESENT_MASK) &&
                    (pdpte[i] &
                     vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
                        ret = 0;
@@ -983,6 +989,7 @@ static u32 emulated_msrs[] = {
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
        MSR_IA32_MCG_CTL,
+       MSR_IA32_MCG_EXT_CTL,
        MSR_IA32_SMBASE,
 };
 
@@ -1162,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        int version;
        int r;
        struct pvclock_wall_clock wc;
-       struct timespec boot;
+       struct timespec64 boot;
 
        if (!wall_clock)
                return;
@@ -1185,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
         * wall clock specified here.  guest system time equals host
         * system time for us, thus we must fill in host boot time here.
         */
-       getboottime(&boot);
+       getboottime64(&boot);
 
        if (kvm->arch.kvmclock_offset) {
-               struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
-               boot = timespec_sub(boot, ts);
+               struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+               boot = timespec64_sub(boot, ts);
        }
-       wc.sec = boot.tv_sec;
+       wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
        wc.nsec = boot.tv_nsec;
        wc.version = version;
 
@@ -2616,6 +2623,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_TSC_CONTROL:
                r = kvm_has_tsc_control;
                break;
+       case KVM_CAP_X2APIC_API:
+               r = KVM_X2APIC_API_VALID_FLAGS;
+               break;
        default:
                r = 0;
                break;
@@ -2678,11 +2688,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
                break;
        }
        case KVM_X86_GET_MCE_CAP_SUPPORTED: {
-               u64 mce_cap;
-
-               mce_cap = KVM_MCE_CAP_SUPPORTED;
                r = -EFAULT;
-               if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+               if (copy_to_user(argp, &kvm_mce_cap_supported,
+                                sizeof(kvm_mce_cap_supported)))
                        goto out;
                r = 0;
                break;
@@ -2734,6 +2742,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                                rdtsc() - vcpu->arch.last_host_tsc;
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
+
+               if (kvm_lapic_hv_timer_in_use(vcpu) &&
+                               kvm_x86_ops->set_hv_timer(vcpu,
+                                       kvm_get_lapic_tscdeadline_msr(vcpu)))
+                       kvm_lapic_switch_to_sw_timer(vcpu);
                if (check_tsc_unstable()) {
                        u64 offset = kvm_compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
@@ -2767,15 +2780,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
        if (vcpu->arch.apicv_active)
                kvm_x86_ops->sync_pir_to_irr(vcpu);
 
-       memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-
-       return 0;
+       return kvm_apic_get_state(vcpu, s);
 }
 
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       kvm_apic_post_state_restore(vcpu, s);
+       int r;
+
+       r = kvm_apic_set_state(vcpu, s);
+       if (r)
+               return r;
        update_cr8_intercept(vcpu);
 
        return 0;
@@ -2860,7 +2875,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        r = -EINVAL;
        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
                goto out;
-       if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+       if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
                goto out;
        r = 0;
        vcpu->arch.mcg_cap = mcg_cap;
@@ -2870,6 +2885,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        /* Init IA32_MCi_CTL to all 1s */
        for (bank = 0; bank < bank_num; bank++)
                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+       if (kvm_x86_ops->setup_mce)
+               kvm_x86_ops->setup_mce(vcpu);
 out:
        return r;
 }
@@ -3768,7 +3786,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                r = -EEXIST;
                if (irqchip_in_kernel(kvm))
                        goto split_irqchip_unlock;
-               if (atomic_read(&kvm->online_vcpus))
+               if (kvm->created_vcpus)
                        goto split_irqchip_unlock;
                r = kvm_setup_empty_irq_routing(kvm);
                if (r)
@@ -3782,6 +3800,18 @@ split_irqchip_unlock:
                mutex_unlock(&kvm->lock);
                break;
        }
+       case KVM_CAP_X2APIC_API:
+               r = -EINVAL;
+               if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+                       break;
+
+               if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+                       kvm->arch.x2apic_format = true;
+               if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+                       kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@ -3833,7 +3863,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (kvm->arch.vpic)
                        goto create_irqchip_unlock;
                r = -EINVAL;
-               if (atomic_read(&kvm->online_vcpus))
+               if (kvm->created_vcpus)
                        goto create_irqchip_unlock;
                r = -ENOMEM;
                vpic = kvm_create_pic(kvm);
@@ -3873,7 +3903,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                                   sizeof(struct kvm_pit_config)))
                        goto out;
        create_pit:
-               mutex_lock(&kvm->slots_lock);
+               mutex_lock(&kvm->lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
@@ -3882,7 +3912,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->slots_lock);
+               mutex_unlock(&kvm->lock);
                break;
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
@@ -3989,7 +4019,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
        case KVM_SET_BOOT_CPU_ID:
                r = 0;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) != 0)
+               if (kvm->created_vcpus)
                        r = -EBUSY;
                else
                        kvm->arch.bsp_vcpu_id = arg;
@@ -5297,13 +5327,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu)
                /* This is a good place to trace that we are exiting SMM.  */
                trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
 
-               if (unlikely(vcpu->arch.smi_pending)) {
-                       kvm_make_request(KVM_REQ_SMI, vcpu);
-                       vcpu->arch.smi_pending = 0;
-               } else {
-                       /* Process a latched INIT, if any.  */
-                       kvm_make_request(KVM_REQ_EVENT, vcpu);
-               }
+               /* Process a latched INIT or SMI, if any.  */
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
        kvm_mmu_reset_context(vcpu);
@@ -5849,8 +5874,8 @@ int kvm_arch_init(void *opaque)
        kvm_x86_ops = ops;
 
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
+                       PT_DIRTY_MASK, PT64_NX_MASK, 0,
+                       PT_PRESENT_MASK);
        kvm_timer_init();
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6084,7 +6109,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
        }
 
        /* try to inject new event if pending */
-       if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+       if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+               vcpu->arch.smi_pending = false;
+               enter_smm(vcpu);
+       } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
                --vcpu->arch.nmi_pending;
                vcpu->arch.nmi_injected = true;
                kvm_x86_ops->set_nmi(vcpu);
@@ -6107,6 +6135,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
                        kvm_x86_ops->set_irq(vcpu);
                }
        }
+
        return 0;
 }
 
@@ -6130,7 +6159,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 #define put_smstate(type, buf, offset, val)                      \
        *(type *)((buf) + (offset) - 0x7e00) = val
 
-static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
 {
        u32 flags = 0;
        flags |= seg->g       << 23;
@@ -6144,7 +6173,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
        return flags;
 }
 
-static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 {
        struct kvm_segment seg;
        int offset;
@@ -6159,11 +6188,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 
        put_smstate(u32, buf, offset + 8, seg.base);
        put_smstate(u32, buf, offset + 4, seg.limit);
-       put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
 }
 
 #ifdef CONFIG_X86_64
-static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
        struct kvm_segment seg;
        int offset;
@@ -6172,7 +6201,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
        kvm_get_segment(vcpu, &seg, n);
        offset = 0x7e00 + n * 16;
 
-       flags = process_smi_get_segment_flags(&seg) >> 8;
+       flags = enter_smm_get_segment_flags(&seg) >> 8;
        put_smstate(u16, buf, offset, seg.selector);
        put_smstate(u16, buf, offset + 2, flags);
        put_smstate(u32, buf, offset + 4, seg.limit);
@@ -6180,7 +6209,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 }
 #endif
 
-static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 {
        struct desc_ptr dt;
        struct kvm_segment seg;
@@ -6204,13 +6233,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7fc4, seg.selector);
        put_smstate(u32, buf, 0x7f64, seg.base);
        put_smstate(u32, buf, 0x7f60, seg.limit);
-       put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
 
        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
        put_smstate(u32, buf, 0x7fc0, seg.selector);
        put_smstate(u32, buf, 0x7f80, seg.base);
        put_smstate(u32, buf, 0x7f7c, seg.limit);
-       put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
 
        kvm_x86_ops->get_gdt(vcpu, &dt);
        put_smstate(u32, buf, 0x7f74, dt.address);
@@ -6221,7 +6250,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7f54, dt.size);
 
        for (i = 0; i < 6; i++)
-               process_smi_save_seg_32(vcpu, buf, i);
+               enter_smm_save_seg_32(vcpu, buf, i);
 
        put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
 
@@ -6230,7 +6259,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
 }
 
-static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 {
 #ifdef CONFIG_X86_64
        struct desc_ptr dt;
@@ -6262,7 +6291,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 
        kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
        put_smstate(u16, buf, 0x7e90, seg.selector);
-       put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+       put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
        put_smstate(u32, buf, 0x7e94, seg.limit);
        put_smstate(u64, buf, 0x7e98, seg.base);
 
@@ -6272,7 +6301,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 
        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
        put_smstate(u16, buf, 0x7e70, seg.selector);
-       put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+       put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
        put_smstate(u32, buf, 0x7e74, seg.limit);
        put_smstate(u64, buf, 0x7e78, seg.base);
 
@@ -6281,31 +6310,26 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u64, buf, 0x7e68, dt.address);
 
        for (i = 0; i < 6; i++)
-               process_smi_save_seg_64(vcpu, buf, i);
+               enter_smm_save_seg_64(vcpu, buf, i);
 #else
        WARN_ON_ONCE(1);
 #endif
 }
 
-static void process_smi(struct kvm_vcpu *vcpu)
+static void enter_smm(struct kvm_vcpu *vcpu)
 {
        struct kvm_segment cs, ds;
        struct desc_ptr dt;
        char buf[512];
        u32 cr0;
 
-       if (is_smm(vcpu)) {
-               vcpu->arch.smi_pending = true;
-               return;
-       }
-
        trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        vcpu->arch.hflags |= HF_SMM_MASK;
        memset(buf, 0, 512);
        if (guest_cpuid_has_longmode(vcpu))
-               process_smi_save_state_64(vcpu, buf);
+               enter_smm_save_state_64(vcpu, buf);
        else
-               process_smi_save_state_32(vcpu, buf);
+               enter_smm_save_state_32(vcpu, buf);
 
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 
@@ -6361,6 +6385,12 @@ static void process_smi(struct kvm_vcpu *vcpu)
        kvm_mmu_reset_context(vcpu);
 }
 
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.smi_pending = true;
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
        kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@ -6555,8 +6585,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
                if (inject_pending_event(vcpu, req_int_win) != 0)
                        req_immediate_exit = true;
-               /* enable NMI/IRQ window open exits if needed */
                else {
+                       /* Enable NMI/IRQ window open exits if needed.
+                        *
+                        * SMIs have two cases: 1) they can be nested, and
+                        * then there is nothing to do here because RSM will
+                        * cause a vmexit anyway; 2) or the SMI can be pending
+                        * because inject_pending_event has completed the
+                        * injection of an IRQ or NMI from the previous vmexit,
+                        * and then we request an immediate exit to inject the SMI.
+                        */
+                       if (vcpu->arch.smi_pending && !is_smm(vcpu))
+                               req_immediate_exit = true;
                        if (vcpu->arch.nmi_pending)
                                kvm_x86_ops->enable_nmi_window(vcpu);
                        if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -6607,12 +6647,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        kvm_load_guest_xcr0(vcpu);
 
-       if (req_immediate_exit)
+       if (req_immediate_exit) {
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
                smp_send_reschedule(vcpu->cpu);
+       }
 
        trace_kvm_entry(vcpu->vcpu_id);
        wait_lapic_expire(vcpu);
-       __kvm_guest_enter();
+       guest_enter_irqoff();
 
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
@@ -6663,16 +6705,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        ++vcpu->stat.exits;
 
-       /*
-        * We must have an instruction between local_irq_enable() and
-        * kvm_guest_exit(), so the timer interrupt isn't delayed by
-        * the interrupt shadow.  The stat.exits increment will do nicely.
-        * But we need to prevent reordering, hence this barrier():
-        */
-       barrier();
-
-       kvm_guest_exit();
+       guest_exit_irqoff();
 
+       local_irq_enable();
        preempt_enable();
 
        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -7409,6 +7444,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        vcpu->arch.hflags = 0;
 
+       vcpu->arch.smi_pending = 0;
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
        vcpu->arch.nmi_injected = false;
@@ -7601,11 +7637,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
        return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
 }
 
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
-{
-       return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
-}
-
 struct static_key kvm_no_apic_vcpu __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
 
@@ -7872,7 +7903,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
-       kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
        kvm_mmu_uninit_vm(kvm);
 }
 
@@ -8380,7 +8411,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
        /*
         * When producer of consumer is unregistered, we change back to
         * remapped mode, so we can re-use the current implementation
-        * when the irq is masked/disabed or the consumer side (KVM
+        * when the irq is masked/disabled or the consumer side (KVM
         * int this case doesn't want to receive the interrupts.
        */
        ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
index 8196054..7b6a9d1 100644 (file)
@@ -133,7 +133,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev)
        if (pci_probe & PCI_NOASSIGN_BARS) {
                /*
                * If the BIOS did not assign the BAR, zero out the
-               * resource so the kernel doesn't attmept to assign
+               * resource so the kernel doesn't attempt to assign
                * it later on in pci_assign_unassigned_resources
                */
                for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) {
index 613cac7..e88b417 100644 (file)
@@ -119,10 +119,11 @@ static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 static void vmd_irq_enable(struct irq_data *data)
 {
        struct vmd_irq *vmdirq = data->chip_data;
+       unsigned long flags;
 
-       raw_spin_lock(&list_lock);
+       raw_spin_lock_irqsave(&list_lock, flags);
        list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
-       raw_spin_unlock(&list_lock);
+       raw_spin_unlock_irqrestore(&list_lock, flags);
 
        data->chip->irq_unmask(data);
 }
@@ -130,12 +131,14 @@ static void vmd_irq_enable(struct irq_data *data)
 static void vmd_irq_disable(struct irq_data *data)
 {
        struct vmd_irq *vmdirq = data->chip_data;
+       unsigned long flags;
 
        data->chip->irq_mask(data);
 
-       raw_spin_lock(&list_lock);
+       raw_spin_lock_irqsave(&list_lock, flags);
        list_del_rcu(&vmdirq->node);
-       raw_spin_unlock(&list_lock);
+       INIT_LIST_HEAD_RCU(&vmdirq->node);
+       raw_spin_unlock_irqrestore(&list_lock, flags);
 }
 
 /*
@@ -166,16 +169,20 @@ static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
  * XXX: We can be even smarter selecting the best IRQ once we solve the
  * affinity problem.
  */
-static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd)
+static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
 {
-       int i, best = 0;
+       int i, best = 1;
+       unsigned long flags;
 
-       raw_spin_lock(&list_lock);
+       if (!desc->msi_attrib.is_msix || vmd->msix_count == 1)
+               return &vmd->irqs[0];
+
+       raw_spin_lock_irqsave(&list_lock, flags);
        for (i = 1; i < vmd->msix_count; i++)
                if (vmd->irqs[i].count < vmd->irqs[best].count)
                        best = i;
        vmd->irqs[best].count++;
-       raw_spin_unlock(&list_lock);
+       raw_spin_unlock_irqrestore(&list_lock, flags);
 
        return &vmd->irqs[best];
 }
@@ -184,14 +191,15 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
                        unsigned int virq, irq_hw_number_t hwirq,
                        msi_alloc_info_t *arg)
 {
-       struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(arg->desc)->bus);
+       struct msi_desc *desc = arg->desc;
+       struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
        struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
 
        if (!vmdirq)
                return -ENOMEM;
 
        INIT_LIST_HEAD(&vmdirq->node);
-       vmdirq->irq = vmd_next_irq(vmd);
+       vmdirq->irq = vmd_next_irq(vmd, desc);
        vmdirq->virq = virq;
 
        irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip,
@@ -203,11 +211,12 @@ static void vmd_msi_free(struct irq_domain *domain,
                        struct msi_domain_info *info, unsigned int virq)
 {
        struct vmd_irq *vmdirq = irq_get_chip_data(virq);
+       unsigned long flags;
 
        /* XXX: Potential optimization to rebalance */
-       raw_spin_lock(&list_lock);
+       raw_spin_lock_irqsave(&list_lock, flags);
        vmdirq->irq->count--;
-       raw_spin_unlock(&list_lock);
+       raw_spin_unlock_irqrestore(&list_lock, flags);
 
        kfree_rcu(vmdirq, rcu);
 }
@@ -261,7 +270,7 @@ static struct device *to_vmd_dev(struct device *dev)
 
 static struct dma_map_ops *vmd_dma_ops(struct device *dev)
 {
-       return to_vmd_dev(dev)->archdata.dma_ops;
+       return get_dma_ops(to_vmd_dev(dev));
 }
 
 static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr,
@@ -367,7 +376,7 @@ static void vmd_teardown_dma_ops(struct vmd_dev *vmd)
 {
        struct dma_domain *domain = &vmd->dma_domain;
 
-       if (vmd->dev->dev.archdata.dma_ops)
+       if (get_dma_ops(&vmd->dev->dev))
                del_dma_domain(domain);
 }
 
@@ -379,7 +388,7 @@ static void vmd_teardown_dma_ops(struct vmd_dev *vmd)
 
 static void vmd_setup_dma_ops(struct vmd_dev *vmd)
 {
-       const struct dma_map_ops *source = vmd->dev->dev.archdata.dma_ops;
+       const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev);
        struct dma_map_ops *dest = &vmd->dma_ops;
        struct dma_domain *domain = &vmd->dma_domain;
 
@@ -594,7 +603,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
        sd->node = pcibus_to_node(vmd->dev->bus);
 
        vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info,
-                                                   NULL);
+                                                   x86_vector_domain);
        if (!vmd->irq_domain)
                return -ENODEV;
 
index 12734a9..ac58c16 100644 (file)
@@ -8,6 +8,8 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
 LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
 targets += purgatory.ro
 
+KCOV_INSTRUMENT := n
+
 # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
 # in turn leaves some undefined symbols like __fentry__ in purgatory and not
 # sure how to relocate those. Like kexec-tools, use custom flags.
index c556c5a..25012ab 100644 (file)
@@ -48,7 +48,7 @@ targets += realmode.lds
 $(obj)/realmode.lds: $(obj)/pasyms.h
 
 LDFLAGS_realmode.elf := --emit-relocs -T
-CPPFLAGS_realmode.lds += -P -C -I$(obj)
+CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj)
 
 targets += realmode.elf
 $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
index aebd944..445ce28 100644 (file)
@@ -221,6 +221,9 @@ config ACPI_PROCESSOR_IDLE
        bool
        select CPU_IDLE
 
+config ACPI_MCFG
+       bool
+
 config ACPI_CPPC_LIB
        bool
        depends on ACPI_PROCESSOR
index 35a6ccb..5ae9d85 100644 (file)
@@ -40,6 +40,7 @@ acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o
 acpi-y                         += ec.o
 acpi-$(CONFIG_ACPI_DOCK)       += dock.o
 acpi-y                         += pci_root.o pci_link.o pci_irq.o
+obj-$(CONFIG_ACPI_MCFG)                += pci_mcfg.o
 acpi-y                         += acpi_lpss.o acpi_apd.o
 acpi-y                         += acpi_platform.o
 acpi-y                         += acpi_pnp.o
diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
new file mode 100644 (file)
index 0000000..b5b376e
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2016 Broadcom
+ *     Author: Jayachandran C <jchandra@broadcom.com>
+ * Copyright (C) 2016 Semihalf
+ *     Author: Tomasz Nowicki <tn@semihalf.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+#define pr_fmt(fmt) "ACPI: " fmt
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/pci-acpi.h>
+
+/* Structure to hold entries from the MCFG table */
+struct mcfg_entry {
+       struct list_head        list;
+       phys_addr_t             addr;
+       u16                     segment;
+       u8                      bus_start;
+       u8                      bus_end;
+};
+
+/* List to save MCFG entries */
+static LIST_HEAD(pci_mcfg_list);
+
+phys_addr_t pci_mcfg_lookup(u16 seg, struct resource *bus_res)
+{
+       struct mcfg_entry *e;
+
+       /*
+        * We expect exact match, unless MCFG entry end bus covers more than
+        * specified by caller.
+        */
+       list_for_each_entry(e, &pci_mcfg_list, list) {
+               if (e->segment == seg && e->bus_start == bus_res->start &&
+                   e->bus_end >= bus_res->end)
+                       return e->addr;
+       }
+
+       return 0;
+}
+
+static __init int pci_mcfg_parse(struct acpi_table_header *header)
+{
+       struct acpi_table_mcfg *mcfg;
+       struct acpi_mcfg_allocation *mptr;
+       struct mcfg_entry *e, *arr;
+       int i, n;
+
+       if (header->length < sizeof(struct acpi_table_mcfg))
+               return -EINVAL;
+
+       n = (header->length - sizeof(struct acpi_table_mcfg)) /
+                                       sizeof(struct acpi_mcfg_allocation);
+       mcfg = (struct acpi_table_mcfg *)header;
+       mptr = (struct acpi_mcfg_allocation *) &mcfg[1];
+
+       arr = kcalloc(n, sizeof(*arr), GFP_KERNEL);
+       if (!arr)
+               return -ENOMEM;
+
+       for (i = 0, e = arr; i < n; i++, mptr++, e++) {
+               e->segment = mptr->pci_segment;
+               e->addr =  mptr->address;
+               e->bus_start = mptr->start_bus_number;
+               e->bus_end = mptr->end_bus_number;
+               list_add(&e->list, &pci_mcfg_list);
+       }
+
+       pr_info("MCFG table detected, %d entries\n", n);
+       return 0;
+}
+
+/* Interface called by ACPI - parse and save MCFG table */
+void __init pci_mmcfg_late_init(void)
+{
+       int err = acpi_table_parse(ACPI_SIG_MCFG, pci_mcfg_parse);
+       if (err)
+               pr_err("Failed to parse MCFG (%d)\n", err);
+}
index ae3fe4e..d144168 100644 (file)
@@ -720,6 +720,36 @@ next:
        }
 }
 
+static void acpi_pci_root_remap_iospace(struct resource_entry *entry)
+{
+#ifdef PCI_IOBASE
+       struct resource *res = entry->res;
+       resource_size_t cpu_addr = res->start;
+       resource_size_t pci_addr = cpu_addr - entry->offset;
+       resource_size_t length = resource_size(res);
+       unsigned long port;
+
+       if (pci_register_io_range(cpu_addr, length))
+               goto err;
+
+       port = pci_address_to_pio(cpu_addr);
+       if (port == (unsigned long)-1)
+               goto err;
+
+       res->start = port;
+       res->end = port + length - 1;
+       entry->offset = port - pci_addr;
+
+       if (pci_remap_iospace(res, cpu_addr) < 0)
+               goto err;
+
+       pr_info("Remapped I/O %pa to %pR\n", &cpu_addr, res);
+       return;
+err:
+       res->flags |= IORESOURCE_DISABLED;
+#endif
+}
+
 int acpi_pci_probe_root_resources(struct acpi_pci_root_info *info)
 {
        int ret;
@@ -740,6 +770,9 @@ int acpi_pci_probe_root_resources(struct acpi_pci_root_info *info)
                        "no IO and memory resources present in _CRS\n");
        else {
                resource_list_for_each_entry_safe(entry, tmp, list) {
+                       if (entry->res->flags & IORESOURCE_IO)
+                               acpi_pci_root_remap_iospace(entry);
+
                        if (entry->res->flags & IORESOURCE_DISABLED)
                                resource_list_destroy_entry(entry);
                        else
@@ -811,6 +844,8 @@ static void acpi_pci_root_release_info(struct pci_host_bridge *bridge)
 
        resource_list_for_each_entry(entry, &bridge->windows) {
                res = entry->res;
+               if (res->flags & IORESOURCE_IO)
+                       pci_unmap_iospace(res);
                if (res->parent &&
                    (res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
                        release_resource(res);
index 4506620..1a04af6 100644 (file)
@@ -1937,7 +1937,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
 
-       osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+       osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
        if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
                             obj_request->object_name))
                goto fail;
@@ -1991,7 +1991,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
 
-       osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+       osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
        if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
                             obj_request->object_name))
                goto fail;
@@ -3995,10 +3995,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 
        /* Initialize the layout used for all rbd requests */
 
-       rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-       rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
-       rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-       rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+       rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
+       rbd_dev->layout.stripe_count = 1;
+       rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
+       rbd_dev->layout.pool_id = spec->pool_id;
+       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
 
        /*
         * If this is a mapping rbd_dev (as opposed to a parent one),
@@ -5187,7 +5188,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
-       rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+       rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
        if (rbd_dev->image_format == 1)
                ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
                                       spec->image_name, RBD_SUFFIX);
index 8f11b87..eae5ef9 100644 (file)
@@ -860,3 +860,35 @@ void drm_dp_aux_unregister(struct drm_dp_aux *aux)
        i2c_del_adapter(&aux->ddc);
 }
 EXPORT_SYMBOL(drm_dp_aux_unregister);
+
+#define PSR_SETUP_TIME(x) [DP_PSR_SETUP_TIME_ ## x >> DP_PSR_SETUP_TIME_SHIFT] = (x)
+
+/**
+ * drm_dp_psr_setup_time() - PSR setup in time usec
+ * @psr_cap: PSR capabilities from DPCD
+ *
+ * Returns:
+ * PSR setup time for the panel in microseconds,  negative
+ * error code on failure.
+ */
+int drm_dp_psr_setup_time(const u8 psr_cap[EDP_PSR_RECEIVER_CAP_SIZE])
+{
+       static const u16 psr_setup_time_us[] = {
+               PSR_SETUP_TIME(330),
+               PSR_SETUP_TIME(275),
+               PSR_SETUP_TIME(165),
+               PSR_SETUP_TIME(110),
+               PSR_SETUP_TIME(55),
+               PSR_SETUP_TIME(0),
+       };
+       int i;
+
+       i = (psr_cap[1] & DP_PSR_SETUP_TIME_MASK) >> DP_PSR_SETUP_TIME_SHIFT;
+       if (i >= ARRAY_SIZE(psr_setup_time_us))
+               return -EINVAL;
+
+       return psr_setup_time_us[i];
+}
+EXPORT_SYMBOL(drm_dp_psr_setup_time);
+
+#undef PSR_SETUP_TIME
index 3329fc6..cc937a1 100644 (file)
@@ -1730,6 +1730,8 @@ bool intel_sdvo_init(struct drm_device *dev,
 
 
 /* intel_sprite.c */
+int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
+                            int usecs);
 int intel_plane_init(struct drm_device *dev, enum pipe pipe, int plane);
 int intel_sprite_set_colorkey(struct drm_device *dev, void *data,
                              struct drm_file *file_priv);
index 68bd0bb..2b0d1ba 100644 (file)
@@ -327,6 +327,9 @@ static bool intel_psr_match_conditions(struct intel_dp *intel_dp)
        struct drm_i915_private *dev_priv = to_i915(dev);
        struct drm_crtc *crtc = dig_port->base.base.crtc;
        struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+       const struct drm_display_mode *adjusted_mode =
+               &intel_crtc->config->base.adjusted_mode;
+       int psr_setup_time;
 
        lockdep_assert_held(&dev_priv->psr.lock);
        WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex));
@@ -365,11 +368,25 @@ static bool intel_psr_match_conditions(struct intel_dp *intel_dp)
        }
 
        if (IS_HASWELL(dev) &&
-           intel_crtc->config->base.adjusted_mode.flags & DRM_MODE_FLAG_INTERLACE) {
+           adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) {
                DRM_DEBUG_KMS("PSR condition failed: Interlaced is Enabled\n");
                return false;
        }
 
+       psr_setup_time = drm_dp_psr_setup_time(intel_dp->psr_dpcd);
+       if (psr_setup_time < 0) {
+               DRM_DEBUG_KMS("PSR condition failed: Invalid PSR setup time (0x%02x)\n",
+                             intel_dp->psr_dpcd[1]);
+               return false;
+       }
+
+       if (intel_usecs_to_scanlines(adjusted_mode, psr_setup_time) >
+           adjusted_mode->crtc_vtotal - adjusted_mode->crtc_vdisplay - 1) {
+               DRM_DEBUG_KMS("PSR condition failed: PSR setup time (%d us) too long\n",
+                             psr_setup_time);
+               return false;
+       }
+
        dev_priv->psr.source_ok = true;
        return true;
 }
index 0de935a..7c08e4f 100644 (file)
@@ -53,8 +53,8 @@ format_is_yuv(uint32_t format)
        }
 }
 
-static int usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
-                             int usecs)
+int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
+                            int usecs)
 {
        /* paranoia */
        if (!adjusted_mode->crtc_htotal)
@@ -91,7 +91,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
                vblank_start = DIV_ROUND_UP(vblank_start, 2);
 
        /* FIXME needs to be calibrated sensibly */
-       min = vblank_start - usecs_to_scanlines(adjusted_mode, 100);
+       min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
        max = vblank_start - 1;
 
        local_irq_disable();
index 5495a5b..7f87289 100644 (file)
@@ -21,9 +21,9 @@ config ARM_GIC_MAX_NR
 
 config ARM_GIC_V2M
        bool
-       depends on ARM_GIC
-       depends on PCI && PCI_MSI
-       select PCI_MSI_IRQ_DOMAIN
+       depends on PCI
+       select ARM_GIC
+       select PCI_MSI
 
 config GIC_NON_BANKED
        bool
@@ -37,7 +37,8 @@ config ARM_GIC_V3
 
 config ARM_GIC_V3_ITS
        bool
-       select PCI_MSI_IRQ_DOMAIN
+       depends on PCI
+       depends on PCI_MSI
 
 config ARM_NVIC
        bool
@@ -62,13 +63,13 @@ config ARM_VIC_NR
 config ARMADA_370_XP_IRQ
        bool
        select GENERIC_IRQ_CHIP
-       select PCI_MSI_IRQ_DOMAIN if PCI_MSI
+       select PCI_MSI if PCI
 
 config ALPINE_MSI
        bool
-       depends on PCI && PCI_MSI
+       depends on PCI
+       select PCI_MSI
        select GENERIC_IRQ_CHIP
-       select PCI_MSI_IRQ_DOMAIN
 
 config ATMEL_AIC_IRQ
        bool
@@ -117,7 +118,6 @@ config HISILICON_IRQ_MBIGEN
        bool
        select ARM_GIC_V3
        select ARM_GIC_V3_ITS
-       select GENERIC_MSI_IRQ_DOMAIN
 
 config IMGPDC_IRQ
        bool
@@ -250,12 +250,10 @@ config IRQ_MXS
 
 config MVEBU_ODMI
        bool
-       select GENERIC_MSI_IRQ_DOMAIN
 
 config LS_SCFG_MSI
        def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE
        depends on PCI && PCI_MSI
-       select PCI_MSI_IRQ_DOMAIN
 
 config PARTITION_PERCPU
        bool
index 1337123..4b4c0c3 100644 (file)
@@ -115,7 +115,7 @@ config FSL_CORENET_CF
 
 config FSL_IFC
        bool
-       depends on FSL_SOC
+       depends on FSL_SOC || ARCH_LAYERSCAPE
 
 config JZ4780_NEMC
        bool "Ingenic JZ4780 SoC NEMC driver"
index 904b4af..1b182b1 100644 (file)
@@ -31,7 +31,9 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/fsl_ifc.h>
-#include <asm/prom.h>
+#include <linux/irqdomain.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 
 struct fsl_ifc_ctrl *fsl_ifc_ctrl_dev;
 EXPORT_SYMBOL(fsl_ifc_ctrl_dev);
index 4cf8f82..a70b853 100644 (file)
@@ -182,7 +182,7 @@ static void genwqe_dev_free(struct genwqe_dev *cd)
  */
 static int genwqe_bus_reset(struct genwqe_dev *cd)
 {
-       int bars, rc = 0;
+       int rc = 0;
        struct pci_dev *pci_dev = cd->pci_dev;
        void __iomem *mmio;
 
@@ -193,8 +193,7 @@ static int genwqe_bus_reset(struct genwqe_dev *cd)
        cd->mmio = NULL;
        pci_iounmap(pci_dev, mmio);
 
-       bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
-       pci_release_selected_regions(pci_dev, bars);
+       pci_release_mem_regions(pci_dev);
 
        /*
         * Firmware/BIOS might change memory mapping during bus reset.
@@ -218,7 +217,7 @@ static int genwqe_bus_reset(struct genwqe_dev *cd)
                            GENWQE_INJECT_GFIR_FATAL |
                            GENWQE_INJECT_GFIR_INFO);
 
-       rc = pci_request_selected_regions(pci_dev, bars, genwqe_driver_name);
+       rc = pci_request_mem_regions(pci_dev, genwqe_driver_name);
        if (rc) {
                dev_err(&pci_dev->dev,
                        "[%s] err: request bars failed (%d)\n", __func__, rc);
@@ -1068,10 +1067,9 @@ static int genwqe_health_check_stop(struct genwqe_dev *cd)
  */
 static int genwqe_pci_setup(struct genwqe_dev *cd)
 {
-       int err, bars;
+       int err;
        struct pci_dev *pci_dev = cd->pci_dev;
 
-       bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
        err = pci_enable_device_mem(pci_dev);
        if (err) {
                dev_err(&pci_dev->dev,
@@ -1080,7 +1078,7 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
        }
 
        /* Reserve PCI I/O and memory resources */
-       err = pci_request_selected_regions(pci_dev, bars, genwqe_driver_name);
+       err = pci_request_mem_regions(pci_dev, genwqe_driver_name);
        if (err) {
                dev_err(&pci_dev->dev,
                        "[%s] err: request bars failed (%d)\n", __func__, err);
@@ -1142,7 +1140,7 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
  out_iounmap:
        pci_iounmap(pci_dev, cd->mmio);
  out_release_resources:
-       pci_release_selected_regions(pci_dev, bars);
+       pci_release_mem_regions(pci_dev);
  err_disable_device:
        pci_disable_device(pci_dev);
  err_out:
@@ -1154,14 +1152,12 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
  */
 static void genwqe_pci_remove(struct genwqe_dev *cd)
 {
-       int bars;
        struct pci_dev *pci_dev = cd->pci_dev;
 
        if (cd->mmio)
                pci_iounmap(pci_dev, cd->mmio);
 
-       bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
-       pci_release_selected_regions(pci_dev, bars);
+       pci_release_mem_regions(pci_dev);
        pci_disable_device(pci_dev);
 }
 
index 9a1a6ff..94d3eb4 100644 (file)
@@ -416,7 +416,7 @@ static int cfi_staa_read (struct mtd_info *mtd, loff_t from, size_t len, size_t
        return ret;
 }
 
-static inline int do_write_buffer(struct map_info *map, struct flchip *chip,
+static int do_write_buffer(struct map_info *map, struct flchip *chip,
                                  unsigned long adr, const u_char *buf, int len)
 {
        struct cfi_private *cfi = map->fldrv_priv;
index 64a2485..58329d2 100644 (file)
@@ -113,12 +113,12 @@ config MTD_SST25L
          if you want to specify device partitioning.
 
 config MTD_BCM47XXSFLASH
-       tristate "R/O support for serial flash on BCMA bus"
+       tristate "Support for serial flash on BCMA bus"
        depends on BCMA_SFLASH && (MIPS || ARM)
        help
          BCMA bus can have various flash memories attached, they are
          registered by bcma as platform devices. This enables driver for
-         serial flash memories (only read-only mode is implemented).
+         serial flash memories.
 
 config MTD_SLRAM
        tristate "Uncached system RAM"
@@ -171,18 +171,6 @@ config MTDRAM_ERASE_SIZE
          as a module, it is also possible to specify this as a parameter when
          loading the module.
 
-#If not a module (I don't want to test it as a module)
-config MTDRAM_ABS_POS
-       hex "SRAM Hexadecimal Absolute position or 0"
-       depends on MTD_MTDRAM=y
-       default "0"
-       help
-         If you have system RAM accessible by the CPU but not used by Linux
-         in normal operation, you can give the physical address at which the
-         available RAM starts, and the MTDRAM driver will use it instead of
-         allocating space from Linux's available memory. Otherwise, leave
-         this set to zero. Most people will want to leave this as zero.
-
 config MTD_BLOCK2MTD
        tristate "MTD using block device"
        depends on BLOCK
index 9d68544..9cf7fcd 100644 (file)
@@ -73,14 +73,15 @@ static int m25p80_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
        return spi_write(spi, flash->command, len + 1);
 }
 
-static void m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
-                       size_t *retlen, const u_char *buf)
+static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
+                           const u_char *buf)
 {
        struct m25p *flash = nor->priv;
        struct spi_device *spi = flash->spi;
        struct spi_transfer t[2] = {};
        struct spi_message m;
        int cmd_sz = m25p_cmdsz(nor);
+       ssize_t ret;
 
        spi_message_init(&m);
 
@@ -98,9 +99,14 @@ static void m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
        t[1].len = len;
        spi_message_add_tail(&t[1], &m);
 
-       spi_sync(spi, &m);
+       ret = spi_sync(spi, &m);
+       if (ret)
+               return ret;
 
-       *retlen += m.actual_length - cmd_sz;
+       ret = m.actual_length - cmd_sz;
+       if (ret < 0)
+               return -EIO;
+       return ret;
 }
 
 static inline unsigned int m25p80_rx_nbits(struct spi_nor *nor)
@@ -119,21 +125,21 @@ static inline unsigned int m25p80_rx_nbits(struct spi_nor *nor)
  * Read an address range from the nor chip.  The address range
  * may be any size provided it is within the physical boundaries.
  */
-static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
-                       size_t *retlen, u_char *buf)
+static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
+                          u_char *buf)
 {
        struct m25p *flash = nor->priv;
        struct spi_device *spi = flash->spi;
        struct spi_transfer t[2];
        struct spi_message m;
        unsigned int dummy = nor->read_dummy;
+       ssize_t ret;
 
        /* convert the dummy cycles to the number of bytes */
        dummy /= 8;
 
        if (spi_flash_read_supported(spi)) {
                struct spi_flash_read_message msg;
-               int ret;
 
                memset(&msg, 0, sizeof(msg));
 
@@ -149,8 +155,9 @@ static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
                msg.data_nbits = m25p80_rx_nbits(nor);
 
                ret = spi_flash_read(spi, &msg);
-               *retlen = msg.retlen;
-               return ret;
+               if (ret < 0)
+                       return ret;
+               return msg.retlen;
        }
 
        spi_message_init(&m);
@@ -165,13 +172,17 @@ static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
 
        t[1].rx_buf = buf;
        t[1].rx_nbits = m25p80_rx_nbits(nor);
-       t[1].len = len;
+       t[1].len = min(len, spi_max_transfer_size(spi));
        spi_message_add_tail(&t[1], &m);
 
-       spi_sync(spi, &m);
+       ret = spi_sync(spi, &m);
+       if (ret)
+               return ret;
 
-       *retlen = m.actual_length - m25p_cmdsz(nor) - dummy;
-       return 0;
+       ret = m.actual_length - m25p_cmdsz(nor) - dummy;
+       if (ret < 0)
+               return -EIO;
+       return ret;
 }
 
 /*
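
The m25p80 changes above track a spi-nor interface change in this series: the driver's read/write hooks now return the number of bytes actually transferred as a ssize_t (or a negative errno) instead of reporting progress through a *retlen pointer, and reads are clamped to spi_max_transfer_size(). A rough, illustrative sketch of how a caller can loop over such short transfers (this shows the assumed convention, not the spi-nor core verbatim):

	while (len) {
		ssize_t ret = nor->read(nor, from, len, buf);

		if (ret < 0)
			return ret;	/* hard error from the controller */
		if (ret == 0)
			return -EIO;	/* no forward progress */

		from += ret;
		buf += ret;
		len -= ret;
	}
	return 0;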
index 22f3858..3fad359 100644 (file)
@@ -186,7 +186,7 @@ static int of_flash_probe(struct platform_device *dev)
         * consists internally of 2 non-identical NOR chips on one die.
         */
        p = of_get_property(dp, "reg", &count);
-       if (count % reg_tuple_size != 0) {
+       if (!p || count % reg_tuple_size != 0) {
                dev_err(&dev->dev, "Malformed reg property on %s\n",
                                dev->dev.of_node->full_name);
                err = -EINVAL;
index 744ca5c..f9fa3fa 100644 (file)
@@ -75,15 +75,15 @@ static int __init init_msp_flash(void)
 
        printk(KERN_NOTICE "Found %d PMC flash devices\n", fcnt);
 
-       msp_flash = kmalloc(fcnt * sizeof(struct map_info *), GFP_KERNEL);
+       msp_flash = kcalloc(fcnt, sizeof(*msp_flash), GFP_KERNEL);
        if (!msp_flash)
                return -ENOMEM;
 
-       msp_parts = kmalloc(fcnt * sizeof(struct mtd_partition *), GFP_KERNEL);
+       msp_parts = kcalloc(fcnt, sizeof(*msp_parts), GFP_KERNEL);
        if (!msp_parts)
                goto free_msp_flash;
 
-       msp_maps = kcalloc(fcnt, sizeof(struct mtd_info), GFP_KERNEL);
+       msp_maps = kcalloc(fcnt, sizeof(*msp_maps), GFP_KERNEL);
        if (!msp_maps)
                goto free_msp_parts;
 
index 142fc3d..784c6e1 100644 (file)
@@ -230,8 +230,10 @@ static struct sa_info *sa1100_setup_mtd(struct platform_device *pdev,
 
                info->mtd = mtd_concat_create(cdev, info->num_subdev,
                                              plat->name);
-               if (info->mtd == NULL)
+               if (info->mtd == NULL) {
                        ret = -ENXIO;
+                       goto err;
+               }
        }
        info->mtd->dev.parent = &pdev->dev;
 
index f05e0e9..21ff580 100644 (file)
@@ -438,7 +438,7 @@ config MTD_NAND_FSL_ELBC
 
 config MTD_NAND_FSL_IFC
        tristate "NAND support for Freescale IFC controller"
-       depends on MTD_NAND && FSL_SOC
+       depends on MTD_NAND && (FSL_SOC || ARCH_LAYERSCAPE)
        select FSL_IFC
        select MEMORY
        help
@@ -539,7 +539,6 @@ config MTD_NAND_FSMC
 config MTD_NAND_XWAY
        tristate "Support for NAND on Lantiq XWAY SoC"
        depends on LANTIQ && SOC_TYPE_XWAY
-       select MTD_NAND_PLATFORM
        help
          Enables support for NAND Flash chips on Lantiq XWAY SoCs. NAND is attached
          to the External Bus Unit (EBU).
@@ -563,4 +562,11 @@ config MTD_NAND_QCOM
          Enables support for NAND flash chips on SoCs containing the EBI2 NAND
          controller. This controller is found on IPQ806x SoC.
 
+config MTD_NAND_MTK
+       tristate "Support for NAND controller on MTK SoCs"
+       depends on HAS_DMA
+       help
+         Enables support for NAND controller on MTK SoCs.
+         This controller is found on mt27xx, mt81xx, mt65xx SoCs.
+
 endif # MTD_NAND
index f553353..cafde6f 100644 (file)
@@ -57,5 +57,6 @@ obj-$(CONFIG_MTD_NAND_SUNXI)          += sunxi_nand.o
 obj-$(CONFIG_MTD_NAND_HISI504)         += hisi504_nand.o
 obj-$(CONFIG_MTD_NAND_BRCMNAND)                += brcmnand/
 obj-$(CONFIG_MTD_NAND_QCOM)            += qcom_nandc.o
+obj-$(CONFIG_MTD_NAND_MTK)             += mtk_nand.o mtk_ecc.o
 
 nand-objs := nand_base.o nand_bbt.o nand_timings.o
index b76ad7c..8eb2c64 100644 (file)
@@ -340,6 +340,36 @@ static const u16 brcmnand_regs_v71[] = {
        [BRCMNAND_FC_BASE]              = 0x400,
 };
 
+/* BRCMNAND v7.2 */
+static const u16 brcmnand_regs_v72[] = {
+       [BRCMNAND_CMD_START]            =  0x04,
+       [BRCMNAND_CMD_EXT_ADDRESS]      =  0x08,
+       [BRCMNAND_CMD_ADDRESS]          =  0x0c,
+       [BRCMNAND_INTFC_STATUS]         =  0x14,
+       [BRCMNAND_CS_SELECT]            =  0x18,
+       [BRCMNAND_CS_XOR]               =  0x1c,
+       [BRCMNAND_LL_OP]                =  0x20,
+       [BRCMNAND_CS0_BASE]             =  0x50,
+       [BRCMNAND_CS1_BASE]             =     0,
+       [BRCMNAND_CORR_THRESHOLD]       =  0xdc,
+       [BRCMNAND_CORR_THRESHOLD_EXT]   =  0xe0,
+       [BRCMNAND_UNCORR_COUNT]         =  0xfc,
+       [BRCMNAND_CORR_COUNT]           = 0x100,
+       [BRCMNAND_CORR_EXT_ADDR]        = 0x10c,
+       [BRCMNAND_CORR_ADDR]            = 0x110,
+       [BRCMNAND_UNCORR_EXT_ADDR]      = 0x114,
+       [BRCMNAND_UNCORR_ADDR]          = 0x118,
+       [BRCMNAND_SEMAPHORE]            = 0x150,
+       [BRCMNAND_ID]                   = 0x194,
+       [BRCMNAND_ID_EXT]               = 0x198,
+       [BRCMNAND_LL_RDATA]             = 0x19c,
+       [BRCMNAND_OOB_READ_BASE]        = 0x200,
+       [BRCMNAND_OOB_READ_10_BASE]     =     0,
+       [BRCMNAND_OOB_WRITE_BASE]       = 0x400,
+       [BRCMNAND_OOB_WRITE_10_BASE]    =     0,
+       [BRCMNAND_FC_BASE]              = 0x600,
+};
+
 enum brcmnand_cs_reg {
        BRCMNAND_CS_CFG_EXT = 0,
        BRCMNAND_CS_CFG,
@@ -435,7 +465,9 @@ static int brcmnand_revision_init(struct brcmnand_controller *ctrl)
        }
 
        /* Register offsets */
-       if (ctrl->nand_version >= 0x0701)
+       if (ctrl->nand_version >= 0x0702)
+               ctrl->reg_offsets = brcmnand_regs_v72;
+       else if (ctrl->nand_version >= 0x0701)
                ctrl->reg_offsets = brcmnand_regs_v71;
        else if (ctrl->nand_version >= 0x0600)
                ctrl->reg_offsets = brcmnand_regs_v60;
@@ -480,7 +512,9 @@ static int brcmnand_revision_init(struct brcmnand_controller *ctrl)
        }
 
        /* Maximum spare area sector size (per 512B) */
-       if (ctrl->nand_version >= 0x0600)
+       if (ctrl->nand_version >= 0x0702)
+               ctrl->max_oob = 128;
+       else if (ctrl->nand_version >= 0x0600)
                ctrl->max_oob = 64;
        else if (ctrl->nand_version >= 0x0500)
                ctrl->max_oob = 32;
@@ -583,14 +617,20 @@ static void brcmnand_wr_corr_thresh(struct brcmnand_host *host, u8 val)
        enum brcmnand_reg reg = BRCMNAND_CORR_THRESHOLD;
        int cs = host->cs;
 
-       if (ctrl->nand_version >= 0x0600)
+       if (ctrl->nand_version >= 0x0702)
+               bits = 7;
+       else if (ctrl->nand_version >= 0x0600)
                bits = 6;
        else if (ctrl->nand_version >= 0x0500)
                bits = 5;
        else
                bits = 4;
 
-       if (ctrl->nand_version >= 0x0600) {
+       if (ctrl->nand_version >= 0x0702) {
+               if (cs >= 4)
+                       reg = BRCMNAND_CORR_THRESHOLD_EXT;
+               shift = (cs % 4) * bits;
+       } else if (ctrl->nand_version >= 0x0600) {
                if (cs >= 5)
                        reg = BRCMNAND_CORR_THRESHOLD_EXT;
                shift = (cs % 5) * bits;
@@ -631,19 +671,28 @@ enum {
 
 static inline u32 brcmnand_spare_area_mask(struct brcmnand_controller *ctrl)
 {
-       if (ctrl->nand_version >= 0x0600)
+       if (ctrl->nand_version >= 0x0702)
+               return GENMASK(7, 0);
+       else if (ctrl->nand_version >= 0x0600)
                return GENMASK(6, 0);
        else
                return GENMASK(5, 0);
 }
 
 #define NAND_ACC_CONTROL_ECC_SHIFT     16
+#define NAND_ACC_CONTROL_ECC_EXT_SHIFT 13
 
 static inline u32 brcmnand_ecc_level_mask(struct brcmnand_controller *ctrl)
 {
        u32 mask = (ctrl->nand_version >= 0x0600) ? 0x1f : 0x0f;
 
-       return mask << NAND_ACC_CONTROL_ECC_SHIFT;
+       mask <<= NAND_ACC_CONTROL_ECC_SHIFT;
+
+       /* v7.2 includes additional ECC levels */
+       if (ctrl->nand_version >= 0x0702)
+               mask |= 0x7 << NAND_ACC_CONTROL_ECC_EXT_SHIFT;
+
+       return mask;
 }
 
 static void brcmnand_set_ecc_enabled(struct brcmnand_host *host, int en)
@@ -667,7 +716,9 @@ static void brcmnand_set_ecc_enabled(struct brcmnand_host *host, int en)
 
 static inline int brcmnand_sector_1k_shift(struct brcmnand_controller *ctrl)
 {
-       if (ctrl->nand_version >= 0x0600)
+       if (ctrl->nand_version >= 0x0702)
+               return 9;
+       else if (ctrl->nand_version >= 0x0600)
                return 7;
        else if (ctrl->nand_version >= 0x0500)
                return 6;
@@ -773,10 +824,16 @@ enum brcmnand_llop_type {
  * Internal support functions
  ***********************************************************************/
 
-static inline bool is_hamming_ecc(struct brcmnand_cfg *cfg)
+static inline bool is_hamming_ecc(struct brcmnand_controller *ctrl,
+                                 struct brcmnand_cfg *cfg)
 {
-       return cfg->sector_size_1k == 0 && cfg->spare_area_size == 16 &&
-               cfg->ecc_level == 15;
+       if (ctrl->nand_version <= 0x0701)
+               return cfg->sector_size_1k == 0 && cfg->spare_area_size == 16 &&
+                       cfg->ecc_level == 15;
+       else
+               return cfg->sector_size_1k == 0 && ((cfg->spare_area_size == 16 &&
+                       cfg->ecc_level == 15) ||
+                       (cfg->spare_area_size == 28 && cfg->ecc_level == 16));
 }
 
 /*
@@ -931,7 +988,7 @@ static int brcmstb_choose_ecc_layout(struct brcmnand_host *host)
        if (p->sector_size_1k)
                ecc_level <<= 1;
 
-       if (is_hamming_ecc(p)) {
+       if (is_hamming_ecc(host->ctrl, p)) {
                ecc->bytes = 3 * sectors;
                mtd_set_ooblayout(mtd, &brcmnand_hamming_ooblayout_ops);
                return 0;
@@ -1108,7 +1165,7 @@ static void brcmnand_send_cmd(struct brcmnand_host *host, int cmd)
        ctrl->cmd_pending = cmd;
 
        intfc = brcmnand_read_reg(ctrl, BRCMNAND_INTFC_STATUS);
-       BUG_ON(!(intfc & INTFC_CTLR_READY));
+       WARN_ON(!(intfc & INTFC_CTLR_READY));
 
        mb(); /* flush previous writes */
        brcmnand_write_reg(ctrl, BRCMNAND_CMD_START,
@@ -1545,6 +1602,56 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
        return ret;
 }
 
+/*
+ * Check a page to see if it is erased (w/ bitflips) after an uncorrectable ECC
+ * error
+ *
+ * Because the HW ECC signals an ECC error if an erased page has even a single
+ * bitflip, we must check each ECC error to see if it is actually an erased
+ * page with bitflips, not a truly corrupted page.
+ *
+ * On a real error, return a negative error code (-EBADMSG for ECC error), and
+ * buf will contain raw data.
+ * Otherwise, buf is filled with 0xffs and the maximum number of
+ * bitflips-per-ECC-sector is returned to the caller.
+ *
+ */
+static int brcmstb_nand_verify_erased_page(struct mtd_info *mtd,
+                 struct nand_chip *chip, void *buf, u64 addr)
+{
+       int i, sas;
+       void *oob = chip->oob_poi;
+       int bitflips = 0;
+       int page = addr >> chip->page_shift;
+       int ret;
+
+       if (!buf) {
+               buf = chip->buffers->databuf;
+               /* Invalidate page cache */
+               chip->pagebuf = -1;
+       }
+
+       sas = mtd->oobsize / chip->ecc.steps;
+
+       /* read without ecc for verification */
+       chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
+       ret = chip->ecc.read_page_raw(mtd, chip, buf, true, page);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < chip->ecc.steps; i++, oob += sas) {
+               ret = nand_check_erased_ecc_chunk(buf, chip->ecc.size,
+                                                 oob, sas, NULL, 0,
+                                                 chip->ecc.strength);
+               if (ret < 0)
+                       return ret;
+
+               bitflips = max(bitflips, ret);
+       }
+
+       return bitflips;
+}
+
 static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
                         u64 addr, unsigned int trans, u32 *buf, u8 *oob)
 {
@@ -1552,9 +1659,11 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
        struct brcmnand_controller *ctrl = host->ctrl;
        u64 err_addr = 0;
        int err;
+       bool retry = true;
 
        dev_dbg(ctrl->dev, "read %llx -> %p\n", (unsigned long long)addr, buf);
 
+try_dmaread:
        brcmnand_write_reg(ctrl, BRCMNAND_UNCORR_COUNT, 0);
 
        if (has_flash_dma(ctrl) && !oob && flash_dma_buf_ok(buf)) {
@@ -1575,6 +1684,34 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
        }
 
        if (mtd_is_eccerr(err)) {
+               /*
+                * On controller versions 7.0 and 7.1, a DMA read issued after
+                * a PIO read that reported an uncorrectable error can capture
+                * that stale error on the following DMA read; the condition is
+                * cleared only by a subsequent DMA read. Retry once to clear
+                * a possible false error reported for the current DMA
+                * read.
+                */
+               if ((ctrl->nand_version == 0x0700) ||
+                   (ctrl->nand_version == 0x0701)) {
+                       if (retry) {
+                               retry = false;
+                               goto try_dmaread;
+                       }
+               }
+
+               /*
+                * Controller version 7.2 has hw encoder to detect erased page
+                * bitflips; apply sw verification for older controllers only
+                */
+               if (ctrl->nand_version < 0x0702) {
+                       err = brcmstb_nand_verify_erased_page(mtd, chip, buf,
+                                                             addr);
+                       /* erased page bitflips corrected */
+                       if (err > 0)
+                               return err;
+               }
+
                dev_dbg(ctrl->dev, "uncorrectable error at 0x%llx\n",
                        (unsigned long long)err_addr);
                mtd->ecc_stats.failed++;
@@ -1857,7 +1994,8 @@ static int brcmnand_set_cfg(struct brcmnand_host *host,
        return 0;
 }
 
-static void brcmnand_print_cfg(char *buf, struct brcmnand_cfg *cfg)
+static void brcmnand_print_cfg(struct brcmnand_host *host,
+                              char *buf, struct brcmnand_cfg *cfg)
 {
        buf += sprintf(buf,
                "%lluMiB total, %uKiB blocks, %u%s pages, %uB OOB, %u-bit",
@@ -1868,7 +2006,7 @@ static void brcmnand_print_cfg(char *buf, struct brcmnand_cfg *cfg)
                cfg->spare_area_size, cfg->device_width);
 
        /* Account for Hamming ECC and for BCH 512B vs 1KiB sectors */
-       if (is_hamming_ecc(cfg))
+       if (is_hamming_ecc(host->ctrl, cfg))
                sprintf(buf, ", Hamming ECC");
        else if (cfg->sector_size_1k)
                sprintf(buf, ", BCH-%u (1KiB sector)", cfg->ecc_level << 1);
@@ -1987,7 +2125,7 @@ static int brcmnand_setup_dev(struct brcmnand_host *host)
 
        brcmnand_set_ecc_enabled(host, 1);
 
-       brcmnand_print_cfg(msg, cfg);
+       brcmnand_print_cfg(host, msg, cfg);
        dev_info(ctrl->dev, "detected %s\n", msg);
 
        /* Configure ACC_CONTROL */
@@ -1995,6 +2133,10 @@ static int brcmnand_setup_dev(struct brcmnand_host *host)
        tmp = nand_readreg(ctrl, offs);
        tmp &= ~ACC_CONTROL_PARTIAL_PAGE;
        tmp &= ~ACC_CONTROL_RD_ERASED;
+
+       /* We need to turn on reads from erased pages protected by ECC */
+       if (ctrl->nand_version >= 0x0702)
+               tmp |= ACC_CONTROL_RD_ERASED;
        tmp &= ~ACC_CONTROL_FAST_PGM_RDIN;
        if (ctrl->features & BRCMNAND_HAS_PREFETCH) {
                /*
@@ -2195,6 +2337,7 @@ static const struct of_device_id brcmnand_of_match[] = {
        { .compatible = "brcm,brcmnand-v6.2" },
        { .compatible = "brcm,brcmnand-v7.0" },
        { .compatible = "brcm,brcmnand-v7.1" },
+       { .compatible = "brcm,brcmnand-v7.2" },
        {},
 };
 MODULE_DEVICE_TABLE(of, brcmnand_of_match);
index d74f4ba..731c605 100644 (file)
@@ -375,6 +375,6 @@ static struct platform_driver jz4780_bch_driver = {
 module_platform_driver(jz4780_bch_driver);
 
 MODULE_AUTHOR("Alex Smith <alex@alex-smith.me.uk>");
-MODULE_AUTHOR("Harvey Hunt <harvey.hunt@imgtec.com>");
+MODULE_AUTHOR("Harvey Hunt <harveyhuntnexus@gmail.com>");
 MODULE_DESCRIPTION("Ingenic JZ4780 BCH error correction driver");
 MODULE_LICENSE("GPL v2");
index daf3c42..175f67d 100644 (file)
@@ -412,6 +412,6 @@ static struct platform_driver jz4780_nand_driver = {
 module_platform_driver(jz4780_nand_driver);
 
 MODULE_AUTHOR("Alex Smith <alex@alex-smith.me.uk>");
-MODULE_AUTHOR("Harvey Hunt <harvey.hunt@imgtec.com>");
+MODULE_AUTHOR("Harvey Hunt <harveyhuntnexus@gmail.com>");
 MODULE_DESCRIPTION("Ingenic JZ4780 NAND driver");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/nand/mtk_ecc.c b/drivers/mtd/nand/mtk_ecc.c
new file mode 100644 (file)
index 0000000..25a4fbd
--- /dev/null
@@ -0,0 +1,530 @@
+/*
+ * MTK ECC controller driver.
+ * Copyright (C) 2016  MediaTek Inc.
+ * Authors:    Xiaolei Li              <xiaolei.li@mediatek.com>
+ *             Jorge Ramirez-Ortiz     <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/clk.h>
+#include <linux/module.h>
+#include <linux/iopoll.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/mutex.h>
+
+#include "mtk_ecc.h"
+
+#define ECC_IDLE_MASK          BIT(0)
+#define ECC_IRQ_EN             BIT(0)
+#define ECC_OP_ENABLE          (1)
+#define ECC_OP_DISABLE         (0)
+
+#define ECC_ENCCON             (0x00)
+#define ECC_ENCCNFG            (0x04)
+#define                ECC_CNFG_4BIT           (0)
+#define                ECC_CNFG_6BIT           (1)
+#define                ECC_CNFG_8BIT           (2)
+#define                ECC_CNFG_10BIT          (3)
+#define                ECC_CNFG_12BIT          (4)
+#define                ECC_CNFG_14BIT          (5)
+#define                ECC_CNFG_16BIT          (6)
+#define                ECC_CNFG_18BIT          (7)
+#define                ECC_CNFG_20BIT          (8)
+#define                ECC_CNFG_22BIT          (9)
+#define                ECC_CNFG_24BIT          (0xa)
+#define                ECC_CNFG_28BIT          (0xb)
+#define                ECC_CNFG_32BIT          (0xc)
+#define                ECC_CNFG_36BIT          (0xd)
+#define                ECC_CNFG_40BIT          (0xe)
+#define                ECC_CNFG_44BIT          (0xf)
+#define                ECC_CNFG_48BIT          (0x10)
+#define                ECC_CNFG_52BIT          (0x11)
+#define                ECC_CNFG_56BIT          (0x12)
+#define                ECC_CNFG_60BIT          (0x13)
+#define                ECC_MODE_SHIFT          (5)
+#define                ECC_MS_SHIFT            (16)
+#define ECC_ENCDIADDR          (0x08)
+#define ECC_ENCIDLE            (0x0C)
+#define ECC_ENCPAR(x)          (0x10 + (x) * sizeof(u32))
+#define ECC_ENCIRQ_EN          (0x80)
+#define ECC_ENCIRQ_STA         (0x84)
+#define ECC_DECCON             (0x100)
+#define ECC_DECCNFG            (0x104)
+#define                DEC_EMPTY_EN            BIT(31)
+#define                DEC_CNFG_CORRECT        (0x3 << 12)
+#define ECC_DECIDLE            (0x10C)
+#define ECC_DECENUM0           (0x114)
+#define                ERR_MASK                (0x3f)
+#define ECC_DECDONE            (0x124)
+#define ECC_DECIRQ_EN          (0x200)
+#define ECC_DECIRQ_STA         (0x204)
+
+#define ECC_TIMEOUT            (500000)
+
+#define ECC_IDLE_REG(op)       ((op) == ECC_ENCODE ? ECC_ENCIDLE : ECC_DECIDLE)
+#define ECC_CTL_REG(op)                ((op) == ECC_ENCODE ? ECC_ENCCON : ECC_DECCON)
+#define ECC_IRQ_REG(op)                ((op) == ECC_ENCODE ? \
+                                       ECC_ENCIRQ_EN : ECC_DECIRQ_EN)
+
+struct mtk_ecc {
+       struct device *dev;
+       void __iomem *regs;
+       struct clk *clk;
+
+       struct completion done;
+       struct mutex lock;
+       u32 sectors;
+};
+
+static inline void mtk_ecc_wait_idle(struct mtk_ecc *ecc,
+                                    enum mtk_ecc_operation op)
+{
+       struct device *dev = ecc->dev;
+       u32 val;
+       int ret;
+
+       ret = readl_poll_timeout_atomic(ecc->regs + ECC_IDLE_REG(op), val,
+                                       val & ECC_IDLE_MASK,
+                                       10, ECC_TIMEOUT);
+       if (ret)
+               dev_warn(dev, "%s NOT idle\n",
+                        op == ECC_ENCODE ? "encoder" : "decoder");
+}
+
+static irqreturn_t mtk_ecc_irq(int irq, void *id)
+{
+       struct mtk_ecc *ecc = id;
+       enum mtk_ecc_operation op;
+       u32 dec, enc;
+
+       dec = readw(ecc->regs + ECC_DECIRQ_STA) & ECC_IRQ_EN;
+       if (dec) {
+               op = ECC_DECODE;
+               dec = readw(ecc->regs + ECC_DECDONE);
+               if (dec & ecc->sectors) {
+                       ecc->sectors = 0;
+                       complete(&ecc->done);
+               } else {
+                       return IRQ_HANDLED;
+               }
+       } else {
+               enc = readl(ecc->regs + ECC_ENCIRQ_STA) & ECC_IRQ_EN;
+               if (enc) {
+                       op = ECC_ENCODE;
+                       complete(&ecc->done);
+               } else {
+                       return IRQ_NONE;
+               }
+       }
+
+       writel(0, ecc->regs + ECC_IRQ_REG(op));
+
+       return IRQ_HANDLED;
+}
+
+static void mtk_ecc_config(struct mtk_ecc *ecc, struct mtk_ecc_config *config)
+{
+       u32 ecc_bit = ECC_CNFG_4BIT, dec_sz, enc_sz;
+       u32 reg;
+
+       switch (config->strength) {
+       case 4:
+               ecc_bit = ECC_CNFG_4BIT;
+               break;
+       case 6:
+               ecc_bit = ECC_CNFG_6BIT;
+               break;
+       case 8:
+               ecc_bit = ECC_CNFG_8BIT;
+               break;
+       case 10:
+               ecc_bit = ECC_CNFG_10BIT;
+               break;
+       case 12:
+               ecc_bit = ECC_CNFG_12BIT;
+               break;
+       case 14:
+               ecc_bit = ECC_CNFG_14BIT;
+               break;
+       case 16:
+               ecc_bit = ECC_CNFG_16BIT;
+               break;
+       case 18:
+               ecc_bit = ECC_CNFG_18BIT;
+               break;
+       case 20:
+               ecc_bit = ECC_CNFG_20BIT;
+               break;
+       case 22:
+               ecc_bit = ECC_CNFG_22BIT;
+               break;
+       case 24:
+               ecc_bit = ECC_CNFG_24BIT;
+               break;
+       case 28:
+               ecc_bit = ECC_CNFG_28BIT;
+               break;
+       case 32:
+               ecc_bit = ECC_CNFG_32BIT;
+               break;
+       case 36:
+               ecc_bit = ECC_CNFG_36BIT;
+               break;
+       case 40:
+               ecc_bit = ECC_CNFG_40BIT;
+               break;
+       case 44:
+               ecc_bit = ECC_CNFG_44BIT;
+               break;
+       case 48:
+               ecc_bit = ECC_CNFG_48BIT;
+               break;
+       case 52:
+               ecc_bit = ECC_CNFG_52BIT;
+               break;
+       case 56:
+               ecc_bit = ECC_CNFG_56BIT;
+               break;
+       case 60:
+               ecc_bit = ECC_CNFG_60BIT;
+               break;
+       default:
+               dev_err(ecc->dev, "invalid strength %d, default to 4 bits\n",
+                       config->strength);
+       }
+
+       if (config->op == ECC_ENCODE) {
+               /* configure ECC encoder (in bits) */
+               enc_sz = config->len << 3;
+
+               reg = ecc_bit | (config->mode << ECC_MODE_SHIFT);
+               reg |= (enc_sz << ECC_MS_SHIFT);
+               writel(reg, ecc->regs + ECC_ENCCNFG);
+
+               if (config->mode != ECC_NFI_MODE)
+                       writel(lower_32_bits(config->addr),
+                              ecc->regs + ECC_ENCDIADDR);
+
+       } else {
+               /* configure ECC decoder (in bits) */
+               dec_sz = (config->len << 3) +
+                                       config->strength * ECC_PARITY_BITS;
+
+               reg = ecc_bit | (config->mode << ECC_MODE_SHIFT);
+               reg |= (dec_sz << ECC_MS_SHIFT) | DEC_CNFG_CORRECT;
+               reg |= DEC_EMPTY_EN;
+               writel(reg, ecc->regs + ECC_DECCNFG);
+
+               if (config->sectors)
+                       ecc->sectors = 1 << (config->sectors - 1);
+       }
+}
+
+void mtk_ecc_get_stats(struct mtk_ecc *ecc, struct mtk_ecc_stats *stats,
+                      int sectors)
+{
+       u32 offset, i, err;
+       u32 bitflips = 0;
+
+       stats->corrected = 0;
+       stats->failed = 0;
+
+       for (i = 0; i < sectors; i++) {
+               offset = (i >> 2) << 2;
+               err = readl(ecc->regs + ECC_DECENUM0 + offset);
+               err = err >> ((i % 4) * 8);
+               err &= ERR_MASK;
+               if (err == ERR_MASK) {
+                       /* uncorrectable errors */
+                       stats->failed++;
+                       continue;
+               }
+
+               stats->corrected += err;
+               bitflips = max_t(u32, bitflips, err);
+       }
+
+       stats->bitflips = bitflips;
+}
+EXPORT_SYMBOL(mtk_ecc_get_stats);
+
+void mtk_ecc_release(struct mtk_ecc *ecc)
+{
+       clk_disable_unprepare(ecc->clk);
+       put_device(ecc->dev);
+}
+EXPORT_SYMBOL(mtk_ecc_release);
+
+static void mtk_ecc_hw_init(struct mtk_ecc *ecc)
+{
+       mtk_ecc_wait_idle(ecc, ECC_ENCODE);
+       writew(ECC_OP_DISABLE, ecc->regs + ECC_ENCCON);
+
+       mtk_ecc_wait_idle(ecc, ECC_DECODE);
+       writel(ECC_OP_DISABLE, ecc->regs + ECC_DECCON);
+}
+
+static struct mtk_ecc *mtk_ecc_get(struct device_node *np)
+{
+       struct platform_device *pdev;
+       struct mtk_ecc *ecc;
+
+       pdev = of_find_device_by_node(np);
+       if (!pdev || !platform_get_drvdata(pdev))
+               return ERR_PTR(-EPROBE_DEFER);
+
+       get_device(&pdev->dev);
+       ecc = platform_get_drvdata(pdev);
+       clk_prepare_enable(ecc->clk);
+       mtk_ecc_hw_init(ecc);
+
+       return ecc;
+}
+
+struct mtk_ecc *of_mtk_ecc_get(struct device_node *of_node)
+{
+       struct mtk_ecc *ecc = NULL;
+       struct device_node *np;
+
+       np = of_parse_phandle(of_node, "ecc-engine", 0);
+       if (np) {
+               ecc = mtk_ecc_get(np);
+               of_node_put(np);
+       }
+
+       return ecc;
+}
+EXPORT_SYMBOL(of_mtk_ecc_get);
+
+int mtk_ecc_enable(struct mtk_ecc *ecc, struct mtk_ecc_config *config)
+{
+       enum mtk_ecc_operation op = config->op;
+       int ret;
+
+       ret = mutex_lock_interruptible(&ecc->lock);
+       if (ret) {
+               dev_err(ecc->dev, "interrupted when attempting to lock\n");
+               return ret;
+       }
+
+       mtk_ecc_wait_idle(ecc, op);
+       mtk_ecc_config(ecc, config);
+       writew(ECC_OP_ENABLE, ecc->regs + ECC_CTL_REG(op));
+
+       init_completion(&ecc->done);
+       writew(ECC_IRQ_EN, ecc->regs + ECC_IRQ_REG(op));
+
+       return 0;
+}
+EXPORT_SYMBOL(mtk_ecc_enable);
+
+void mtk_ecc_disable(struct mtk_ecc *ecc)
+{
+       enum mtk_ecc_operation op = ECC_ENCODE;
+
+       /* find out the running operation */
+       if (readw(ecc->regs + ECC_CTL_REG(op)) != ECC_OP_ENABLE)
+               op = ECC_DECODE;
+
+       /* disable it */
+       mtk_ecc_wait_idle(ecc, op);
+       writew(0, ecc->regs + ECC_IRQ_REG(op));
+       writew(ECC_OP_DISABLE, ecc->regs + ECC_CTL_REG(op));
+
+       mutex_unlock(&ecc->lock);
+}
+EXPORT_SYMBOL(mtk_ecc_disable);
+
+int mtk_ecc_wait_done(struct mtk_ecc *ecc, enum mtk_ecc_operation op)
+{
+       int ret;
+
+       ret = wait_for_completion_timeout(&ecc->done, msecs_to_jiffies(500));
+       if (!ret) {
+               dev_err(ecc->dev, "%s timeout - interrupt did not arrive\n",
+                       (op == ECC_ENCODE) ? "encoder" : "decoder");
+               return -ETIMEDOUT;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(mtk_ecc_wait_done);
+
+int mtk_ecc_encode(struct mtk_ecc *ecc, struct mtk_ecc_config *config,
+                  u8 *data, u32 bytes)
+{
+       dma_addr_t addr;
+       u32 *p, len, i;
+       int ret = 0;
+
+       addr = dma_map_single(ecc->dev, data, bytes, DMA_TO_DEVICE);
+       ret = dma_mapping_error(ecc->dev, addr);
+       if (ret) {
+               dev_err(ecc->dev, "dma mapping error\n");
+               return -EINVAL;
+       }
+
+       config->op = ECC_ENCODE;
+       config->addr = addr;
+       ret = mtk_ecc_enable(ecc, config);
+       if (ret) {
+               dma_unmap_single(ecc->dev, addr, bytes, DMA_TO_DEVICE);
+               return ret;
+       }
+
+       ret = mtk_ecc_wait_done(ecc, ECC_ENCODE);
+       if (ret)
+               goto timeout;
+
+       mtk_ecc_wait_idle(ecc, ECC_ENCODE);
+
+       /* Program ECC bytes to OOB: per sector oob = FDM + ECC + SPARE */
+       len = (config->strength * ECC_PARITY_BITS + 7) >> 3;
+       p = (u32 *)(data + bytes);
+
+       /* write the parity bytes generated by the ECC back to the OOB region */
+       for (i = 0; i < len; i++)
+               p[i] = readl(ecc->regs + ECC_ENCPAR(i));
+timeout:
+
+       dma_unmap_single(ecc->dev, addr, bytes, DMA_TO_DEVICE);
+       mtk_ecc_disable(ecc);
+
+       return ret;
+}
+EXPORT_SYMBOL(mtk_ecc_encode);
+
+void mtk_ecc_adjust_strength(u32 *p)
+{
+       u32 ecc[] = {4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 28, 32, 36,
+                       40, 44, 48, 52, 56, 60};
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(ecc); i++) {
+               if (*p <= ecc[i]) {
+                       if (!i)
+                               *p = ecc[i];
+                       else if (*p != ecc[i])
+                               *p = ecc[i - 1];
+                       return;
+               }
+       }
+
+       *p = ecc[ARRAY_SIZE(ecc) - 1];
+}
+EXPORT_SYMBOL(mtk_ecc_adjust_strength);
+
+static int mtk_ecc_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct mtk_ecc *ecc;
+       struct resource *res;
+       int irq, ret;
+
+       ecc = devm_kzalloc(dev, sizeof(*ecc), GFP_KERNEL);
+       if (!ecc)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       ecc->regs = devm_ioremap_resource(dev, res);
+       if (IS_ERR(ecc->regs)) {
+               dev_err(dev, "failed to map regs: %ld\n", PTR_ERR(ecc->regs));
+               return PTR_ERR(ecc->regs);
+       }
+
+       ecc->clk = devm_clk_get(dev, NULL);
+       if (IS_ERR(ecc->clk)) {
+               dev_err(dev, "failed to get clock: %ld\n", PTR_ERR(ecc->clk));
+               return PTR_ERR(ecc->clk);
+       }
+
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0) {
+               dev_err(dev, "failed to get irq\n");
+               return -EINVAL;
+       }
+
+       ret = dma_set_mask(dev, DMA_BIT_MASK(32));
+       if (ret) {
+               dev_err(dev, "failed to set DMA mask\n");
+               return ret;
+       }
+
+       ret = devm_request_irq(dev, irq, mtk_ecc_irq, 0x0, "mtk-ecc", ecc);
+       if (ret) {
+               dev_err(dev, "failed to request irq\n");
+               return -EINVAL;
+       }
+
+       ecc->dev = dev;
+       mutex_init(&ecc->lock);
+       platform_set_drvdata(pdev, ecc);
+       dev_info(dev, "probed\n");
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int mtk_ecc_suspend(struct device *dev)
+{
+       struct mtk_ecc *ecc = dev_get_drvdata(dev);
+
+       clk_disable_unprepare(ecc->clk);
+
+       return 0;
+}
+
+static int mtk_ecc_resume(struct device *dev)
+{
+       struct mtk_ecc *ecc = dev_get_drvdata(dev);
+       int ret;
+
+       ret = clk_prepare_enable(ecc->clk);
+       if (ret) {
+               dev_err(dev, "failed to enable clk\n");
+               return ret;
+       }
+
+       mtk_ecc_hw_init(ecc);
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(mtk_ecc_pm_ops, mtk_ecc_suspend, mtk_ecc_resume);
+#endif
+
+static const struct of_device_id mtk_ecc_dt_match[] = {
+       { .compatible = "mediatek,mt2701-ecc" },
+       {},
+};
+
+MODULE_DEVICE_TABLE(of, mtk_ecc_dt_match);
+
+static struct platform_driver mtk_ecc_driver = {
+       .probe  = mtk_ecc_probe,
+       .driver = {
+               .name  = "mtk-ecc",
+               .of_match_table = of_match_ptr(mtk_ecc_dt_match),
+#ifdef CONFIG_PM_SLEEP
+               .pm = &mtk_ecc_pm_ops,
+#endif
+       },
+};
+
+module_platform_driver(mtk_ecc_driver);
+
+MODULE_AUTHOR("Xiaolei Li <xiaolei.li@mediatek.com>");
+MODULE_DESCRIPTION("MTK Nand ECC Driver");
+MODULE_LICENSE("GPL");
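
Note on mtk_ecc_adjust_strength() above: it clamps a requested correction strength to the table of strengths the engine supports, rounding down, except below the minimum where it rounds up to 4 and above the maximum where it caps at 60. A small usage sketch derived from the table in the code:

	u32 strength = 30;

	mtk_ecc_adjust_strength(&strength);
	/* strength is now 28: the nearest supported value not above 30.  */
	/* A request of 3 would become 4; anything above 60 becomes 60.   */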
diff --git a/drivers/mtd/nand/mtk_ecc.h b/drivers/mtd/nand/mtk_ecc.h
new file mode 100644 (file)
index 0000000..cbeba5c
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * MTK SDG1 ECC controller
+ *
+ * Copyright (c) 2016 Mediatek
+ * Authors:    Xiaolei Li              <xiaolei.li@mediatek.com>
+ *             Jorge Ramirez-Ortiz     <jorge.ramirez-ortiz@linaro.org>
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef __DRIVERS_MTD_NAND_MTK_ECC_H__
+#define __DRIVERS_MTD_NAND_MTK_ECC_H__
+
+#include <linux/types.h>
+
+#define ECC_PARITY_BITS                (14)
+
+enum mtk_ecc_mode {ECC_DMA_MODE = 0, ECC_NFI_MODE = 1};
+enum mtk_ecc_operation {ECC_ENCODE, ECC_DECODE};
+
+struct device_node;
+struct mtk_ecc;
+
+struct mtk_ecc_stats {
+       u32 corrected;
+       u32 bitflips;
+       u32 failed;
+};
+
+struct mtk_ecc_config {
+       enum mtk_ecc_operation op;
+       enum mtk_ecc_mode mode;
+       dma_addr_t addr;
+       u32 strength;
+       u32 sectors;
+       u32 len;
+};
+
+int mtk_ecc_encode(struct mtk_ecc *, struct mtk_ecc_config *, u8 *, u32);
+void mtk_ecc_get_stats(struct mtk_ecc *, struct mtk_ecc_stats *, int);
+int mtk_ecc_wait_done(struct mtk_ecc *, enum mtk_ecc_operation);
+int mtk_ecc_enable(struct mtk_ecc *, struct mtk_ecc_config *);
+void mtk_ecc_disable(struct mtk_ecc *);
+void mtk_ecc_adjust_strength(u32 *);
+
+struct mtk_ecc *of_mtk_ecc_get(struct device_node *);
+void mtk_ecc_release(struct mtk_ecc *);
+
+#endif
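
The header above is the whole contract between the NAND controller driver and the ECC block. A hedged sketch of a plausible decode sequence using only the calls declared here (error handling trimmed; the strength, sector count and length are placeholders, and the ecc handle would come from of_mtk_ecc_get()):

	struct mtk_ecc_stats stats;
	struct mtk_ecc_config cfg = {
		.op       = ECC_DECODE,
		.mode     = ECC_NFI_MODE,	/* data streamed in by the NFI */
		.strength = 12,			/* placeholder */
		.sectors  = 4,			/* placeholder */
		.len      = 512,		/* placeholder sector data length */
	};

	mtk_ecc_enable(ecc, &cfg);		/* program and start the decoder */
	/* ... trigger the NFI transfer that feeds the decoder ... */
	mtk_ecc_wait_done(ecc, ECC_DECODE);	/* wait for the decode interrupt */
	mtk_ecc_get_stats(ecc, &stats, 4);	/* corrected/failed/bitflip counts */
	mtk_ecc_disable(ecc);			/* idle the engine, release the lock */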
diff --git a/drivers/mtd/nand/mtk_nand.c b/drivers/mtd/nand/mtk_nand.c
new file mode 100644 (file)
index 0000000..ddaa2ac
--- /dev/null
@@ -0,0 +1,1526 @@
+/*
+ * MTK NAND Flash controller driver.
+ * Copyright (C) 2016 MediaTek Inc.
+ * Authors:    Xiaolei Li              <xiaolei.li@mediatek.com>
+ *             Jorge Ramirez-Ortiz     <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/clk.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/mtd.h>
+#include <linux/module.h>
+#include <linux/iopoll.h>
+#include <linux/of.h>
+#include "mtk_ecc.h"
+
+/* NAND controller register definition */
+#define NFI_CNFG               (0x00)
+#define                CNFG_AHB                BIT(0)
+#define                CNFG_READ_EN            BIT(1)
+#define                CNFG_DMA_BURST_EN       BIT(2)
+#define                CNFG_BYTE_RW            BIT(6)
+#define                CNFG_HW_ECC_EN          BIT(8)
+#define                CNFG_AUTO_FMT_EN        BIT(9)
+#define                CNFG_OP_CUST            (6 << 12)
+#define NFI_PAGEFMT            (0x04)
+#define                PAGEFMT_FDM_ECC_SHIFT   (12)
+#define                PAGEFMT_FDM_SHIFT       (8)
+#define                PAGEFMT_SPARE_16        (0)
+#define                PAGEFMT_SPARE_26        (1)
+#define                PAGEFMT_SPARE_27        (2)
+#define                PAGEFMT_SPARE_28        (3)
+#define                PAGEFMT_SPARE_32        (4)
+#define                PAGEFMT_SPARE_36        (5)
+#define                PAGEFMT_SPARE_40        (6)
+#define                PAGEFMT_SPARE_44        (7)
+#define                PAGEFMT_SPARE_48        (8)
+#define                PAGEFMT_SPARE_49        (9)
+#define                PAGEFMT_SPARE_50        (0xa)
+#define                PAGEFMT_SPARE_51        (0xb)
+#define                PAGEFMT_SPARE_52        (0xc)
+#define                PAGEFMT_SPARE_62        (0xd)
+#define                PAGEFMT_SPARE_63        (0xe)
+#define                PAGEFMT_SPARE_64        (0xf)
+#define                PAGEFMT_SPARE_SHIFT     (4)
+#define                PAGEFMT_SEC_SEL_512     BIT(2)
+#define                PAGEFMT_512_2K          (0)
+#define                PAGEFMT_2K_4K           (1)
+#define                PAGEFMT_4K_8K           (2)
+#define                PAGEFMT_8K_16K          (3)
+/* NFI control */
+#define NFI_CON                        (0x08)
+#define                CON_FIFO_FLUSH          BIT(0)
+#define                CON_NFI_RST             BIT(1)
+#define                CON_BRD                 BIT(8)  /* burst  read */
+#define                CON_BWR                 BIT(9)  /* burst  write */
+#define                CON_SEC_SHIFT           (12)
+/* Timing control register */
+#define NFI_ACCCON             (0x0C)
+#define NFI_INTR_EN            (0x10)
+#define                INTR_AHB_DONE_EN        BIT(6)
+#define NFI_INTR_STA           (0x14)
+#define NFI_CMD                        (0x20)
+#define NFI_ADDRNOB            (0x30)
+#define NFI_COLADDR            (0x34)
+#define NFI_ROWADDR            (0x38)
+#define NFI_STRDATA            (0x40)
+#define                STAR_EN                 (1)
+#define                STAR_DE                 (0)
+#define NFI_CNRNB              (0x44)
+#define NFI_DATAW              (0x50)
+#define NFI_DATAR              (0x54)
+#define NFI_PIO_DIRDY          (0x58)
+#define                PIO_DI_RDY              (0x01)
+#define NFI_STA                        (0x60)
+#define                STA_CMD                 BIT(0)
+#define                STA_ADDR                BIT(1)
+#define                STA_BUSY                BIT(8)
+#define                STA_EMP_PAGE            BIT(12)
+#define                NFI_FSM_CUSTDATA        (0xe << 16)
+#define                NFI_FSM_MASK            (0xf << 16)
+#define NFI_ADDRCNTR           (0x70)
+#define                CNTR_MASK               GENMASK(16, 12)
+#define NFI_STRADDR            (0x80)
+#define NFI_BYTELEN            (0x84)
+#define NFI_CSEL               (0x90)
+#define NFI_FDML(x)            (0xA0 + (x) * sizeof(u32) * 2)
+#define NFI_FDMM(x)            (0xA4 + (x) * sizeof(u32) * 2)
+#define NFI_FDM_MAX_SIZE       (8)
+#define NFI_FDM_MIN_SIZE       (1)
+#define NFI_MASTER_STA         (0x224)
+#define                MASTER_STA_MASK         (0x0FFF)
+#define NFI_EMPTY_THRESH       (0x23C)
+
+#define MTK_NAME               "mtk-nand"
+#define KB(x)                  ((x) * 1024UL)
+#define MB(x)                  (KB(x) * 1024UL)
+
+#define MTK_TIMEOUT            (500000)
+#define MTK_RESET_TIMEOUT      (1000000)
+#define MTK_MAX_SECTOR         (16)
+#define MTK_NAND_MAX_NSELS     (2)
+
+struct mtk_nfc_bad_mark_ctl {
+       void (*bm_swap)(struct mtd_info *, u8 *buf, int raw);
+       u32 sec;
+       u32 pos;
+};
+
+/*
+ * FDM: region used to store free OOB data
+ */
+struct mtk_nfc_fdm {
+       u32 reg_size;
+       u32 ecc_size;
+};
+
+struct mtk_nfc_nand_chip {
+       struct list_head node;
+       struct nand_chip nand;
+
+       struct mtk_nfc_bad_mark_ctl bad_mark;
+       struct mtk_nfc_fdm fdm;
+       u32 spare_per_sector;
+
+       int nsels;
+       u8 sels[0];
+       /* nothing after this field */
+};
+
+struct mtk_nfc_clk {
+       struct clk *nfi_clk;
+       struct clk *pad_clk;
+};
+
+struct mtk_nfc {
+       struct nand_hw_control controller;
+       struct mtk_ecc_config ecc_cfg;
+       struct mtk_nfc_clk clk;
+       struct mtk_ecc *ecc;
+
+       struct device *dev;
+       void __iomem *regs;
+
+       struct completion done;
+       struct list_head chips;
+
+       u8 *buffer;
+};
+
+static inline struct mtk_nfc_nand_chip *to_mtk_nand(struct nand_chip *nand)
+{
+       return container_of(nand, struct mtk_nfc_nand_chip, nand);
+}
+
+static inline u8 *data_ptr(struct nand_chip *chip, const u8 *p, int i)
+{
+       return (u8 *)p + i * chip->ecc.size;
+}
+
+static inline u8 *oob_ptr(struct nand_chip *chip, int i)
+{
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       u8 *poi;
+
+       /* map the sector's FDM data to free oob:
+        * the beginning of the oob area stores the FDM data of bad mark sectors
+        */
+
+       if (i < mtk_nand->bad_mark.sec)
+               poi = chip->oob_poi + (i + 1) * mtk_nand->fdm.reg_size;
+       else if (i == mtk_nand->bad_mark.sec)
+               poi = chip->oob_poi;
+       else
+               poi = chip->oob_poi + i * mtk_nand->fdm.reg_size;
+
+       return poi;
+}
+
+static inline int mtk_data_len(struct nand_chip *chip)
+{
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+
+       return chip->ecc.size + mtk_nand->spare_per_sector;
+}
+
+static inline u8 *mtk_data_ptr(struct nand_chip *chip,  int i)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+
+       return nfc->buffer + i * mtk_data_len(chip);
+}
+
+static inline u8 *mtk_oob_ptr(struct nand_chip *chip, int i)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+
+       return nfc->buffer + i * mtk_data_len(chip) + chip->ecc.size;
+}
+
+static inline void nfi_writel(struct mtk_nfc *nfc, u32 val, u32 reg)
+{
+       writel(val, nfc->regs + reg);
+}
+
+static inline void nfi_writew(struct mtk_nfc *nfc, u16 val, u32 reg)
+{
+       writew(val, nfc->regs + reg);
+}
+
+static inline void nfi_writeb(struct mtk_nfc *nfc, u8 val, u32 reg)
+{
+       writeb(val, nfc->regs + reg);
+}
+
+static inline u32 nfi_readl(struct mtk_nfc *nfc, u32 reg)
+{
+       return readl_relaxed(nfc->regs + reg);
+}
+
+static inline u16 nfi_readw(struct mtk_nfc *nfc, u32 reg)
+{
+       return readw_relaxed(nfc->regs + reg);
+}
+
+static inline u8 nfi_readb(struct mtk_nfc *nfc, u32 reg)
+{
+       return readb_relaxed(nfc->regs + reg);
+}
+
+static void mtk_nfc_hw_reset(struct mtk_nfc *nfc)
+{
+       struct device *dev = nfc->dev;
+       u32 val;
+       int ret;
+
+       /* reset all registers and force the NFI master to terminate */
+       nfi_writel(nfc, CON_FIFO_FLUSH | CON_NFI_RST, NFI_CON);
+
+       /* wait for the master to finish the last transaction */
+       ret = readl_poll_timeout(nfc->regs + NFI_MASTER_STA, val,
+                                !(val & MASTER_STA_MASK), 50,
+                                MTK_RESET_TIMEOUT);
+       if (ret)
+               dev_warn(dev, "master active in reset [0x%x] = 0x%x\n",
+                        NFI_MASTER_STA, val);
+
+       /* ensure any status register affected by the NFI master is reset */
+       nfi_writel(nfc, CON_FIFO_FLUSH | CON_NFI_RST, NFI_CON);
+       nfi_writew(nfc, STAR_DE, NFI_STRDATA);
+}
+
+static int mtk_nfc_send_command(struct mtk_nfc *nfc, u8 command)
+{
+       struct device *dev = nfc->dev;
+       u32 val;
+       int ret;
+
+       nfi_writel(nfc, command, NFI_CMD);
+
+       ret = readl_poll_timeout_atomic(nfc->regs + NFI_STA, val,
+                                       !(val & STA_CMD), 10,  MTK_TIMEOUT);
+       if (ret) {
+               dev_warn(dev, "nfi core timed out entering command mode\n");
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static int mtk_nfc_send_address(struct mtk_nfc *nfc, int addr)
+{
+       struct device *dev = nfc->dev;
+       u32 val;
+       int ret;
+
+       nfi_writel(nfc, addr, NFI_COLADDR);
+       nfi_writel(nfc, 0, NFI_ROWADDR);
+       nfi_writew(nfc, 1, NFI_ADDRNOB);
+
+       ret = readl_poll_timeout_atomic(nfc->regs + NFI_STA, val,
+                                       !(val & STA_ADDR), 10, MTK_TIMEOUT);
+       if (ret) {
+               dev_warn(dev, "nfi core timed out entering address mode\n");
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static int mtk_nfc_hw_runtime_config(struct mtd_info *mtd)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       u32 fmt, spare;
+
+       if (!mtd->writesize)
+               return 0;
+
+       spare = mtk_nand->spare_per_sector;
+
+       switch (mtd->writesize) {
+       case 512:
+               fmt = PAGEFMT_512_2K | PAGEFMT_SEC_SEL_512;
+               break;
+       case KB(2):
+               if (chip->ecc.size == 512)
+                       fmt = PAGEFMT_2K_4K | PAGEFMT_SEC_SEL_512;
+               else
+                       fmt = PAGEFMT_512_2K;
+               break;
+       case KB(4):
+               if (chip->ecc.size == 512)
+                       fmt = PAGEFMT_4K_8K | PAGEFMT_SEC_SEL_512;
+               else
+                       fmt = PAGEFMT_2K_4K;
+               break;
+       case KB(8):
+               if (chip->ecc.size == 512)
+                       fmt = PAGEFMT_8K_16K | PAGEFMT_SEC_SEL_512;
+               else
+                       fmt = PAGEFMT_4K_8K;
+               break;
+       case KB(16):
+               fmt = PAGEFMT_8K_16K;
+               break;
+       default:
+               dev_err(nfc->dev, "invalid page len: %d\n", mtd->writesize);
+               return -EINVAL;
+       }
+
+       /*
+        * the hardware will double the value for this eccsize, so we need to
+        * halve it
+        */
+       if (chip->ecc.size == 1024)
+               spare >>= 1;
+
+       switch (spare) {
+       case 16:
+               fmt |= (PAGEFMT_SPARE_16 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 26:
+               fmt |= (PAGEFMT_SPARE_26 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 27:
+               fmt |= (PAGEFMT_SPARE_27 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 28:
+               fmt |= (PAGEFMT_SPARE_28 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 32:
+               fmt |= (PAGEFMT_SPARE_32 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 36:
+               fmt |= (PAGEFMT_SPARE_36 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 40:
+               fmt |= (PAGEFMT_SPARE_40 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 44:
+               fmt |= (PAGEFMT_SPARE_44 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 48:
+               fmt |= (PAGEFMT_SPARE_48 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 49:
+               fmt |= (PAGEFMT_SPARE_49 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 50:
+               fmt |= (PAGEFMT_SPARE_50 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 51:
+               fmt |= (PAGEFMT_SPARE_51 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 52:
+               fmt |= (PAGEFMT_SPARE_52 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 62:
+               fmt |= (PAGEFMT_SPARE_62 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 63:
+               fmt |= (PAGEFMT_SPARE_63 << PAGEFMT_SPARE_SHIFT);
+               break;
+       case 64:
+               fmt |= (PAGEFMT_SPARE_64 << PAGEFMT_SPARE_SHIFT);
+               break;
+       default:
+               dev_err(nfc->dev, "invalid spare per sector %d\n", spare);
+               return -EINVAL;
+       }
+
+       fmt |= mtk_nand->fdm.reg_size << PAGEFMT_FDM_SHIFT;
+       fmt |= mtk_nand->fdm.ecc_size << PAGEFMT_FDM_ECC_SHIFT;
+       nfi_writew(nfc, fmt, NFI_PAGEFMT);
+
+       nfc->ecc_cfg.strength = chip->ecc.strength;
+       nfc->ecc_cfg.len = chip->ecc.size + mtk_nand->fdm.ecc_size;
+
+       return 0;
+}
+
+static void mtk_nfc_select_chip(struct mtd_info *mtd, int chip)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct mtk_nfc *nfc = nand_get_controller_data(nand);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(nand);
+
+       if (chip < 0)
+               return;
+
+       mtk_nfc_hw_runtime_config(mtd);
+
+       nfi_writel(nfc, mtk_nand->sels[chip], NFI_CSEL);
+}
+
+static int mtk_nfc_dev_ready(struct mtd_info *mtd)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd));
+
+       if (nfi_readl(nfc, NFI_STA) & STA_BUSY)
+               return 0;
+
+       return 1;
+}
+
+static void mtk_nfc_cmd_ctrl(struct mtd_info *mtd, int dat, unsigned int ctrl)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd));
+
+       if (ctrl & NAND_ALE) {
+               mtk_nfc_send_address(nfc, dat);
+       } else if (ctrl & NAND_CLE) {
+               mtk_nfc_hw_reset(nfc);
+
+               nfi_writew(nfc, CNFG_OP_CUST, NFI_CNFG);
+               mtk_nfc_send_command(nfc, dat);
+       }
+}
+
+static inline void mtk_nfc_wait_ioready(struct mtk_nfc *nfc)
+{
+       int rc;
+       u8 val;
+
+       rc = readb_poll_timeout_atomic(nfc->regs + NFI_PIO_DIRDY, val,
+                                      val & PIO_DI_RDY, 10, MTK_TIMEOUT);
+       if (rc < 0)
+               dev_err(nfc->dev, "data not ready\n");
+}
+
+static inline u8 mtk_nfc_read_byte(struct mtd_info *mtd)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       u32 reg;
+
+       /* after each byte read, the NFI_STA reg is reset by the hardware */
+       reg = nfi_readl(nfc, NFI_STA) & NFI_FSM_MASK;
+       if (reg != NFI_FSM_CUSTDATA) {
+               reg = nfi_readw(nfc, NFI_CNFG);
+               reg |= CNFG_BYTE_RW | CNFG_READ_EN;
+               nfi_writew(nfc, reg, NFI_CNFG);
+
+               /*
+                * set to max sector to allow the HW to continue reading over
+                * unaligned accesses
+                */
+               reg = (MTK_MAX_SECTOR << CON_SEC_SHIFT) | CON_BRD;
+               nfi_writel(nfc, reg, NFI_CON);
+
+               /* trigger to fetch data */
+               nfi_writew(nfc, STAR_EN, NFI_STRDATA);
+       }
+
+       mtk_nfc_wait_ioready(nfc);
+
+       return nfi_readb(nfc, NFI_DATAR);
+}
+
+static void mtk_nfc_read_buf(struct mtd_info *mtd, u8 *buf, int len)
+{
+       int i;
+
+       for (i = 0; i < len; i++)
+               buf[i] = mtk_nfc_read_byte(mtd);
+}
+
+static void mtk_nfc_write_byte(struct mtd_info *mtd, u8 byte)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd));
+       u32 reg;
+
+       reg = nfi_readl(nfc, NFI_STA) & NFI_FSM_MASK;
+
+       if (reg != NFI_FSM_CUSTDATA) {
+               reg = nfi_readw(nfc, NFI_CNFG) | CNFG_BYTE_RW;
+               nfi_writew(nfc, reg, NFI_CNFG);
+
+               reg = MTK_MAX_SECTOR << CON_SEC_SHIFT | CON_BWR;
+               nfi_writel(nfc, reg, NFI_CON);
+
+               nfi_writew(nfc, STAR_EN, NFI_STRDATA);
+       }
+
+       mtk_nfc_wait_ioready(nfc);
+       nfi_writeb(nfc, byte, NFI_DATAW);
+}
+
+static void mtk_nfc_write_buf(struct mtd_info *mtd, const u8 *buf, int len)
+{
+       int i;
+
+       for (i = 0; i < len; i++)
+               mtk_nfc_write_byte(mtd, buf[i]);
+}
+
+static int mtk_nfc_sector_encode(struct nand_chip *chip, u8 *data)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       int size = chip->ecc.size + mtk_nand->fdm.reg_size;
+
+       nfc->ecc_cfg.mode = ECC_DMA_MODE;
+       nfc->ecc_cfg.op = ECC_ENCODE;
+
+       return mtk_ecc_encode(nfc->ecc, &nfc->ecc_cfg, data, size);
+}
+
+static void mtk_nfc_no_bad_mark_swap(struct mtd_info *a, u8 *b, int c)
+{
+       /* nop */
+}
+
+static void mtk_nfc_bad_mark_swap(struct mtd_info *mtd, u8 *buf, int raw)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct mtk_nfc_nand_chip *nand = to_mtk_nand(chip);
+       u32 bad_pos = nand->bad_mark.pos;
+
+       if (raw)
+               bad_pos += nand->bad_mark.sec * mtk_data_len(chip);
+       else
+               bad_pos += nand->bad_mark.sec * chip->ecc.size;
+
+       swap(chip->oob_poi[0], buf[bad_pos]);
+}
+
+static int mtk_nfc_format_subpage(struct mtd_info *mtd, u32 offset,
+                                 u32 len, const u8 *buf)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+       u32 start, end;
+       int i, ret;
+
+       start = offset / chip->ecc.size;
+       end = DIV_ROUND_UP(offset + len, chip->ecc.size);
+
+       memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize);
+       for (i = 0; i < chip->ecc.steps; i++) {
+               memcpy(mtk_data_ptr(chip, i), data_ptr(chip, buf, i),
+                      chip->ecc.size);
+
+               if (start > i || i >= end)
+                       continue;
+
+               if (i == mtk_nand->bad_mark.sec)
+                       mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1);
+
+               memcpy(mtk_oob_ptr(chip, i), oob_ptr(chip, i), fdm->reg_size);
+
+               /* program the ECC parity data back to the OOB */
+               ret = mtk_nfc_sector_encode(chip, mtk_data_ptr(chip, i));
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static void mtk_nfc_format_page(struct mtd_info *mtd, const u8 *buf)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+       u32 i;
+
+       memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize);
+       for (i = 0; i < chip->ecc.steps; i++) {
+               if (buf)
+                       memcpy(mtk_data_ptr(chip, i), data_ptr(chip, buf, i),
+                              chip->ecc.size);
+
+               if (i == mtk_nand->bad_mark.sec)
+                       mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1);
+
+               memcpy(mtk_oob_ptr(chip, i), oob_ptr(chip, i), fdm->reg_size);
+       }
+}
+
+static inline void mtk_nfc_read_fdm(struct nand_chip *chip, u32 start,
+                                   u32 sectors)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+       u32 vall, valm;
+       u8 *oobptr;
+       int i, j;
+
+       for (i = 0; i < sectors; i++) {
+               oobptr = oob_ptr(chip, start + i);
+               vall = nfi_readl(nfc, NFI_FDML(i));
+               valm = nfi_readl(nfc, NFI_FDMM(i));
+
+               for (j = 0; j < fdm->reg_size; j++)
+                       oobptr[j] = (j >= 4 ? valm : vall) >> ((j % 4) * 8);
+       }
+}
+
+static inline void mtk_nfc_write_fdm(struct nand_chip *chip)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+       u32 vall, valm;
+       u8 *oobptr;
+       int i, j;
+
+       for (i = 0; i < chip->ecc.steps; i++) {
+               oobptr = oob_ptr(chip, i);
+               vall = 0;
+               valm = 0;
+               for (j = 0; j < 8; j++) {
+                       if (j < 4)
+                               vall |= (j < fdm->reg_size ? oobptr[j] : 0xff)
+                                               << (j * 8);
+                       else
+                               valm |= (j < fdm->reg_size ? oobptr[j] : 0xff)
+                                               << ((j - 4) * 8);
+               }
+               nfi_writel(nfc, vall, NFI_FDML(i));
+               nfi_writel(nfc, valm, NFI_FDMM(i));
+       }
+}
+
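mtk_nfc_read_fdm() and mtk_nfc_write_fdm() shuttle each sector's FDM (free OOB) bytes between chip->oob_poi and the per-sector NFI_FDML/NFI_FDMM register pair: bytes 0-3 of a sector sit in FDML and bytes 4-7 in FDMM, least significant byte first, with lanes beyond fdm->reg_size padded with 0xff. A minimal standalone sketch of that packing, for illustration only:

#include <stdint.h>

/* Mirrors the byte-lane math of mtk_nfc_write_fdm() for one sector. */
void fdm_pack(const uint8_t *oob, unsigned int reg_size,
              uint32_t *fdml, uint32_t *fdmm)
{
        unsigned int j;

        *fdml = 0;
        *fdmm = 0;
        for (j = 0; j < 8; j++) {
                uint32_t byte = j < reg_size ? oob[j] : 0xff;

                if (j < 4)
                        *fdml |= byte << (j * 8);
                else
                        *fdmm |= byte << ((j - 4) * 8);
        }
}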
+static int mtk_nfc_do_write_page(struct mtd_info *mtd, struct nand_chip *chip,
+                                const u8 *buf, int page, int len)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct device *dev = nfc->dev;
+       dma_addr_t addr;
+       u32 reg;
+       int ret;
+
+       addr = dma_map_single(dev, (void *)buf, len, DMA_TO_DEVICE);
+       ret = dma_mapping_error(nfc->dev, addr);
+       if (ret) {
+               dev_err(nfc->dev, "dma mapping error\n");
+               return -EINVAL;
+       }
+
+       reg = nfi_readw(nfc, NFI_CNFG) | CNFG_AHB | CNFG_DMA_BURST_EN;
+       nfi_writew(nfc, reg, NFI_CNFG);
+
+       nfi_writel(nfc, chip->ecc.steps << CON_SEC_SHIFT, NFI_CON);
+       nfi_writel(nfc, lower_32_bits(addr), NFI_STRADDR);
+       nfi_writew(nfc, INTR_AHB_DONE_EN, NFI_INTR_EN);
+
+       init_completion(&nfc->done);
+
+       reg = nfi_readl(nfc, NFI_CON) | CON_BWR;
+       nfi_writel(nfc, reg, NFI_CON);
+       nfi_writew(nfc, STAR_EN, NFI_STRDATA);
+
+       ret = wait_for_completion_timeout(&nfc->done, msecs_to_jiffies(500));
+       if (!ret) {
+               dev_err(dev, "program ahb done timeout\n");
+               nfi_writew(nfc, 0, NFI_INTR_EN);
+               ret = -ETIMEDOUT;
+               goto timeout;
+       }
+
+       ret = readl_poll_timeout_atomic(nfc->regs + NFI_ADDRCNTR, reg,
+                                       (reg & CNTR_MASK) >= chip->ecc.steps,
+                                       10, MTK_TIMEOUT);
+       if (ret)
+               dev_err(dev, "hwecc write timeout\n");
+
+timeout:
+
+       dma_unmap_single(nfc->dev, addr, len, DMA_TO_DEVICE);
+       nfi_writel(nfc, 0, NFI_CON);
+
+       return ret;
+}
+
+static int mtk_nfc_write_page(struct mtd_info *mtd, struct nand_chip *chip,
+                             const u8 *buf, int page, int raw)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       size_t len;
+       const u8 *bufpoi;
+       u32 reg;
+       int ret;
+
+       if (!raw) {
+               /* OOB => FDM: from register, ECC: from HW */
+               reg = nfi_readw(nfc, NFI_CNFG) | CNFG_AUTO_FMT_EN;
+               nfi_writew(nfc, reg | CNFG_HW_ECC_EN, NFI_CNFG);
+
+               nfc->ecc_cfg.op = ECC_ENCODE;
+               nfc->ecc_cfg.mode = ECC_NFI_MODE;
+               ret = mtk_ecc_enable(nfc->ecc, &nfc->ecc_cfg);
+               if (ret) {
+                       /* clear NFI config */
+                       reg = nfi_readw(nfc, NFI_CNFG);
+                       reg &= ~(CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN);
+                       nfi_writew(nfc, reg, NFI_CNFG);
+
+                       return ret;
+               }
+
+               memcpy(nfc->buffer, buf, mtd->writesize);
+               mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, raw);
+               bufpoi = nfc->buffer;
+
+               /* write OOB into the FDM registers (OOB area in MTK NAND) */
+               mtk_nfc_write_fdm(chip);
+       } else {
+               bufpoi = buf;
+       }
+
+       len = mtd->writesize + (raw ? mtd->oobsize : 0);
+       ret = mtk_nfc_do_write_page(mtd, chip, bufpoi, page, len);
+
+       if (!raw)
+               mtk_ecc_disable(nfc->ecc);
+
+       return ret;
+}
+
+static int mtk_nfc_write_page_hwecc(struct mtd_info *mtd,
+                                   struct nand_chip *chip, const u8 *buf,
+                                   int oob_on, int page)
+{
+       return mtk_nfc_write_page(mtd, chip, buf, page, 0);
+}
+
+static int mtk_nfc_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
+                                 const u8 *buf, int oob_on, int pg)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+
+       mtk_nfc_format_page(mtd, buf);
+       return mtk_nfc_write_page(mtd, chip, nfc->buffer, pg, 1);
+}
+
+static int mtk_nfc_write_subpage_hwecc(struct mtd_info *mtd,
+                                      struct nand_chip *chip, u32 offset,
+                                      u32 data_len, const u8 *buf,
+                                      int oob_on, int page)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       int ret;
+
+       ret = mtk_nfc_format_subpage(mtd, offset, data_len, buf);
+       if (ret < 0)
+               return ret;
+
+       /* use the data in the private buffer (now with FDM and ECC parity) */
+       return mtk_nfc_write_page(mtd, chip, nfc->buffer, page, 1);
+}
+
+static int mtk_nfc_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
+                                int page)
+{
+       int ret;
+
+       chip->cmdfunc(mtd, NAND_CMD_SEQIN, 0x00, page);
+
+       ret = mtk_nfc_write_page_raw(mtd, chip, NULL, 1, page);
+       if (ret < 0)
+               return -EIO;
+
+       chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
+       ret = chip->waitfunc(mtd, chip);
+
+       return ret & NAND_STATUS_FAIL ? -EIO : 0;
+}
+
+static int mtk_nfc_update_ecc_stats(struct mtd_info *mtd, u8 *buf, u32 sectors)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       struct mtk_ecc_stats stats;
+       int rc, i;
+
+       rc = nfi_readl(nfc, NFI_STA) & STA_EMP_PAGE;
+       if (rc) {
+               memset(buf, 0xff, sectors * chip->ecc.size);
+               for (i = 0; i < sectors; i++)
+                       memset(oob_ptr(chip, i), 0xff, mtk_nand->fdm.reg_size);
+               return 0;
+       }
+
+       mtk_ecc_get_stats(nfc->ecc, &stats, sectors);
+       mtd->ecc_stats.corrected += stats.corrected;
+       mtd->ecc_stats.failed += stats.failed;
+
+       return stats.bitflips;
+}
+
+static int mtk_nfc_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
+                               u32 data_offs, u32 readlen,
+                               u8 *bufpoi, int page, int raw)
+{
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       u32 spare = mtk_nand->spare_per_sector;
+       u32 column, sectors, start, end, reg;
+       dma_addr_t addr;
+       int bitflips;
+       size_t len;
+       u8 *buf;
+       int rc;
+
+       start = data_offs / chip->ecc.size;
+       end = DIV_ROUND_UP(data_offs + readlen, chip->ecc.size);
+
+       sectors = end - start;
+       column = start * (chip->ecc.size + spare);
+
+       len = sectors * chip->ecc.size + (raw ? sectors * spare : 0);
+       buf = bufpoi + start * chip->ecc.size;
+
+       if (column != 0)
+               chip->cmdfunc(mtd, NAND_CMD_RNDOUT, column, -1);
+
+       addr = dma_map_single(nfc->dev, buf, len, DMA_FROM_DEVICE);
+       rc = dma_mapping_error(nfc->dev, addr);
+       if (rc) {
+               dev_err(nfc->dev, "dma mapping error\n");
+
+               return -EINVAL;
+       }
+
+       reg = nfi_readw(nfc, NFI_CNFG);
+       reg |= CNFG_READ_EN | CNFG_DMA_BURST_EN | CNFG_AHB;
+       if (!raw) {
+               reg |= CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN;
+               nfi_writew(nfc, reg, NFI_CNFG);
+
+               nfc->ecc_cfg.mode = ECC_NFI_MODE;
+               nfc->ecc_cfg.sectors = sectors;
+               nfc->ecc_cfg.op = ECC_DECODE;
+               rc = mtk_ecc_enable(nfc->ecc, &nfc->ecc_cfg);
+               if (rc) {
+                       dev_err(nfc->dev, "failed to enable ecc\n");
+                       /* clear NFI_CNFG */
+                       reg &= ~(CNFG_DMA_BURST_EN | CNFG_AHB | CNFG_READ_EN |
+                               CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN);
+                       nfi_writew(nfc, reg, NFI_CNFG);
+                       dma_unmap_single(nfc->dev, addr, len, DMA_FROM_DEVICE);
+
+                       return rc;
+               }
+       } else {
+               nfi_writew(nfc, reg, NFI_CNFG);
+       }
+
+       nfi_writel(nfc, sectors << CON_SEC_SHIFT, NFI_CON);
+       nfi_writew(nfc, INTR_AHB_DONE_EN, NFI_INTR_EN);
+       nfi_writel(nfc, lower_32_bits(addr), NFI_STRADDR);
+
+       init_completion(&nfc->done);
+       reg = nfi_readl(nfc, NFI_CON) | CON_BRD;
+       nfi_writel(nfc, reg, NFI_CON);
+       nfi_writew(nfc, STAR_EN, NFI_STRDATA);
+
+       rc = wait_for_completion_timeout(&nfc->done, msecs_to_jiffies(500));
+       if (!rc)
+               dev_warn(nfc->dev, "read ahb/dma done timeout\n");
+
+       rc = readl_poll_timeout_atomic(nfc->regs + NFI_BYTELEN, reg,
+                                      (reg & CNTR_MASK) >= sectors, 10,
+                                      MTK_TIMEOUT);
+       if (rc < 0) {
+               dev_err(nfc->dev, "subpage done timeout\n");
+               bitflips = -EIO;
+       } else {
+               bitflips = 0;
+               if (!raw) {
+                       rc = mtk_ecc_wait_done(nfc->ecc, ECC_DECODE);
+                       bitflips = rc < 0 ? -ETIMEDOUT :
+                               mtk_nfc_update_ecc_stats(mtd, buf, sectors);
+                       mtk_nfc_read_fdm(chip, start, sectors);
+               }
+       }
+
+       dma_unmap_single(nfc->dev, addr, len, DMA_FROM_DEVICE);
+
+       if (raw)
+               goto done;
+
+       mtk_ecc_disable(nfc->ecc);
+
+       if (clamp(mtk_nand->bad_mark.sec, start, end) == mtk_nand->bad_mark.sec)
+               mtk_nand->bad_mark.bm_swap(mtd, bufpoi, raw);
+done:
+       nfi_writel(nfc, 0, NFI_CON);
+
+       return bitflips;
+}
+
+static int mtk_nfc_read_subpage_hwecc(struct mtd_info *mtd,
+                                     struct nand_chip *chip, u32 off,
+                                     u32 len, u8 *p, int pg)
+{
+       return mtk_nfc_read_subpage(mtd, chip, off, len, p, pg, 0);
+}
+
+static int mtk_nfc_read_page_hwecc(struct mtd_info *mtd,
+                                  struct nand_chip *chip, u8 *p,
+                                  int oob_on, int pg)
+{
+       return mtk_nfc_read_subpage(mtd, chip, 0, mtd->writesize, p, pg, 0);
+}
+
+static int mtk_nfc_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
+                                u8 *buf, int oob_on, int page)
+{
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       struct mtk_nfc *nfc = nand_get_controller_data(chip);
+       struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+       int i, ret;
+
+       memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize);
+       ret = mtk_nfc_read_subpage(mtd, chip, 0, mtd->writesize, nfc->buffer,
+                                  page, 1);
+       if (ret < 0)
+               return ret;
+
+       for (i = 0; i < chip->ecc.steps; i++) {
+               memcpy(oob_ptr(chip, i), mtk_oob_ptr(chip, i), fdm->reg_size);
+
+               if (i == mtk_nand->bad_mark.sec)
+                       mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1);
+
+               if (buf)
+                       memcpy(data_ptr(chip, buf, i), mtk_data_ptr(chip, i),
+                              chip->ecc.size);
+       }
+
+       return ret;
+}
+
+static int mtk_nfc_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
+                               int page)
+{
+       chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
+
+       return mtk_nfc_read_page_raw(mtd, chip, NULL, 1, page);
+}
+
+static inline void mtk_nfc_hw_init(struct mtk_nfc *nfc)
+{
+       /*
+        * ACCON: access timing control register
+        * -------------------------------------
+        * 31:28: minimum required time for CS post pulling down after accessing
+        *      the device
+        * 27:22: minimum required time for CS pre pulling down before accessing
+        *      the device
+        * 21:16: minimum required time from NCEB low to NREB low
+        * 15:12: minimum required time from NWEB high to NREB low.
+        * 11:08: write enable hold time
+        * 07:04: write wait states
+        * 03:00: read wait states
+        */
+       nfi_writel(nfc, 0x10804211, NFI_ACCCON);
+
+       /*
+        * CNRNB: nand ready/busy register
+        * -------------------------------
+        * 7:4: timeout register for polling the NAND busy/ready signal
+        * 0  : poll the status of the busy/ready signal after [7:4]*16 cycles.
+        */
+       nfi_writew(nfc, 0xf1, NFI_CNRNB);
+       nfi_writew(nfc, PAGEFMT_8K_16K, NFI_PAGEFMT);
+
+       mtk_nfc_hw_reset(nfc);
+
+       nfi_readl(nfc, NFI_INTR_STA);
+       nfi_writel(nfc, 0, NFI_INTR_EN);
+}
+
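As a worked example of the register layout documented in the comment above, the hard-coded 0x10804211 programmed into NFI_ACCCON splits into the timing fields below; the decode simply follows the bit ranges listed in the comment, and the field labels are illustrative only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t acccon = 0x10804211;

        printf("CS post-pull time : %u\n", (acccon >> 28) & 0xf);  /* 1 */
        printf("CS pre-pull time  : %u\n", (acccon >> 22) & 0x3f); /* 2 */
        printf("NCEB->NREB time   : %u\n", (acccon >> 16) & 0x3f); /* 0 */
        printf("NWEB->NREB time   : %u\n", (acccon >> 12) & 0xf);  /* 4 */
        printf("write enable hold : %u\n", (acccon >> 8) & 0xf);   /* 2 */
        printf("write wait states : %u\n", (acccon >> 4) & 0xf);   /* 1 */
        printf("read wait states  : %u\n", acccon & 0xf);          /* 1 */
        return 0;
}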
+static irqreturn_t mtk_nfc_irq(int irq, void *id)
+{
+       struct mtk_nfc *nfc = id;
+       u16 sta, ien;
+
+       sta = nfi_readw(nfc, NFI_INTR_STA);
+       ien = nfi_readw(nfc, NFI_INTR_EN);
+
+       if (!(sta & ien))
+               return IRQ_NONE;
+
+       nfi_writew(nfc, ~sta & ien, NFI_INTR_EN);
+       complete(&nfc->done);
+
+       return IRQ_HANDLED;
+}
+
+static int mtk_nfc_enable_clk(struct device *dev, struct mtk_nfc_clk *clk)
+{
+       int ret;
+
+       ret = clk_prepare_enable(clk->nfi_clk);
+       if (ret) {
+               dev_err(dev, "failed to enable nfi clk\n");
+               return ret;
+       }
+
+       ret = clk_prepare_enable(clk->pad_clk);
+       if (ret) {
+               dev_err(dev, "failed to enable pad clk\n");
+               clk_disable_unprepare(clk->nfi_clk);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void mtk_nfc_disable_clk(struct mtk_nfc_clk *clk)
+{
+       clk_disable_unprepare(clk->nfi_clk);
+       clk_disable_unprepare(clk->pad_clk);
+}
+
+static int mtk_nfc_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oob_region)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+       u32 eccsteps;
+
+       eccsteps = mtd->writesize / chip->ecc.size;
+
+       if (section >= eccsteps)
+               return -ERANGE;
+
+       oob_region->length = fdm->reg_size - fdm->ecc_size;
+       oob_region->offset = section * fdm->reg_size + fdm->ecc_size;
+
+       return 0;
+}
+
+static int mtk_nfc_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oob_region)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+       u32 eccsteps;
+
+       if (section)
+               return -ERANGE;
+
+       eccsteps = mtd->writesize / chip->ecc.size;
+       oob_region->offset = mtk_nand->fdm.reg_size * eccsteps;
+       oob_region->length = mtd->oobsize - oob_region->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops mtk_nfc_ooblayout_ops = {
+       .free = mtk_nfc_ooblayout_free,
+       .ecc = mtk_nfc_ooblayout_ecc,
+};
+
+static void mtk_nfc_set_fdm(struct mtk_nfc_fdm *fdm, struct mtd_info *mtd)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct mtk_nfc_nand_chip *chip = to_mtk_nand(nand);
+       u32 ecc_bytes;
+
+       ecc_bytes = DIV_ROUND_UP(nand->ecc.strength * ECC_PARITY_BITS, 8);
+
+       fdm->reg_size = chip->spare_per_sector - ecc_bytes;
+       if (fdm->reg_size > NFI_FDM_MAX_SIZE)
+               fdm->reg_size = NFI_FDM_MAX_SIZE;
+
+       /* bad block mark storage */
+       fdm->ecc_size = 1;
+}
+
+static void mtk_nfc_set_bad_mark_ctl(struct mtk_nfc_bad_mark_ctl *bm_ctl,
+                                    struct mtd_info *mtd)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+
+       if (mtd->writesize == 512) {
+               bm_ctl->bm_swap = mtk_nfc_no_bad_mark_swap;
+       } else {
+               bm_ctl->bm_swap = mtk_nfc_bad_mark_swap;
+               bm_ctl->sec = mtd->writesize / mtk_data_len(nand);
+               bm_ctl->pos = mtd->writesize % mtk_data_len(nand);
+       }
+}
+
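mtk_nfc_set_bad_mark_ctl() records, for pages larger than 512 bytes, which sector and in-sector offset the factory bad block marker ends up in once the page is laid out as data+spare chunks (the marker lives at flash column mtd->writesize, the first OOB byte), so the swap helpers above can exchange that byte with oob_poi[0]. A hedged worked example, assuming a 2048-byte page, 512-byte ECC sectors and 28 spare bytes per sector:

#include <stdio.h>

int main(void)
{
        unsigned int writesize = 2048;
        unsigned int seclen = 512 + 28;         /* mtk_data_len(): data + spare */
        unsigned int sec = writesize / seclen;  /* 3 */
        unsigned int pos = writesize % seclen;  /* 428 */

        /* raw layout: sec * seclen + pos == 2048, i.e. flash column writesize */
        printf("marker in sector %u, %u bytes into it\n", sec, pos);
        return 0;
}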
+static void mtk_nfc_set_spare_per_sector(u32 *sps, struct mtd_info *mtd)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       u32 spare[] = {16, 26, 27, 28, 32, 36, 40, 44,
+                       48, 49, 50, 51, 52, 62, 63, 64};
+       u32 eccsteps, i;
+
+       eccsteps = mtd->writesize / nand->ecc.size;
+       *sps = mtd->oobsize / eccsteps;
+
+       if (nand->ecc.size == 1024)
+               *sps >>= 1;
+
+       for (i = 0; i < ARRAY_SIZE(spare); i++) {
+               if (*sps <= spare[i]) {
+                       if (!i)
+                               *sps = spare[i];
+                       else if (*sps != spare[i])
+                               *sps = spare[i - 1];
+                       break;
+               }
+       }
+
+       if (i >= ARRAY_SIZE(spare))
+               *sps = spare[ARRAY_SIZE(spare) - 1];
+
+       if (nand->ecc.size == 1024)
+               *sps <<= 1;
+}
+
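mtk_nfc_set_spare_per_sector() spreads the raw OOB evenly over the ECC steps and then snaps the result onto one of the spare sizes the controller supports, rounding down when there is no exact match and halving/doubling around the lookup for 1024-byte ECC sectors. A standalone sketch of the lookup with two assumed geometries:

#include <stdio.h>

static unsigned int round_spare(unsigned int sps)
{
        static const unsigned int spare[] = {16, 26, 27, 28, 32, 36, 40, 44,
                                             48, 49, 50, 51, 52, 62, 63, 64};
        unsigned int i;

        for (i = 0; i < sizeof(spare) / sizeof(spare[0]); i++) {
                if (sps <= spare[i])
                        return i ? (sps == spare[i] ? sps : spare[i - 1])
                                 : spare[0];
        }
        return spare[sizeof(spare) / sizeof(spare[0]) - 1];
}

int main(void)
{
        /* 2048B page, 112B OOB, 512B ECC sectors: 112 / 4 = 28, exact match */
        printf("%u\n", round_spare(112 / 4));   /* 28 */
        /* 2048B page, 120B OOB, 512B ECC sectors: 30 rounds down to 28 */
        printf("%u\n", round_spare(120 / 4));   /* 28 */
        return 0;
}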
+static int mtk_nfc_ecc_init(struct device *dev, struct mtd_info *mtd)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       u32 spare;
+       int free;
+
+       /* support only ecc hw mode */
+       if (nand->ecc.mode != NAND_ECC_HW) {
+               dev_err(dev, "ecc.mode not supported\n");
+               return -EINVAL;
+       }
+
+       /* if optional dt settings not present */
+       if (!nand->ecc.size || !nand->ecc.strength) {
+               /* use datasheet requirements */
+               nand->ecc.strength = nand->ecc_strength_ds;
+               nand->ecc.size = nand->ecc_step_ds;
+
+               /*
+                * align eccstrength and eccsize
+                * this controller only supports 512 and 1024 sizes
+                */
+               if (nand->ecc.size < 1024) {
+                       if (mtd->writesize > 512) {
+                               nand->ecc.size = 1024;
+                               nand->ecc.strength <<= 1;
+                       } else {
+                               nand->ecc.size = 512;
+                       }
+               } else {
+                       nand->ecc.size = 1024;
+               }
+
+               mtk_nfc_set_spare_per_sector(&spare, mtd);
+
+               /* calculate oob bytes except ecc parity data */
+               free = ((nand->ecc.strength * ECC_PARITY_BITS) + 7) >> 3;
+               free = spare - free;
+
+               /*
+                * enhance ecc strength if oob left is bigger than max FDM size
+                * or reduce ecc strength if oob size is not enough for ecc
+                * parity data.
+                */
+               if (free > NFI_FDM_MAX_SIZE) {
+                       spare -= NFI_FDM_MAX_SIZE;
+                       nand->ecc.strength = (spare << 3) / ECC_PARITY_BITS;
+               } else if (free < 0) {
+                       spare -= NFI_FDM_MIN_SIZE;
+                       nand->ecc.strength = (spare << 3) / ECC_PARITY_BITS;
+               }
+       }
+
+       mtk_ecc_adjust_strength(&nand->ecc.strength);
+
+       dev_info(dev, "eccsize %d eccstrength %d\n",
+                nand->ecc.size, nand->ecc.strength);
+
+       return 0;
+}
+
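The sizing above can be followed with concrete numbers. The figures below are an assumed example only (2048-byte page, 64 OOB bytes, datasheet requirement of 4 bits per 512 bytes) and take ECC_PARITY_BITS as 14 and NFI_FDM_MAX_SIZE as 8 purely for the arithmetic; check the definitions in the MTK ECC/NFC sources before relying on them:

    ecc.size     : 512 -> 1024 (page larger than 512 bytes), ecc.strength: 4 -> 8
    spare/sector : 64 / 2 steps = 32, which the spare table keeps at 32
    parity bytes : DIV_ROUND_UP(8 * 14, 8) = 14
    free OOB     : 32 - 14 = 18, more than the 8-byte FDM maximum, so the
                   surplus is folded back into ECC: strength = ((32 - 8) * 8) / 14
                   = 13, which mtk_ecc_adjust_strength() then rounds to a
                   strength the ECC engine actually supports.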
+static int mtk_nfc_nand_chip_init(struct device *dev, struct mtk_nfc *nfc,
+                                 struct device_node *np)
+{
+       struct mtk_nfc_nand_chip *chip;
+       struct nand_chip *nand;
+       struct mtd_info *mtd;
+       int nsels, len;
+       u32 tmp;
+       int ret;
+       int i;
+
+       if (!of_get_property(np, "reg", &nsels))
+               return -ENODEV;
+
+       nsels /= sizeof(u32);
+       if (!nsels || nsels > MTK_NAND_MAX_NSELS) {
+               dev_err(dev, "invalid reg property size %d\n", nsels);
+               return -EINVAL;
+       }
+
+       chip = devm_kzalloc(dev, sizeof(*chip) + nsels * sizeof(u8),
+                           GFP_KERNEL);
+       if (!chip)
+               return -ENOMEM;
+
+       chip->nsels = nsels;
+       for (i = 0; i < nsels; i++) {
+               ret = of_property_read_u32_index(np, "reg", i, &tmp);
+               if (ret) {
+                       dev_err(dev, "reg property failure : %d\n", ret);
+                       return ret;
+               }
+               chip->sels[i] = tmp;
+       }
+
+       nand = &chip->nand;
+       nand->controller = &nfc->controller;
+
+       nand_set_flash_node(nand, np);
+       nand_set_controller_data(nand, nfc);
+
+       nand->options |= NAND_USE_BOUNCE_BUFFER | NAND_SUBPAGE_READ;
+       nand->dev_ready = mtk_nfc_dev_ready;
+       nand->select_chip = mtk_nfc_select_chip;
+       nand->write_byte = mtk_nfc_write_byte;
+       nand->write_buf = mtk_nfc_write_buf;
+       nand->read_byte = mtk_nfc_read_byte;
+       nand->read_buf = mtk_nfc_read_buf;
+       nand->cmd_ctrl = mtk_nfc_cmd_ctrl;
+
+       /* set default mode in case dt entry is missing */
+       nand->ecc.mode = NAND_ECC_HW;
+
+       nand->ecc.write_subpage = mtk_nfc_write_subpage_hwecc;
+       nand->ecc.write_page_raw = mtk_nfc_write_page_raw;
+       nand->ecc.write_page = mtk_nfc_write_page_hwecc;
+       nand->ecc.write_oob_raw = mtk_nfc_write_oob_std;
+       nand->ecc.write_oob = mtk_nfc_write_oob_std;
+
+       nand->ecc.read_subpage = mtk_nfc_read_subpage_hwecc;
+       nand->ecc.read_page_raw = mtk_nfc_read_page_raw;
+       nand->ecc.read_page = mtk_nfc_read_page_hwecc;
+       nand->ecc.read_oob_raw = mtk_nfc_read_oob_std;
+       nand->ecc.read_oob = mtk_nfc_read_oob_std;
+
+       mtd = nand_to_mtd(nand);
+       mtd->owner = THIS_MODULE;
+       mtd->dev.parent = dev;
+       mtd->name = MTK_NAME;
+       mtd_set_ooblayout(mtd, &mtk_nfc_ooblayout_ops);
+
+       mtk_nfc_hw_init(nfc);
+
+       ret = nand_scan_ident(mtd, nsels, NULL);
+       if (ret)
+               return -ENODEV;
+
+       /* store the bbt magic in the page, because the OOB is not protected */
+       if (nand->bbt_options & NAND_BBT_USE_FLASH)
+               nand->bbt_options |= NAND_BBT_NO_OOB;
+
+       ret = mtk_nfc_ecc_init(dev, mtd);
+       if (ret)
+               return -EINVAL;
+
+       if (nand->options & NAND_BUSWIDTH_16) {
+               dev_err(dev, "16bits buswidth not supported");
+               return -EINVAL;
+       }
+
+       mtk_nfc_set_spare_per_sector(&chip->spare_per_sector, mtd);
+       mtk_nfc_set_fdm(&chip->fdm, mtd);
+       mtk_nfc_set_bad_mark_ctl(&chip->bad_mark, mtd);
+
+       len = mtd->writesize + mtd->oobsize;
+       nfc->buffer = devm_kzalloc(dev, len, GFP_KERNEL);
+       if (!nfc->buffer)
+               return -ENOMEM;
+
+       ret = nand_scan_tail(mtd);
+       if (ret)
+               return -ENODEV;
+
+       ret = mtd_device_parse_register(mtd, NULL, NULL, NULL, 0);
+       if (ret) {
+               dev_err(dev, "mtd parse partition error\n");
+               nand_release(mtd);
+               return ret;
+       }
+
+       list_add_tail(&chip->node, &nfc->chips);
+
+       return 0;
+}
+
+static int mtk_nfc_nand_chips_init(struct device *dev, struct mtk_nfc *nfc)
+{
+       struct device_node *np = dev->of_node;
+       struct device_node *nand_np;
+       int ret;
+
+       for_each_child_of_node(np, nand_np) {
+               ret = mtk_nfc_nand_chip_init(dev, nfc, nand_np);
+               if (ret) {
+                       of_node_put(nand_np);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+static int mtk_nfc_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct device_node *np = dev->of_node;
+       struct mtk_nfc *nfc;
+       struct resource *res;
+       int ret, irq;
+
+       nfc = devm_kzalloc(dev, sizeof(*nfc), GFP_KERNEL);
+       if (!nfc)
+               return -ENOMEM;
+
+       spin_lock_init(&nfc->controller.lock);
+       init_waitqueue_head(&nfc->controller.wq);
+       INIT_LIST_HEAD(&nfc->chips);
+
+       /* defer probing if the ECC engine is not ready yet */
+       nfc->ecc = of_mtk_ecc_get(np);
+       if (IS_ERR(nfc->ecc))
+               return PTR_ERR(nfc->ecc);
+       else if (!nfc->ecc)
+               return -ENODEV;
+
+       nfc->dev = dev;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       nfc->regs = devm_ioremap_resource(dev, res);
+       if (IS_ERR(nfc->regs)) {
+               ret = PTR_ERR(nfc->regs);
+               dev_err(dev, "no nfi base\n");
+               goto release_ecc;
+       }
+
+       nfc->clk.nfi_clk = devm_clk_get(dev, "nfi_clk");
+       if (IS_ERR(nfc->clk.nfi_clk)) {
+               dev_err(dev, "no clk\n");
+               ret = PTR_ERR(nfc->clk.nfi_clk);
+               goto release_ecc;
+       }
+
+       nfc->clk.pad_clk = devm_clk_get(dev, "pad_clk");
+       if (IS_ERR(nfc->clk.pad_clk)) {
+               dev_err(dev, "no pad clk\n");
+               ret = PTR_ERR(nfc->clk.pad_clk);
+               goto release_ecc;
+       }
+
+       ret = mtk_nfc_enable_clk(dev, &nfc->clk);
+       if (ret)
+               goto release_ecc;
+
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0) {
+               dev_err(dev, "no nfi irq resource\n");
+               ret = -EINVAL;
+               goto clk_disable;
+       }
+
+       ret = devm_request_irq(dev, irq, mtk_nfc_irq, 0x0, "mtk-nand", nfc);
+       if (ret) {
+               dev_err(dev, "failed to request nfi irq\n");
+               goto clk_disable;
+       }
+
+       ret = dma_set_mask(dev, DMA_BIT_MASK(32));
+       if (ret) {
+               dev_err(dev, "failed to set dma mask\n");
+               goto clk_disable;
+       }
+
+       platform_set_drvdata(pdev, nfc);
+
+       ret = mtk_nfc_nand_chips_init(dev, nfc);
+       if (ret) {
+               dev_err(dev, "failed to init nand chips\n");
+               goto clk_disable;
+       }
+
+       return 0;
+
+clk_disable:
+       mtk_nfc_disable_clk(&nfc->clk);
+
+release_ecc:
+       mtk_ecc_release(nfc->ecc);
+
+       return ret;
+}
+
+static int mtk_nfc_remove(struct platform_device *pdev)
+{
+       struct mtk_nfc *nfc = platform_get_drvdata(pdev);
+       struct mtk_nfc_nand_chip *chip;
+
+       while (!list_empty(&nfc->chips)) {
+               chip = list_first_entry(&nfc->chips, struct mtk_nfc_nand_chip,
+                                       node);
+               nand_release(nand_to_mtd(&chip->nand));
+               list_del(&chip->node);
+       }
+
+       mtk_ecc_release(nfc->ecc);
+       mtk_nfc_disable_clk(&nfc->clk);
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int mtk_nfc_suspend(struct device *dev)
+{
+       struct mtk_nfc *nfc = dev_get_drvdata(dev);
+
+       mtk_nfc_disable_clk(&nfc->clk);
+
+       return 0;
+}
+
+static int mtk_nfc_resume(struct device *dev)
+{
+       struct mtk_nfc *nfc = dev_get_drvdata(dev);
+       struct mtk_nfc_nand_chip *chip;
+       struct nand_chip *nand;
+       struct mtd_info *mtd;
+       int ret;
+       u32 i;
+
+       udelay(200);
+
+       ret = mtk_nfc_enable_clk(dev, &nfc->clk);
+       if (ret)
+               return ret;
+
+       mtk_nfc_hw_init(nfc);
+
+       /* reset NAND chip if VCC was powered off */
+       list_for_each_entry(chip, &nfc->chips, node) {
+               nand = &chip->nand;
+               mtd = nand_to_mtd(nand);
+               for (i = 0; i < chip->nsels; i++) {
+                       nand->select_chip(mtd, i);
+                       nand->cmdfunc(mtd, NAND_CMD_RESET, -1, -1);
+               }
+       }
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(mtk_nfc_pm_ops, mtk_nfc_suspend, mtk_nfc_resume);
+#endif
+
+static const struct of_device_id mtk_nfc_id_table[] = {
+       { .compatible = "mediatek,mt2701-nfc" },
+       {}
+};
+MODULE_DEVICE_TABLE(of, mtk_nfc_id_table);
+
+static struct platform_driver mtk_nfc_driver = {
+       .probe  = mtk_nfc_probe,
+       .remove = mtk_nfc_remove,
+       .driver = {
+               .name  = MTK_NAME,
+               .of_match_table = mtk_nfc_id_table,
+#ifdef CONFIG_PM_SLEEP
+               .pm = &mtk_nfc_pm_ops,
+#endif
+       },
+};
+
+module_platform_driver(mtk_nfc_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Xiaolei Li <xiaolei.li@mediatek.com>");
+MODULE_DESCRIPTION("MTK Nand Flash Controller Driver");
index 0b0dc29..77533f7 100644 (file)
@@ -2610,7 +2610,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to,
                int cached = writelen > bytes && page != blockmask;
                uint8_t *wbuf = buf;
                int use_bufpoi;
-               int part_pagewr = (column || writelen < (mtd->writesize - 1));
+               int part_pagewr = (column || writelen < mtd->writesize);
 
                if (part_pagewr)
                        use_bufpoi = 1;
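The tightened check matters for writes that are one byte short of a full page: with a 2048-byte page, a 2047-byte write at column 0 satisfied neither "column" nor "writelen < 2047" under the old test, so it was treated as a full-page write and bypassed the bounce buffer; with "writelen < mtd->writesize" every write shorter than the page is now classified as partial.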
index ccc05f5..2af9869 100644 (file)
@@ -168,6 +168,7 @@ struct nand_flash_dev nand_flash_ids[] = {
 /* Manufacturer IDs */
 struct nand_manufacturers nand_manuf_ids[] = {
        {NAND_MFR_TOSHIBA, "Toshiba"},
+       {NAND_MFR_ESMT, "ESMT"},
        {NAND_MFR_SAMSUNG, "Samsung"},
        {NAND_MFR_FUJITSU, "Fujitsu"},
        {NAND_MFR_NATIONAL, "National"},
index a136da8..a59361c 100644 (file)
 #define        PREFETCH_STATUS_FIFO_CNT(val)   ((val >> 24) & 0x7F)
 #define        STATUS_BUFF_EMPTY               0x00000001
 
-#define OMAP24XX_DMA_GPMC              4
-
 #define SECTOR_BYTES           512
 /* 4 bit padding to make byte aligned, 56 = 52 + 4 */
 #define BCH4_BIT_PAD           4
@@ -1811,7 +1809,6 @@ static int omap_nand_probe(struct platform_device *pdev)
        struct nand_chip                *nand_chip;
        int                             err;
        dma_cap_mask_t                  mask;
-       unsigned                        sig;
        struct resource                 *res;
        struct device                   *dev = &pdev->dev;
        int                             min_oobbytes = BADBLOCK_MARKER_LENGTH;
@@ -1924,11 +1921,11 @@ static int omap_nand_probe(struct platform_device *pdev)
        case NAND_OMAP_PREFETCH_DMA:
                dma_cap_zero(mask);
                dma_cap_set(DMA_SLAVE, mask);
-               sig = OMAP24XX_DMA_GPMC;
-               info->dma = dma_request_channel(mask, omap_dma_filter_fn, &sig);
-               if (!info->dma) {
+               info->dma = dma_request_chan(pdev->dev.parent, "rxtx");
+
+               if (IS_ERR(info->dma)) {
                        dev_err(&pdev->dev, "DMA engine request failed\n");
-                       err = -ENXIO;
+                       err = PTR_ERR(info->dma);
                        goto return_error;
                } else {
                        struct dma_slave_config cfg;
index a83a690..e414b31 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/gpio.h>
 #include <linux/interrupt.h>
 #include <linux/iopoll.h>
+#include <linux/reset.h>
 
 #define NFC_REG_CTL            0x0000
 #define NFC_REG_ST             0x0004
 
 /* define bit use in NFC_ECC_ST */
 #define NFC_ECC_ERR(x)         BIT(x)
+#define NFC_ECC_ERR_MSK                GENMASK(15, 0)
 #define NFC_ECC_PAT_FOUND(x)   BIT(x + 16)
 #define NFC_ECC_ERR_CNT(b, x)  (((x) >> (((b) % 4) * 8)) & 0xff)
 
@@ -269,10 +271,12 @@ struct sunxi_nfc {
        void __iomem *regs;
        struct clk *ahb_clk;
        struct clk *mod_clk;
+       struct reset_control *reset;
        unsigned long assigned_cs;
        unsigned long clk_rate;
        struct list_head chips;
        struct completion complete;
+       struct dma_chan *dmac;
 };
 
 static inline struct sunxi_nfc *to_sunxi_nfc(struct nand_hw_control *ctrl)
@@ -365,6 +369,67 @@ static int sunxi_nfc_rst(struct sunxi_nfc *nfc)
        return ret;
 }
 
+static int sunxi_nfc_dma_op_prepare(struct mtd_info *mtd, const void *buf,
+                                   int chunksize, int nchunks,
+                                   enum dma_data_direction ddir,
+                                   struct scatterlist *sg)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+       struct dma_async_tx_descriptor *dmad;
+       enum dma_transfer_direction tdir;
+       dma_cookie_t dmat;
+       int ret;
+
+       if (ddir == DMA_FROM_DEVICE)
+               tdir = DMA_DEV_TO_MEM;
+       else
+               tdir = DMA_MEM_TO_DEV;
+
+       sg_init_one(sg, buf, nchunks * chunksize);
+       ret = dma_map_sg(nfc->dev, sg, 1, ddir);
+       if (!ret)
+               return -ENOMEM;
+
+       dmad = dmaengine_prep_slave_sg(nfc->dmac, sg, 1, tdir, DMA_CTRL_ACK);
+       if (!dmad) {
+               ret = -EINVAL;
+               goto err_unmap_buf;
+       }
+
+       writel(readl(nfc->regs + NFC_REG_CTL) | NFC_RAM_METHOD,
+              nfc->regs + NFC_REG_CTL);
+       writel(nchunks, nfc->regs + NFC_REG_SECTOR_NUM);
+       writel(chunksize, nfc->regs + NFC_REG_CNT);
+       dmat = dmaengine_submit(dmad);
+
+       ret = dma_submit_error(dmat);
+       if (ret)
+               goto err_clr_dma_flag;
+
+       return 0;
+
+err_clr_dma_flag:
+       writel(readl(nfc->regs + NFC_REG_CTL) & ~NFC_RAM_METHOD,
+              nfc->regs + NFC_REG_CTL);
+
+err_unmap_buf:
+       dma_unmap_sg(nfc->dev, sg, 1, ddir);
+       return ret;
+}
+
+static void sunxi_nfc_dma_op_cleanup(struct mtd_info *mtd,
+                                    enum dma_data_direction ddir,
+                                    struct scatterlist *sg)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+
+       dma_unmap_sg(nfc->dev, sg, 1, ddir);
+       writel(readl(nfc->regs + NFC_REG_CTL) & ~NFC_RAM_METHOD,
+              nfc->regs + NFC_REG_CTL);
+}
+
 static int sunxi_nfc_dev_ready(struct mtd_info *mtd)
 {
        struct nand_chip *nand = mtd_to_nand(mtd);
@@ -822,17 +887,15 @@ static void sunxi_nfc_hw_ecc_update_stats(struct mtd_info *mtd,
 }
 
 static int sunxi_nfc_hw_ecc_correct(struct mtd_info *mtd, u8 *data, u8 *oob,
-                                   int step, bool *erased)
+                                   int step, u32 status, bool *erased)
 {
        struct nand_chip *nand = mtd_to_nand(mtd);
        struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
        struct nand_ecc_ctrl *ecc = &nand->ecc;
-       u32 status, tmp;
+       u32 tmp;
 
        *erased = false;
 
-       status = readl(nfc->regs + NFC_REG_ECC_ST);
-
        if (status & NFC_ECC_ERR(step))
                return -EBADMSG;
 
@@ -898,6 +961,7 @@ static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd,
        *cur_off = oob_off + ecc->bytes + 4;
 
        ret = sunxi_nfc_hw_ecc_correct(mtd, data, oob_required ? oob : NULL, 0,
+                                      readl(nfc->regs + NFC_REG_ECC_ST),
                                       &erased);
        if (erased)
                return 1;
@@ -967,6 +1031,130 @@ static void sunxi_nfc_hw_ecc_read_extra_oob(struct mtd_info *mtd,
                *cur_off = mtd->oobsize + mtd->writesize;
 }
 
+static int sunxi_nfc_hw_ecc_read_chunks_dma(struct mtd_info *mtd, uint8_t *buf,
+                                           int oob_required, int page,
+                                           int nchunks)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       bool randomized = nand->options & NAND_NEED_SCRAMBLING;
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+       struct nand_ecc_ctrl *ecc = &nand->ecc;
+       unsigned int max_bitflips = 0;
+       int ret, i, raw_mode = 0;
+       struct scatterlist sg;
+       u32 status;
+
+       ret = sunxi_nfc_wait_cmd_fifo_empty(nfc);
+       if (ret)
+               return ret;
+
+       ret = sunxi_nfc_dma_op_prepare(mtd, buf, ecc->size, nchunks,
+                                      DMA_FROM_DEVICE, &sg);
+       if (ret)
+               return ret;
+
+       sunxi_nfc_hw_ecc_enable(mtd);
+       sunxi_nfc_randomizer_config(mtd, page, false);
+       sunxi_nfc_randomizer_enable(mtd);
+
+       writel((NAND_CMD_RNDOUTSTART << 16) | (NAND_CMD_RNDOUT << 8) |
+              NAND_CMD_READSTART, nfc->regs + NFC_REG_RCMD_SET);
+
+       dma_async_issue_pending(nfc->dmac);
+
+       writel(NFC_PAGE_OP | NFC_DATA_SWAP_METHOD | NFC_DATA_TRANS,
+              nfc->regs + NFC_REG_CMD);
+
+       ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
+       if (ret)
+               dmaengine_terminate_all(nfc->dmac);
+
+       sunxi_nfc_randomizer_disable(mtd);
+       sunxi_nfc_hw_ecc_disable(mtd);
+
+       sunxi_nfc_dma_op_cleanup(mtd, DMA_FROM_DEVICE, &sg);
+
+       if (ret)
+               return ret;
+
+       status = readl(nfc->regs + NFC_REG_ECC_ST);
+
+       for (i = 0; i < nchunks; i++) {
+               int data_off = i * ecc->size;
+               int oob_off = i * (ecc->bytes + 4);
+               u8 *data = buf + data_off;
+               u8 *oob = nand->oob_poi + oob_off;
+               bool erased;
+
+               ret = sunxi_nfc_hw_ecc_correct(mtd, randomized ? data : NULL,
+                                              oob_required ? oob : NULL,
+                                              i, status, &erased);
+
+               /* ECC errors are handled in the second loop. */
+               if (ret < 0)
+                       continue;
+
+               if (oob_required && !erased) {
+                       /* TODO: use DMA to retrieve OOB */
+                       nand->cmdfunc(mtd, NAND_CMD_RNDOUT,
+                                     mtd->writesize + oob_off, -1);
+                       nand->read_buf(mtd, oob, ecc->bytes + 4);
+
+                       sunxi_nfc_hw_ecc_get_prot_oob_bytes(mtd, oob, i,
+                                                           !i, page);
+               }
+
+               if (erased)
+                       raw_mode = 1;
+
+               sunxi_nfc_hw_ecc_update_stats(mtd, &max_bitflips, ret);
+       }
+
+       if (status & NFC_ECC_ERR_MSK) {
+               for (i = 0; i < nchunks; i++) {
+                       int data_off = i * ecc->size;
+                       int oob_off = i * (ecc->bytes + 4);
+                       u8 *data = buf + data_off;
+                       u8 *oob = nand->oob_poi + oob_off;
+
+                       if (!(status & NFC_ECC_ERR(i)))
+                               continue;
+
+                       /*
+                        * Re-read the data with the randomizer disabled to
+                        * identify bitflips in erased pages.
+                        */
+                       if (randomized) {
+                               /* TODO: use DMA to read page in raw mode */
+                               nand->cmdfunc(mtd, NAND_CMD_RNDOUT,
+                                             data_off, -1);
+                               nand->read_buf(mtd, data, ecc->size);
+                       }
+
+                       /* TODO: use DMA to retrieve OOB */
+                       nand->cmdfunc(mtd, NAND_CMD_RNDOUT,
+                                     mtd->writesize + oob_off, -1);
+                       nand->read_buf(mtd, oob, ecc->bytes + 4);
+
+                       ret = nand_check_erased_ecc_chunk(data, ecc->size,
+                                                         oob, ecc->bytes + 4,
+                                                         NULL, 0,
+                                                         ecc->strength);
+                       if (ret >= 0)
+                               raw_mode = 1;
+
+                       sunxi_nfc_hw_ecc_update_stats(mtd, &max_bitflips, ret);
+               }
+       }
+
+       if (oob_required)
+               sunxi_nfc_hw_ecc_read_extra_oob(mtd, nand->oob_poi,
+                                               NULL, !raw_mode,
+                                               page);
+
+       return max_bitflips;
+}
+
 static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd,
                                        const u8 *data, int data_off,
                                        const u8 *oob, int oob_off,
@@ -1065,6 +1253,23 @@ static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd,
        return max_bitflips;
 }
 
+static int sunxi_nfc_hw_ecc_read_page_dma(struct mtd_info *mtd,
+                                         struct nand_chip *chip, u8 *buf,
+                                         int oob_required, int page)
+{
+       int ret;
+
+       ret = sunxi_nfc_hw_ecc_read_chunks_dma(mtd, buf, oob_required, page,
+                                              chip->ecc.steps);
+       if (ret >= 0)
+               return ret;
+
+       /* Fallback to PIO mode */
+       chip->cmdfunc(mtd, NAND_CMD_RNDOUT, 0, -1);
+
+       return sunxi_nfc_hw_ecc_read_page(mtd, chip, buf, oob_required, page);
+}
+
 static int sunxi_nfc_hw_ecc_read_subpage(struct mtd_info *mtd,
                                         struct nand_chip *chip,
                                         u32 data_offs, u32 readlen,
@@ -1098,6 +1303,25 @@ static int sunxi_nfc_hw_ecc_read_subpage(struct mtd_info *mtd,
        return max_bitflips;
 }
 
+static int sunxi_nfc_hw_ecc_read_subpage_dma(struct mtd_info *mtd,
+                                            struct nand_chip *chip,
+                                            u32 data_offs, u32 readlen,
+                                            u8 *buf, int page)
+{
+       int nchunks = DIV_ROUND_UP(data_offs + readlen, chip->ecc.size);
+       int ret;
+
+       ret = sunxi_nfc_hw_ecc_read_chunks_dma(mtd, buf, false, page, nchunks);
+       if (ret >= 0)
+               return ret;
+
+       /* Fallback to PIO mode */
+       chip->cmdfunc(mtd, NAND_CMD_RNDOUT, 0, -1);
+
+       return sunxi_nfc_hw_ecc_read_subpage(mtd, chip, data_offs, readlen,
+                                            buf, page);
+}
+
 static int sunxi_nfc_hw_ecc_write_page(struct mtd_info *mtd,
                                       struct nand_chip *chip,
                                       const uint8_t *buf, int oob_required,
@@ -1130,6 +1354,99 @@ static int sunxi_nfc_hw_ecc_write_page(struct mtd_info *mtd,
        return 0;
 }
 
+static int sunxi_nfc_hw_ecc_write_subpage(struct mtd_info *mtd,
+                                         struct nand_chip *chip,
+                                         u32 data_offs, u32 data_len,
+                                         const u8 *buf, int oob_required,
+                                         int page)
+{
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
+       int ret, i, cur_off = 0;
+
+       sunxi_nfc_hw_ecc_enable(mtd);
+
+       for (i = data_offs / ecc->size;
+            i < DIV_ROUND_UP(data_offs + data_len, ecc->size); i++) {
+               int data_off = i * ecc->size;
+               int oob_off = i * (ecc->bytes + 4);
+               const u8 *data = buf + data_off;
+               const u8 *oob = chip->oob_poi + oob_off;
+
+               ret = sunxi_nfc_hw_ecc_write_chunk(mtd, data, data_off, oob,
+                                                  oob_off + mtd->writesize,
+                                                  &cur_off, !i, page);
+               if (ret)
+                       return ret;
+       }
+
+       sunxi_nfc_hw_ecc_disable(mtd);
+
+       return 0;
+}
+
+static int sunxi_nfc_hw_ecc_write_page_dma(struct mtd_info *mtd,
+                                          struct nand_chip *chip,
+                                          const u8 *buf,
+                                          int oob_required,
+                                          int page)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+       struct nand_ecc_ctrl *ecc = &nand->ecc;
+       struct scatterlist sg;
+       int ret, i;
+
+       ret = sunxi_nfc_wait_cmd_fifo_empty(nfc);
+       if (ret)
+               return ret;
+
+       ret = sunxi_nfc_dma_op_prepare(mtd, buf, ecc->size, ecc->steps,
+                                      DMA_TO_DEVICE, &sg);
+       if (ret)
+               goto pio_fallback;
+
+       for (i = 0; i < ecc->steps; i++) {
+               const u8 *oob = nand->oob_poi + (i * (ecc->bytes + 4));
+
+               sunxi_nfc_hw_ecc_set_prot_oob_bytes(mtd, oob, i, !i, page);
+       }
+
+       sunxi_nfc_hw_ecc_enable(mtd);
+       sunxi_nfc_randomizer_config(mtd, page, false);
+       sunxi_nfc_randomizer_enable(mtd);
+
+       writel((NAND_CMD_RNDIN << 8) | NAND_CMD_PAGEPROG,
+              nfc->regs + NFC_REG_RCMD_SET);
+
+       dma_async_issue_pending(nfc->dmac);
+
+       writel(NFC_PAGE_OP | NFC_DATA_SWAP_METHOD |
+              NFC_DATA_TRANS | NFC_ACCESS_DIR,
+              nfc->regs + NFC_REG_CMD);
+
+       ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
+       if (ret)
+               dmaengine_terminate_all(nfc->dmac);
+
+       sunxi_nfc_randomizer_disable(mtd);
+       sunxi_nfc_hw_ecc_disable(mtd);
+
+       sunxi_nfc_dma_op_cleanup(mtd, DMA_TO_DEVICE, &sg);
+
+       if (ret)
+               return ret;
+
+       if (oob_required || (chip->options & NAND_NEED_SCRAMBLING))
+               /* TODO: use DMA to transfer extra OOB bytes ? */
+               sunxi_nfc_hw_ecc_write_extra_oob(mtd, chip->oob_poi,
+                                                NULL, page);
+
+       return 0;
+
+pio_fallback:
+       return sunxi_nfc_hw_ecc_write_page(mtd, chip, buf, oob_required, page);
+}
+
 static int sunxi_nfc_hw_syndrome_ecc_read_page(struct mtd_info *mtd,
                                               struct nand_chip *chip,
                                               uint8_t *buf, int oob_required,
@@ -1497,10 +1814,19 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
        int ret;
        int i;
 
+       if (ecc->size != 512 && ecc->size != 1024)
+               return -EINVAL;
+
        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return -ENOMEM;
 
+       /* Prefer 1k ECC chunks over 512 byte ones */
+       if (ecc->size == 512 && mtd->writesize > 512) {
+               ecc->size = 1024;
+               ecc->strength *= 2;
+       }
+
        /* Add ECC info retrieval from DT */
        for (i = 0; i < ARRAY_SIZE(strengths); i++) {
                if (ecc->strength <= strengths[i])
@@ -1550,14 +1876,28 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct mtd_info *mtd,
                                       struct nand_ecc_ctrl *ecc,
                                       struct device_node *np)
 {
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller);
        int ret;
 
        ret = sunxi_nand_hw_common_ecc_ctrl_init(mtd, ecc, np);
        if (ret)
                return ret;
 
-       ecc->read_page = sunxi_nfc_hw_ecc_read_page;
-       ecc->write_page = sunxi_nfc_hw_ecc_write_page;
+       if (nfc->dmac) {
+               ecc->read_page = sunxi_nfc_hw_ecc_read_page_dma;
+               ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage_dma;
+               ecc->write_page = sunxi_nfc_hw_ecc_write_page_dma;
+               nand->options |= NAND_USE_BOUNCE_BUFFER;
+       } else {
+               ecc->read_page = sunxi_nfc_hw_ecc_read_page;
+               ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage;
+               ecc->write_page = sunxi_nfc_hw_ecc_write_page;
+       }
+
+       /* TODO: support DMA for raw accesses and subpage write */
+       ecc->write_subpage = sunxi_nfc_hw_ecc_write_subpage;
        ecc->read_oob_raw = nand_read_oob_std;
        ecc->write_oob_raw = nand_write_oob_std;
        ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage;
@@ -1871,26 +2211,59 @@ static int sunxi_nfc_probe(struct platform_device *pdev)
        if (ret)
                goto out_ahb_clk_unprepare;
 
+       nfc->reset = devm_reset_control_get_optional(dev, "ahb");
+       if (!IS_ERR(nfc->reset)) {
+               ret = reset_control_deassert(nfc->reset);
+               if (ret) {
+                       dev_err(dev, "reset err %d\n", ret);
+                       goto out_mod_clk_unprepare;
+               }
+       } else if (PTR_ERR(nfc->reset) != -ENOENT) {
+               ret = PTR_ERR(nfc->reset);
+               goto out_mod_clk_unprepare;
+       }
+
        ret = sunxi_nfc_rst(nfc);
        if (ret)
-               goto out_mod_clk_unprepare;
+               goto out_ahb_reset_reassert;
 
        writel(0, nfc->regs + NFC_REG_INT);
        ret = devm_request_irq(dev, irq, sunxi_nfc_interrupt,
                               0, "sunxi-nand", nfc);
        if (ret)
-               goto out_mod_clk_unprepare;
+               goto out_ahb_reset_reassert;
+
+       nfc->dmac = dma_request_slave_channel(dev, "rxtx");
+       if (nfc->dmac) {
+               struct dma_slave_config dmac_cfg = { };
+
+               dmac_cfg.src_addr = r->start + NFC_REG_IO_DATA;
+               dmac_cfg.dst_addr = dmac_cfg.src_addr;
+               dmac_cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+               dmac_cfg.dst_addr_width = dmac_cfg.src_addr_width;
+               dmac_cfg.src_maxburst = 4;
+               dmac_cfg.dst_maxburst = 4;
+               dmaengine_slave_config(nfc->dmac, &dmac_cfg);
+       } else {
+               dev_warn(dev, "failed to request rxtx DMA channel\n");
+       }
 
        platform_set_drvdata(pdev, nfc);
 
        ret = sunxi_nand_chips_init(dev, nfc);
        if (ret) {
                dev_err(dev, "failed to init nand chips\n");
-               goto out_mod_clk_unprepare;
+               goto out_release_dmac;
        }
 
        return 0;
 
+out_release_dmac:
+       if (nfc->dmac)
+               dma_release_channel(nfc->dmac);
+out_ahb_reset_reassert:
+       if (!IS_ERR(nfc->reset))
+               reset_control_assert(nfc->reset);
 out_mod_clk_unprepare:
        clk_disable_unprepare(nfc->mod_clk);
 out_ahb_clk_unprepare:
@@ -1904,6 +2277,12 @@ static int sunxi_nfc_remove(struct platform_device *pdev)
        struct sunxi_nfc *nfc = platform_get_drvdata(pdev);
 
        sunxi_nand_chips_cleanup(nfc);
+
+       if (!IS_ERR(nfc->reset))
+               reset_control_assert(nfc->reset);
+
+       if (nfc->dmac)
+               dma_release_channel(nfc->dmac);
        clk_disable_unprepare(nfc->mod_clk);
        clk_disable_unprepare(nfc->ahb_clk);
 
index 0cf0ac0..1f2948c 100644 (file)
@@ -4,6 +4,7 @@
  *  by the Free Software Foundation.
  *
  *  Copyright © 2012 John Crispin <blogic@openwrt.org>
+ *  Copyright © 2016 Hauke Mehrtens <hauke@hauke-m.de>
  */
 
 #include <linux/mtd/nand.h>
 #define EBU_ADDSEL1            0x24
 #define EBU_NAND_CON           0xB0
 #define EBU_NAND_WAIT          0xB4
+#define  NAND_WAIT_RD          BIT(0) /* NAND flash status output */
+#define  NAND_WAIT_WR_C                BIT(3) /* NAND Write/Read complete */
 #define EBU_NAND_ECC0          0xB8
 #define EBU_NAND_ECC_AC                0xBC
 
-/* nand commands */
-#define NAND_CMD_ALE           (1 << 2)
-#define NAND_CMD_CLE           (1 << 3)
-#define NAND_CMD_CS            (1 << 4)
-#define NAND_WRITE_CMD_RESET   0xff
+/*
+ * nand commands
+ * The NAND chip's control pins are selected through the address bits of the
+ * "register" read or write: there are no real registers, only an address
+ * range whose lower address bits drive the corresponding control lines.
+ * For example, when bit (1 << 2) is set in the address, the ALE pin is
+ * asserted.
+ */
+#define NAND_CMD_ALE           BIT(2) /* address latch enable */
+#define NAND_CMD_CLE           BIT(3) /* command latch enable */
+#define NAND_CMD_CS            BIT(4) /* chip select */
+#define NAND_CMD_SE            BIT(5) /* spare area access latch */
+#define NAND_CMD_WP            BIT(6) /* write protect */
 #define NAND_WRITE_CMD         (NAND_CMD_CS | NAND_CMD_CLE)
 #define NAND_WRITE_ADDR                (NAND_CMD_CS | NAND_CMD_ALE)
 #define NAND_WRITE_DATA                (NAND_CMD_CS)
 #define NAND_READ_DATA         (NAND_CMD_CS)
-#define NAND_WAIT_WR_C         (1 << 3)
-#define NAND_WAIT_RD           (0x1)
 
 /* we need to tell the ebu which addr we mapped the nand to */
 #define ADDSEL1_MASK(x)                (x << 4)
 #define NAND_CON_CSMUX         (1 << 1)
 #define NAND_CON_NANDM         1
 
-static void xway_reset_chip(struct nand_chip *chip)
+struct xway_nand_data {
+       struct nand_chip        chip;
+       unsigned long           csflags;
+       void __iomem            *nandaddr;
+};
+
+static u8 xway_readb(struct mtd_info *mtd, int op)
 {
-       unsigned long nandaddr = (unsigned long) chip->IO_ADDR_W;
-       unsigned long flags;
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct xway_nand_data *data = nand_get_controller_data(chip);
 
-       nandaddr &= ~NAND_WRITE_ADDR;
-       nandaddr |= NAND_WRITE_CMD;
+       return readb(data->nandaddr + op);
+}
 
-       /* finish with a reset */
-       spin_lock_irqsave(&ebu_lock, flags);
-       writeb(NAND_WRITE_CMD_RESET, (void __iomem *) nandaddr);
-       while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0)
-               ;
-       spin_unlock_irqrestore(&ebu_lock, flags);
+static void xway_writeb(struct mtd_info *mtd, int op, u8 value)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct xway_nand_data *data = nand_get_controller_data(chip);
+
+       writeb(value, data->nandaddr + op);
 }
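Because xway_readb() and xway_writeb() simply add the op value to the mapped EBU window, the NAND_CMD_* bit definitions above translate directly into byte offsets, i.e. into which control lines are asserted during the access. Working this out from the defines shown earlier (illustration only):

    NAND_WRITE_CMD  = NAND_CMD_CS | NAND_CMD_CLE = BIT(4) | BIT(3) = 0x18
    NAND_WRITE_ADDR = NAND_CMD_CS | NAND_CMD_ALE = BIT(4) | BIT(2) = 0x14
    NAND_WRITE_DATA = NAND_READ_DATA = NAND_CMD_CS = BIT(4)        = 0x10

so xway_writeb(mtd, NAND_WRITE_CMD, cmd) resolves to writeb(cmd, data->nandaddr + 0x18): a write into the EBU window with the CS and CLE address lines set, which latches the byte as a command on the chip.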
 
-static void xway_select_chip(struct mtd_info *mtd, int chip)
+static void xway_select_chip(struct mtd_info *mtd, int select)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct xway_nand_data *data = nand_get_controller_data(chip);
 
-       switch (chip) {
+       switch (select) {
        case -1:
                ltq_ebu_w32_mask(NAND_CON_CE, 0, EBU_NAND_CON);
                ltq_ebu_w32_mask(NAND_CON_NANDM, 0, EBU_NAND_CON);
+               spin_unlock_irqrestore(&ebu_lock, data->csflags);
                break;
        case 0:
+               spin_lock_irqsave(&ebu_lock, data->csflags);
                ltq_ebu_w32_mask(0, NAND_CON_NANDM, EBU_NAND_CON);
                ltq_ebu_w32_mask(0, NAND_CON_CE, EBU_NAND_CON);
                break;
@@ -89,26 +108,16 @@ static void xway_select_chip(struct mtd_info *mtd, int chip)
 
 static void xway_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl)
 {
-       struct nand_chip *this = mtd_to_nand(mtd);
-       unsigned long nandaddr = (unsigned long) this->IO_ADDR_W;
-       unsigned long flags;
-
-       if (ctrl & NAND_CTRL_CHANGE) {
-               nandaddr &= ~(NAND_WRITE_CMD | NAND_WRITE_ADDR);
-               if (ctrl & NAND_CLE)
-                       nandaddr |= NAND_WRITE_CMD;
-               else
-                       nandaddr |= NAND_WRITE_ADDR;
-               this->IO_ADDR_W = (void __iomem *) nandaddr;
-       }
+       if (cmd == NAND_CMD_NONE)
+               return;
 
-       if (cmd != NAND_CMD_NONE) {
-               spin_lock_irqsave(&ebu_lock, flags);
-               writeb(cmd, this->IO_ADDR_W);
-               while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0)
-                       ;
-               spin_unlock_irqrestore(&ebu_lock, flags);
-       }
+       if (ctrl & NAND_CLE)
+               xway_writeb(mtd, NAND_WRITE_CMD, cmd);
+       else if (ctrl & NAND_ALE)
+               xway_writeb(mtd, NAND_WRITE_ADDR, cmd);
+
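+       /* busy-wait until the EBU reports write/read complete (WR_C) */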
+       while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0)
+               ;
 }
 
 static int xway_dev_ready(struct mtd_info *mtd)
@@ -118,80 +127,122 @@ static int xway_dev_ready(struct mtd_info *mtd)
 
 static unsigned char xway_read_byte(struct mtd_info *mtd)
 {
-       struct nand_chip *this = mtd_to_nand(mtd);
-       unsigned long nandaddr = (unsigned long) this->IO_ADDR_R;
-       unsigned long flags;
-       int ret;
+       return xway_readb(mtd, NAND_READ_DATA);
+}
+
+static void xway_read_buf(struct mtd_info *mtd, u_char *buf, int len)
+{
+       int i;
 
-       spin_lock_irqsave(&ebu_lock, flags);
-       ret = ltq_r8((void __iomem *)(nandaddr + NAND_READ_DATA));
-       spin_unlock_irqrestore(&ebu_lock, flags);
+       for (i = 0; i < len; i++)
+               buf[i] = xway_readb(mtd, NAND_WRITE_DATA);
+}
 
-       return ret;
+static void xway_write_buf(struct mtd_info *mtd, const u_char *buf, int len)
+{
+       int i;
+
+       for (i = 0; i < len; i++)
+               xway_writeb(mtd, NAND_WRITE_DATA, buf[i]);
 }
 
+/*
+ * Probe for the NAND device.
+ */
 static int xway_nand_probe(struct platform_device *pdev)
 {
-       struct nand_chip *this = platform_get_drvdata(pdev);
-       unsigned long nandaddr = (unsigned long) this->IO_ADDR_W;
-       const __be32 *cs = of_get_property(pdev->dev.of_node,
-                                       "lantiq,cs", NULL);
+       struct xway_nand_data *data;
+       struct mtd_info *mtd;
+       struct resource *res;
+       int err;
+       u32 cs;
        u32 cs_flag = 0;
 
+       /* Allocate memory for the device structure (and zero it) */
+       data = devm_kzalloc(&pdev->dev, sizeof(struct xway_nand_data),
+                           GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       data->nandaddr = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(data->nandaddr))
+               return PTR_ERR(data->nandaddr);
+
+       nand_set_flash_node(&data->chip, pdev->dev.of_node);
+       mtd = nand_to_mtd(&data->chip);
+       mtd->dev.parent = &pdev->dev;
+
+       data->chip.cmd_ctrl = xway_cmd_ctrl;
+       data->chip.dev_ready = xway_dev_ready;
+       data->chip.select_chip = xway_select_chip;
+       data->chip.write_buf = xway_write_buf;
+       data->chip.read_buf = xway_read_buf;
+       data->chip.read_byte = xway_read_byte;
+       data->chip.chip_delay = 30;
+
+       data->chip.ecc.mode = NAND_ECC_SOFT;
+       data->chip.ecc.algo = NAND_ECC_HAMMING;
+
+       platform_set_drvdata(pdev, data);
+       nand_set_controller_data(&data->chip, data);
+
        /* load our CS from the DT. Either we find a valid 1 or default to 0 */
-       if (cs && (*cs == 1))
+       err = of_property_read_u32(pdev->dev.of_node, "lantiq,cs", &cs);
+       if (!err && cs == 1)
                cs_flag = NAND_CON_IN_CS1 | NAND_CON_OUT_CS1;
 
        /* setup the EBU to run in NAND mode on our base addr */
-       ltq_ebu_w32(CPHYSADDR(nandaddr)
-               | ADDSEL1_MASK(3) | ADDSEL1_REGEN, EBU_ADDSEL1);
+       ltq_ebu_w32(CPHYSADDR(data->nandaddr)
+                   | ADDSEL1_MASK(3) | ADDSEL1_REGEN, EBU_ADDSEL1);
 
        ltq_ebu_w32(BUSCON1_SETUP | BUSCON1_BCGEN_RES | BUSCON1_WAITWRC2
-               | BUSCON1_WAITRDC2 | BUSCON1_HOLDC1 | BUSCON1_RECOVC1
-               | BUSCON1_CMULT4, LTQ_EBU_BUSCON1);
+                   | BUSCON1_WAITRDC2 | BUSCON1_HOLDC1 | BUSCON1_RECOVC1
+                   | BUSCON1_CMULT4, LTQ_EBU_BUSCON1);
 
        ltq_ebu_w32(NAND_CON_NANDM | NAND_CON_CSMUX | NAND_CON_CS_P
-               | NAND_CON_SE_P | NAND_CON_WP_P | NAND_CON_PRE_P
-               | cs_flag, EBU_NAND_CON);
+                   | NAND_CON_SE_P | NAND_CON_WP_P | NAND_CON_PRE_P
+                   | cs_flag, EBU_NAND_CON);
 
-       /* finish with a reset */
-       xway_reset_chip(this);
+       /* Scan to find existence of the device */
+       err = nand_scan(mtd, 1);
+       if (err)
+               return err;
 
-       return 0;
-}
+       err = mtd_device_register(mtd, NULL, 0);
+       if (err)
+               nand_release(mtd);
 
-static struct platform_nand_data xway_nand_data = {
-       .chip = {
-               .nr_chips               = 1,
-               .chip_delay             = 30,
-       },
-       .ctrl = {
-               .probe          = xway_nand_probe,
-               .cmd_ctrl       = xway_cmd_ctrl,
-               .dev_ready      = xway_dev_ready,
-               .select_chip    = xway_select_chip,
-               .read_byte      = xway_read_byte,
-       }
-};
+       return err;
+}
 
 /*
- * Try to find the node inside the DT. If it is available attach out
- * platform_nand_data
+ * Remove a NAND device.
  */
-static int __init xway_register_nand(void)
+static int xway_nand_remove(struct platform_device *pdev)
 {
-       struct device_node *node;
-       struct platform_device *pdev;
-
-       node = of_find_compatible_node(NULL, NULL, "lantiq,nand-xway");
-       if (!node)
-               return -ENOENT;
-       pdev = of_find_device_by_node(node);
-       if (!pdev)
-               return -EINVAL;
-       pdev->dev.platform_data = &xway_nand_data;
-       of_node_put(node);
+       struct xway_nand_data *data = platform_get_drvdata(pdev);
+
+       nand_release(nand_to_mtd(&data->chip));
+
        return 0;
 }
 
-subsys_initcall(xway_register_nand);
+static const struct of_device_id xway_nand_match[] = {
+       { .compatible = "lantiq,nand-xway" },
+       {},
+};
+MODULE_DEVICE_TABLE(of, xway_nand_match);
+
+static struct platform_driver xway_nand_driver = {
+       .probe  = xway_nand_probe,
+       .remove = xway_nand_remove,
+       .driver = {
+               .name           = "lantiq,nand-xway",
+               .of_match_table = xway_nand_match,
+       },
+};
+
+module_platform_driver(xway_nand_driver);
+
+MODULE_LICENSE("GPL");
index a4b029a..1a6d0e3 100644 (file)
@@ -3188,13 +3188,13 @@ static int onenand_otp_walk(struct mtd_info *mtd, loff_t from, size_t len,
                        size_t tmp_retlen;
 
                        ret = action(mtd, from, len, &tmp_retlen, buf);
+                       if (ret)
+                               break;
 
                        buf += tmp_retlen;
                        len -= tmp_retlen;
                        *retlen += tmp_retlen;
 
-                       if (ret)
-                               break;
                }
                otp_pages--;
        }
index d42c98e..4a682ee 100644 (file)
@@ -29,6 +29,26 @@ config MTD_SPI_NOR_USE_4K_SECTORS
          Please note that some tools/drivers/filesystems may not work with
          4096 B erase size (e.g. UBIFS requires 15 KiB as a minimum).
 
+config SPI_ATMEL_QUADSPI
+       tristate "Atmel Quad SPI Controller"
+       depends on ARCH_AT91 || (ARM && COMPILE_TEST)
+       depends on OF && HAS_IOMEM
+       help
+         This enables support for the Quad SPI controller in master mode.
+         This driver does not support generic SPI. The implementation only
+         supports SPI NOR.
+
+config SPI_CADENCE_QUADSPI
+       tristate "Cadence Quad SPI controller"
+       depends on OF && ARM
+       help
+         Enable support for the Cadence Quad SPI Flash controller.
+
+         Cadence QSPI is a specialized controller for connecting an SPI
+         Flash over a 1/2/4-bit wide bus. Enable this option if you have a
+         device with a Cadence QSPI controller and want to access the
+         Flash as an MTD device.
+
 config SPI_FSL_QUADSPI
        tristate "Freescale Quad SPI controller"
        depends on ARCH_MXC || SOC_LS1021A || ARCH_LAYERSCAPE || COMPILE_TEST
@@ -38,6 +58,13 @@ config SPI_FSL_QUADSPI
          This controller does not support generic SPI. It only supports
          SPI NOR.
 
+config SPI_HISI_SFC
+       tristate "Hisilicon SPI-NOR Flash Controller(SFC)"
+       depends on ARCH_HISI || COMPILE_TEST
+       depends on HAS_IOMEM && HAS_DMA
+       help
+         This enables support for the HiSilicon SPI-NOR flash controller.
+
 config SPI_NXP_SPIFI
        tristate "NXP SPI Flash Interface (SPIFI)"
        depends on OF && (ARCH_LPC18XX || COMPILE_TEST)
index 0bf3a7f..121695e 100644 (file)
@@ -1,4 +1,7 @@
 obj-$(CONFIG_MTD_SPI_NOR)      += spi-nor.o
+obj-$(CONFIG_SPI_ATMEL_QUADSPI)        += atmel-quadspi.o
+obj-$(CONFIG_SPI_CADENCE_QUADSPI)      += cadence-quadspi.o
 obj-$(CONFIG_SPI_FSL_QUADSPI)  += fsl-quadspi.o
+obj-$(CONFIG_SPI_HISI_SFC)     += hisi-sfc.o
 obj-$(CONFIG_MTD_MT81xx_NOR)    += mtk-quadspi.o
 obj-$(CONFIG_SPI_NXP_SPIFI)    += nxp-spifi.o
diff --git a/drivers/mtd/spi-nor/atmel-quadspi.c b/drivers/mtd/spi-nor/atmel-quadspi.c
new file mode 100644 (file)
index 0000000..47937d9
--- /dev/null
@@ -0,0 +1,732 @@
+/*
+ * Driver for Atmel QSPI Controller
+ *
+ * Copyright (C) 2015 Atmel Corporation
+ *
+ * Author: Cyrille Pitchen <cyrille.pitchen@atmel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This driver is based on drivers/mtd/spi-nor/fsl-quadspi.c from Freescale.
+ */
+
+#include <linux/kernel.h>
+#include <linux/clk.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/spi-nor.h>
+#include <linux/platform_data/atmel.h>
+#include <linux/of.h>
+
+#include <linux/io.h>
+#include <linux/gpio.h>
+#include <linux/pinctrl/consumer.h>
+
+/* QSPI register offsets */
+#define QSPI_CR      0x0000  /* Control Register */
+#define QSPI_MR      0x0004  /* Mode Register */
+#define QSPI_RD      0x0008  /* Receive Data Register */
+#define QSPI_TD      0x000c  /* Transmit Data Register */
+#define QSPI_SR      0x0010  /* Status Register */
+#define QSPI_IER     0x0014  /* Interrupt Enable Register */
+#define QSPI_IDR     0x0018  /* Interrupt Disable Register */
+#define QSPI_IMR     0x001c  /* Interrupt Mask Register */
+#define QSPI_SCR     0x0020  /* Serial Clock Register */
+
+#define QSPI_IAR     0x0030  /* Instruction Address Register */
+#define QSPI_ICR     0x0034  /* Instruction Code Register */
+#define QSPI_IFR     0x0038  /* Instruction Frame Register */
+
+#define QSPI_SMR     0x0040  /* Scrambling Mode Register */
+#define QSPI_SKR     0x0044  /* Scrambling Key Register */
+
+#define QSPI_WPMR    0x00E4  /* Write Protection Mode Register */
+#define QSPI_WPSR    0x00E8  /* Write Protection Status Register */
+
+#define QSPI_VERSION 0x00FC  /* Version Register */
+
+
+/* Bitfields in QSPI_CR (Control Register) */
+#define QSPI_CR_QSPIEN                  BIT(0)
+#define QSPI_CR_QSPIDIS                 BIT(1)
+#define QSPI_CR_SWRST                   BIT(7)
+#define QSPI_CR_LASTXFER                BIT(24)
+
+/* Bitfields in QSPI_MR (Mode Register) */
+#define QSPI_MR_SSM                     BIT(0)
+#define QSPI_MR_LLB                     BIT(1)
+#define QSPI_MR_WDRBT                   BIT(2)
+#define QSPI_MR_SMRM                    BIT(3)
+#define QSPI_MR_CSMODE_MASK             GENMASK(5, 4)
+#define QSPI_MR_CSMODE_NOT_RELOADED     (0 << 4)
+#define QSPI_MR_CSMODE_LASTXFER         (1 << 4)
+#define QSPI_MR_CSMODE_SYSTEMATICALLY   (2 << 4)
+#define QSPI_MR_NBBITS_MASK             GENMASK(11, 8)
+#define QSPI_MR_NBBITS(n)               ((((n) - 8) << 8) & QSPI_MR_NBBITS_MASK)
+#define QSPI_MR_DLYBCT_MASK             GENMASK(23, 16)
+#define QSPI_MR_DLYBCT(n)               (((n) << 16) & QSPI_MR_DLYBCT_MASK)
+#define QSPI_MR_DLYCS_MASK              GENMASK(31, 24)
+#define QSPI_MR_DLYCS(n)                (((n) << 24) & QSPI_MR_DLYCS_MASK)
+
+/* Bitfields in QSPI_SR/QSPI_IER/QSPI_IDR/QSPI_IMR  */
+#define QSPI_SR_RDRF                    BIT(0)
+#define QSPI_SR_TDRE                    BIT(1)
+#define QSPI_SR_TXEMPTY                 BIT(2)
+#define QSPI_SR_OVRES                   BIT(3)
+#define QSPI_SR_CSR                     BIT(8)
+#define QSPI_SR_CSS                     BIT(9)
+#define QSPI_SR_INSTRE                  BIT(10)
+#define QSPI_SR_QSPIENS                 BIT(24)
+
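+/* the command is complete once both INSTRuction End and Chip Select Rise are set */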
+#define QSPI_SR_CMD_COMPLETED  (QSPI_SR_INSTRE | QSPI_SR_CSR)
+
+/* Bitfields in QSPI_SCR (Serial Clock Register) */
+#define QSPI_SCR_CPOL                   BIT(0)
+#define QSPI_SCR_CPHA                   BIT(1)
+#define QSPI_SCR_SCBR_MASK              GENMASK(15, 8)
+#define QSPI_SCR_SCBR(n)                (((n) << 8) & QSPI_SCR_SCBR_MASK)
+#define QSPI_SCR_DLYBS_MASK             GENMASK(23, 16)
+#define QSPI_SCR_DLYBS(n)               (((n) << 16) & QSPI_SCR_DLYBS_MASK)
+
+/* Bitfields in QSPI_ICR (Instruction Code Register) */
+#define QSPI_ICR_INST_MASK              GENMASK(7, 0)
+#define QSPI_ICR_INST(inst)             (((inst) << 0) & QSPI_ICR_INST_MASK)
+#define QSPI_ICR_OPT_MASK               GENMASK(23, 16)
+#define QSPI_ICR_OPT(opt)               (((opt) << 16) & QSPI_ICR_OPT_MASK)
+
+/* Bitfields in QSPI_IFR (Instruction Frame Register) */
+#define QSPI_IFR_WIDTH_MASK             GENMASK(2, 0)
+#define QSPI_IFR_WIDTH_SINGLE_BIT_SPI   (0 << 0)
+#define QSPI_IFR_WIDTH_DUAL_OUTPUT      (1 << 0)
+#define QSPI_IFR_WIDTH_QUAD_OUTPUT      (2 << 0)
+#define QSPI_IFR_WIDTH_DUAL_IO          (3 << 0)
+#define QSPI_IFR_WIDTH_QUAD_IO          (4 << 0)
+#define QSPI_IFR_WIDTH_DUAL_CMD         (5 << 0)
+#define QSPI_IFR_WIDTH_QUAD_CMD         (6 << 0)
+#define QSPI_IFR_INSTEN                 BIT(4)
+#define QSPI_IFR_ADDREN                 BIT(5)
+#define QSPI_IFR_OPTEN                  BIT(6)
+#define QSPI_IFR_DATAEN                 BIT(7)
+#define QSPI_IFR_OPTL_MASK              GENMASK(9, 8)
+#define QSPI_IFR_OPTL_1BIT              (0 << 8)
+#define QSPI_IFR_OPTL_2BIT              (1 << 8)
+#define QSPI_IFR_OPTL_4BIT              (2 << 8)
+#define QSPI_IFR_OPTL_8BIT              (3 << 8)
+#define QSPI_IFR_ADDRL                  BIT(10)
+#define QSPI_IFR_TFRTYP_MASK            GENMASK(13, 12)
+#define QSPI_IFR_TFRTYP_TRSFR_READ      (0 << 12)
+#define QSPI_IFR_TFRTYP_TRSFR_READ_MEM  (1 << 12)
+#define QSPI_IFR_TFRTYP_TRSFR_WRITE     (2 << 12)
+#define QSPI_IFR_TFRTYP_TRSFR_WRITE_MEM (3 << 12)
+#define QSPI_IFR_CRM                    BIT(14)
+#define QSPI_IFR_NBDUM_MASK             GENMASK(20, 16)
+#define QSPI_IFR_NBDUM(n)               (((n) << 16) & QSPI_IFR_NBDUM_MASK)
+
+/* Bitfields in QSPI_SMR (Scrambling Mode Register) */
+#define QSPI_SMR_SCREN                  BIT(0)
+#define QSPI_SMR_RVDIS                  BIT(1)
+
+/* Bitfields in QSPI_WPMR (Write Protection Mode Register) */
+#define QSPI_WPMR_WPEN                  BIT(0)
+#define QSPI_WPMR_WPKEY_MASK            GENMASK(31, 8)
+#define QSPI_WPMR_WPKEY(wpkey)          (((wpkey) << 8) & QSPI_WPMR_WPKEY_MASK)
+
+/* Bitfields in QSPI_WPSR (Write Protection Status Register) */
+#define QSPI_WPSR_WPVS                  BIT(0)
+#define QSPI_WPSR_WPVSRC_MASK           GENMASK(15, 8)
+#define QSPI_WPSR_WPVSRC(src)           (((src) << 8) & QSPI_WPSR_WPVSRC_MASK)
+
+
+struct atmel_qspi {
+       void __iomem            *regs;
+       void __iomem            *mem;
+       struct clk              *clk;
+       struct platform_device  *pdev;
+       u32                     pending;
+
+       struct spi_nor          nor;
+       u32                     clk_rate;
+       struct completion       cmd_completion;
+};
+
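+/*
+ * Describes a single flash operation: the 'enable' bits select which phases
+ * (instruction, address, mode, dummy, data) are sent, and 'enable.bits.address'
+ * also holds the number of address bytes.
+ */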
+struct atmel_qspi_command {
+       union {
+               struct {
+                       u32     instruction:1;
+                       u32     address:3;
+                       u32     mode:1;
+                       u32     dummy:1;
+                       u32     data:1;
+                       u32     reserved:25;
+               }               bits;
+               u32     word;
+       }       enable;
+       u8      instruction;
+       u8      mode;
+       u8      num_mode_cycles;
+       u8      num_dummy_cycles;
+       u32     address;
+
+       size_t          buf_len;
+       const void      *tx_buf;
+       void            *rx_buf;
+};
+
+/* Register access functions */
+static inline u32 qspi_readl(struct atmel_qspi *aq, u32 reg)
+{
+       return readl_relaxed(aq->regs + reg);
+}
+
+static inline void qspi_writel(struct atmel_qspi *aq, u32 reg, u32 value)
+{
+       writel_relaxed(value, aq->regs + reg);
+}
+
+static int atmel_qspi_run_transfer(struct atmel_qspi *aq,
+                                  const struct atmel_qspi_command *cmd)
+{
+       void __iomem *ahb_mem;
+
+       /* Fall back to a PIO transfer (memcpy() DOES NOT work!) */
+       ahb_mem = aq->mem;
+       if (cmd->enable.bits.address)
+               ahb_mem += cmd->address;
+       if (cmd->tx_buf)
+               _memcpy_toio(ahb_mem, cmd->tx_buf, cmd->buf_len);
+       else
+               _memcpy_fromio(cmd->rx_buf, ahb_mem, cmd->buf_len);
+
+       return 0;
+}
+
+#ifdef DEBUG
+static void atmel_qspi_debug_command(struct atmel_qspi *aq,
+                                    const struct atmel_qspi_command *cmd,
+                                    u32 ifr)
+{
+       u8 cmd_buf[SPI_NOR_MAX_CMD_SIZE];
+       size_t len = 0;
+       int i;
+
+       if (cmd->enable.bits.instruction)
+               cmd_buf[len++] = cmd->instruction;
+
+       for (i = cmd->enable.bits.address-1; i >= 0; --i)
+               cmd_buf[len++] = (cmd->address >> (i << 3)) & 0xff;
+
+       if (cmd->enable.bits.mode)
+               cmd_buf[len++] = cmd->mode;
+
+       if (cmd->enable.bits.dummy) {
+               int num = cmd->num_dummy_cycles;
+
+               switch (ifr & QSPI_IFR_WIDTH_MASK) {
+               case QSPI_IFR_WIDTH_SINGLE_BIT_SPI:
+               case QSPI_IFR_WIDTH_DUAL_OUTPUT:
+               case QSPI_IFR_WIDTH_QUAD_OUTPUT:
+                       num >>= 3;
+                       break;
+               case QSPI_IFR_WIDTH_DUAL_IO:
+               case QSPI_IFR_WIDTH_DUAL_CMD:
+                       num >>= 2;
+                       break;
+               case QSPI_IFR_WIDTH_QUAD_IO:
+               case QSPI_IFR_WIDTH_QUAD_CMD:
+                       num >>= 1;
+                       break;
+               default:
+                       return;
+               }
+
+               for (i = 0; i < num; ++i)
+                       cmd_buf[len++] = 0;
+       }
+
+       /* Dump the SPI command */
+       print_hex_dump(KERN_DEBUG, "qspi cmd: ", DUMP_PREFIX_NONE,
+                      32, 1, cmd_buf, len, false);
+
+#ifdef VERBOSE_DEBUG
+       /* If verbose debug is enabled, also dump the TX data */
+       if (cmd->enable.bits.data && cmd->tx_buf)
+               print_hex_dump(KERN_DEBUG, "qspi tx : ", DUMP_PREFIX_NONE,
+                              32, 1, cmd->tx_buf, cmd->buf_len, false);
+#endif
+}
+#else
+#define atmel_qspi_debug_command(aq, cmd, ifr)
+#endif
+
+static int atmel_qspi_run_command(struct atmel_qspi *aq,
+                                 const struct atmel_qspi_command *cmd,
+                                 u32 ifr_tfrtyp, u32 ifr_width)
+{
+       u32 iar, icr, ifr, sr;
+       int err = 0;
+
+       iar = 0;
+       icr = 0;
+       ifr = ifr_tfrtyp | ifr_width;
+
+       /* Compute instruction parameters */
+       if (cmd->enable.bits.instruction) {
+               icr |= QSPI_ICR_INST(cmd->instruction);
+               ifr |= QSPI_IFR_INSTEN;
+       }
+
+       /* Compute address parameters */
+       switch (cmd->enable.bits.address) {
+       case 4:
+               ifr |= QSPI_IFR_ADDRL;
+               /* fall through to the 24bit (3 byte) address case. */
+       case 3:
+               iar = (cmd->enable.bits.data) ? 0 : cmd->address;
+               ifr |= QSPI_IFR_ADDREN;
+               break;
+       case 0:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* Compute option parameters */
+       if (cmd->enable.bits.mode && cmd->num_mode_cycles) {
+               u32 mode_cycle_bits, mode_bits;
+
+               icr |= QSPI_ICR_OPT(cmd->mode);
+               ifr |= QSPI_IFR_OPTEN;
+
+               switch (ifr & QSPI_IFR_WIDTH_MASK) {
+               case QSPI_IFR_WIDTH_SINGLE_BIT_SPI:
+               case QSPI_IFR_WIDTH_DUAL_OUTPUT:
+               case QSPI_IFR_WIDTH_QUAD_OUTPUT:
+                       mode_cycle_bits = 1;
+                       break;
+               case QSPI_IFR_WIDTH_DUAL_IO:
+               case QSPI_IFR_WIDTH_DUAL_CMD:
+                       mode_cycle_bits = 2;
+                       break;
+               case QSPI_IFR_WIDTH_QUAD_IO:
+               case QSPI_IFR_WIDTH_QUAD_CMD:
+                       mode_cycle_bits = 4;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+
+               mode_bits = cmd->num_mode_cycles * mode_cycle_bits;
+               switch (mode_bits) {
+               case 1:
+                       ifr |= QSPI_IFR_OPTL_1BIT;
+                       break;
+
+               case 2:
+                       ifr |= QSPI_IFR_OPTL_2BIT;
+                       break;
+
+               case 4:
+                       ifr |= QSPI_IFR_OPTL_4BIT;
+                       break;
+
+               case 8:
+                       ifr |= QSPI_IFR_OPTL_8BIT;
+                       break;
+
+               default:
+                       return -EINVAL;
+               }
+       }
+
+       /* Set number of dummy cycles */
+       if (cmd->enable.bits.dummy)
+               ifr |= QSPI_IFR_NBDUM(cmd->num_dummy_cycles);
+
+       /* Set data enable */
+       if (cmd->enable.bits.data) {
+               ifr |= QSPI_IFR_DATAEN;
+
+               /* Special case for Continuous Read Mode */
+               if (!cmd->tx_buf && !cmd->rx_buf)
+                       ifr |= QSPI_IFR_CRM;
+       }
+
+       /* Clear pending interrupts */
+       (void)qspi_readl(aq, QSPI_SR);
+
+       /* Set QSPI Instruction Frame registers */
+       atmel_qspi_debug_command(aq, cmd, ifr);
+       qspi_writel(aq, QSPI_IAR, iar);
+       qspi_writel(aq, QSPI_ICR, icr);
+       qspi_writel(aq, QSPI_IFR, ifr);
+
+       /* Skip to the final steps if there is no data */
+       if (!cmd->enable.bits.data)
+               goto no_data;
+
+       /* Dummy read of QSPI_IFR to synchronize APB and AHB accesses */
+       (void)qspi_readl(aq, QSPI_IFR);
+
+       /* Stop here for continuous read */
+       if (!cmd->tx_buf && !cmd->rx_buf)
+               return 0;
+       /* Send/Receive data */
+       err = atmel_qspi_run_transfer(aq, cmd);
+
+       /* Release the chip-select */
+       qspi_writel(aq, QSPI_CR, QSPI_CR_LASTXFER);
+
+       if (err)
+               return err;
+
+#if defined(DEBUG) && defined(VERBOSE_DEBUG)
+       /*
+        * If verbose debug is enabled, also dump the RX data in addition to
+        * the SPI command previously dumped by atmel_qspi_debug_command()
+        */
+       if (cmd->rx_buf)
+               print_hex_dump(KERN_DEBUG, "qspi rx : ", DUMP_PREFIX_NONE,
+                              32, 1, cmd->rx_buf, cmd->buf_len, false);
+#endif
+no_data:
+       /* Poll INSTRuction End status */
+       sr = qspi_readl(aq, QSPI_SR);
+       if ((sr & QSPI_SR_CMD_COMPLETED) == QSPI_SR_CMD_COMPLETED)
+               return err;
+
+       /* Wait for INSTRuction End interrupt */
+       reinit_completion(&aq->cmd_completion);
+       aq->pending = sr & QSPI_SR_CMD_COMPLETED;
+       qspi_writel(aq, QSPI_IER, QSPI_SR_CMD_COMPLETED);
+       if (!wait_for_completion_timeout(&aq->cmd_completion,
+                                        msecs_to_jiffies(1000)))
+               err = -ETIMEDOUT;
+       qspi_writel(aq, QSPI_IDR, QSPI_SR_CMD_COMPLETED);
+
+       return err;
+}
+
+static int atmel_qspi_read_reg(struct spi_nor *nor, u8 opcode,
+                              u8 *buf, int len)
+{
+       struct atmel_qspi *aq = nor->priv;
+       struct atmel_qspi_command cmd;
+
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.enable.bits.instruction = 1;
+       cmd.enable.bits.data = 1;
+       cmd.instruction = opcode;
+       cmd.rx_buf = buf;
+       cmd.buf_len = len;
+       return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_READ,
+                                     QSPI_IFR_WIDTH_SINGLE_BIT_SPI);
+}
+
+static int atmel_qspi_write_reg(struct spi_nor *nor, u8 opcode,
+                               u8 *buf, int len)
+{
+       struct atmel_qspi *aq = nor->priv;
+       struct atmel_qspi_command cmd;
+
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.enable.bits.instruction = 1;
+       cmd.enable.bits.data = (buf != NULL && len > 0);
+       cmd.instruction = opcode;
+       cmd.tx_buf = buf;
+       cmd.buf_len = len;
+       return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE,
+                                     QSPI_IFR_WIDTH_SINGLE_BIT_SPI);
+}
+
+static ssize_t atmel_qspi_write(struct spi_nor *nor, loff_t to, size_t len,
+                               const u_char *write_buf)
+{
+       struct atmel_qspi *aq = nor->priv;
+       struct atmel_qspi_command cmd;
+       ssize_t ret;
+
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.enable.bits.instruction = 1;
+       cmd.enable.bits.address = nor->addr_width;
+       cmd.enable.bits.data = 1;
+       cmd.instruction = nor->program_opcode;
+       cmd.address = (u32)to;
+       cmd.tx_buf = write_buf;
+       cmd.buf_len = len;
+       ret = atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE_MEM,
+                                    QSPI_IFR_WIDTH_SINGLE_BIT_SPI);
+       return (ret < 0) ? ret : len;
+}
+
+static int atmel_qspi_erase(struct spi_nor *nor, loff_t offs)
+{
+       struct atmel_qspi *aq = nor->priv;
+       struct atmel_qspi_command cmd;
+
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.enable.bits.instruction = 1;
+       cmd.enable.bits.address = nor->addr_width;
+       cmd.instruction = nor->erase_opcode;
+       cmd.address = (u32)offs;
+       return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE,
+                                     QSPI_IFR_WIDTH_SINGLE_BIT_SPI);
+}
+
+static ssize_t atmel_qspi_read(struct spi_nor *nor, loff_t from, size_t len,
+                              u_char *read_buf)
+{
+       struct atmel_qspi *aq = nor->priv;
+       struct atmel_qspi_command cmd;
+       u8 num_mode_cycles, num_dummy_cycles;
+       u32 ifr_width;
+       ssize_t ret;
+
+       switch (nor->flash_read) {
+       case SPI_NOR_NORMAL:
+       case SPI_NOR_FAST:
+               ifr_width = QSPI_IFR_WIDTH_SINGLE_BIT_SPI;
+               break;
+
+       case SPI_NOR_DUAL:
+               ifr_width = QSPI_IFR_WIDTH_DUAL_OUTPUT;
+               break;
+
+       case SPI_NOR_QUAD:
+               ifr_width = QSPI_IFR_WIDTH_QUAD_OUTPUT;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
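+       /*
+        * Reserve the first two cycles for the mode bits (sent as 0xff below
+        * so the flash does not enter continuous read mode) and issue the
+        * remaining cycles as plain dummy cycles.
+        */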
+       if (nor->read_dummy >= 2) {
+               num_mode_cycles = 2;
+               num_dummy_cycles = nor->read_dummy - 2;
+       } else {
+               num_mode_cycles = nor->read_dummy;
+               num_dummy_cycles = 0;
+       }
+
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.enable.bits.instruction = 1;
+       cmd.enable.bits.address = nor->addr_width;
+       cmd.enable.bits.mode = (num_mode_cycles > 0);
+       cmd.enable.bits.dummy = (num_dummy_cycles > 0);
+       cmd.enable.bits.data = 1;
+       cmd.instruction = nor->read_opcode;
+       cmd.address = (u32)from;
+       cmd.mode = 0xff; /* This value prevents the flash from entering the 0-4-4 mode */
+       cmd.num_mode_cycles = num_mode_cycles;
+       cmd.num_dummy_cycles = num_dummy_cycles;
+       cmd.rx_buf = read_buf;
+       cmd.buf_len = len;
+       ret = atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_READ_MEM,
+                                    ifr_width);
+       return (ret < 0) ? ret : len;
+}
+
+static int atmel_qspi_init(struct atmel_qspi *aq)
+{
+       unsigned long src_rate;
+       u32 mr, scr, scbr;
+
+       /* Reset the QSPI controller */
+       qspi_writel(aq, QSPI_CR, QSPI_CR_SWRST);
+
+       /* Set the QSPI controller in Serial Memory Mode */
+       mr = QSPI_MR_NBBITS(8) | QSPI_MR_SSM;
+       qspi_writel(aq, QSPI_MR, mr);
+
+       src_rate = clk_get_rate(aq->clk);
+       if (!src_rate)
+               return -EINVAL;
+
+       /* Compute the QSPI baudrate */
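+       /*
+        * The hardware divider is (SCBR + 1), so round up and then decrement
+        * to keep the resulting rate at or below the requested clk_rate.
+        */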
+       scbr = DIV_ROUND_UP(src_rate, aq->clk_rate);
+       if (scbr > 0)
+               scbr--;
+       scr = QSPI_SCR_SCBR(scbr);
+       qspi_writel(aq, QSPI_SCR, scr);
+
+       /* Enable the QSPI controller */
+       qspi_writel(aq, QSPI_CR, QSPI_CR_QSPIEN);
+
+       return 0;
+}
+
+static irqreturn_t atmel_qspi_interrupt(int irq, void *dev_id)
+{
+       struct atmel_qspi *aq = (struct atmel_qspi *)dev_id;
+       u32 status, mask, pending;
+
+       status = qspi_readl(aq, QSPI_SR);
+       mask = qspi_readl(aq, QSPI_IMR);
+       pending = status & mask;
+
+       if (!pending)
+               return IRQ_NONE;
+
+       aq->pending |= pending;
+       if ((aq->pending & QSPI_SR_CMD_COMPLETED) == QSPI_SR_CMD_COMPLETED)
+               complete(&aq->cmd_completion);
+
+       return IRQ_HANDLED;
+}
+
+static int atmel_qspi_probe(struct platform_device *pdev)
+{
+       struct device_node *child, *np = pdev->dev.of_node;
+       struct atmel_qspi *aq;
+       struct resource *res;
+       struct spi_nor *nor;
+       struct mtd_info *mtd;
+       int irq, err = 0;
+
+       if (of_get_child_count(np) != 1)
+               return -ENODEV;
+       child = of_get_next_child(np, NULL);
+
+       aq = devm_kzalloc(&pdev->dev, sizeof(*aq), GFP_KERNEL);
+       if (!aq) {
+               err = -ENOMEM;
+               goto exit;
+       }
+
+       platform_set_drvdata(pdev, aq);
+       init_completion(&aq->cmd_completion);
+       aq->pdev = pdev;
+
+       /* Map the registers */
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi_base");
+       aq->regs = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(aq->regs)) {
+               dev_err(&pdev->dev, "missing registers\n");
+               err = PTR_ERR(aq->regs);
+               goto exit;
+       }
+
+       /* Map the AHB memory */
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi_mmap");
+       aq->mem = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(aq->mem)) {
+               dev_err(&pdev->dev, "missing AHB memory\n");
+               err = PTR_ERR(aq->mem);
+               goto exit;
+       }
+
+       /* Get the peripheral clock */
+       aq->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(aq->clk)) {
+               dev_err(&pdev->dev, "missing peripheral clock\n");
+               err = PTR_ERR(aq->clk);
+               goto exit;
+       }
+
+       /* Enable the peripheral clock */
+       err = clk_prepare_enable(aq->clk);
+       if (err) {
+               dev_err(&pdev->dev, "failed to enable the peripheral clock\n");
+               goto exit;
+       }
+
+       /* Request the IRQ */
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0) {
+               dev_err(&pdev->dev, "missing IRQ\n");
+               err = irq;
+               goto disable_clk;
+       }
+       err = devm_request_irq(&pdev->dev, irq, atmel_qspi_interrupt,
+                              0, dev_name(&pdev->dev), aq);
+       if (err)
+               goto disable_clk;
+
+       /* Setup the spi-nor */
+       nor = &aq->nor;
+       mtd = &nor->mtd;
+
+       nor->dev = &pdev->dev;
+       spi_nor_set_flash_node(nor, child);
+       nor->priv = aq;
+       mtd->priv = nor;
+
+       nor->read_reg = atmel_qspi_read_reg;
+       nor->write_reg = atmel_qspi_write_reg;
+       nor->read = atmel_qspi_read;
+       nor->write = atmel_qspi_write;
+       nor->erase = atmel_qspi_erase;
+
+       err = of_property_read_u32(child, "spi-max-frequency", &aq->clk_rate);
+       if (err < 0)
+               goto disable_clk;
+
+       err = atmel_qspi_init(aq);
+       if (err)
+               goto disable_clk;
+
+       err = spi_nor_scan(nor, NULL, SPI_NOR_QUAD);
+       if (err)
+               goto disable_clk;
+
+       err = mtd_device_register(mtd, NULL, 0);
+       if (err)
+               goto disable_clk;
+
+       of_node_put(child);
+
+       return 0;
+
+disable_clk:
+       clk_disable_unprepare(aq->clk);
+exit:
+       of_node_put(child);
+
+       return err;
+}
+
+static int atmel_qspi_remove(struct platform_device *pdev)
+{
+       struct atmel_qspi *aq = platform_get_drvdata(pdev);
+
+       mtd_device_unregister(&aq->nor.mtd);
+       qspi_writel(aq, QSPI_CR, QSPI_CR_QSPIDIS);
+       clk_disable_unprepare(aq->clk);
+       return 0;
+}
+
+
+static const struct of_device_id atmel_qspi_dt_ids[] = {
+       { .compatible = "atmel,sama5d2-qspi" },
+       { /* sentinel */ }
+};
+
+MODULE_DEVICE_TABLE(of, atmel_qspi_dt_ids);
+
+static struct platform_driver atmel_qspi_driver = {
+       .driver = {
+               .name   = "atmel_qspi",
+               .of_match_table = atmel_qspi_dt_ids,
+       },
+       .probe          = atmel_qspi_probe,
+       .remove         = atmel_qspi_remove,
+};
+module_platform_driver(atmel_qspi_driver);
+
+MODULE_AUTHOR("Cyrille Pitchen <cyrille.pitchen@atmel.com>");
+MODULE_DESCRIPTION("Atmel QSPI Controller driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
new file mode 100644 (file)
index 0000000..d403ba7
--- /dev/null
@@ -0,0 +1,1299 @@
+/*
+ * Driver for Cadence QSPI Controller
+ *
+ * Copyright Altera Corporation (C) 2012-2014. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/clk.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/spi-nor.h>
+#include <linux/of_device.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/spi/spi.h>
+#include <linux/timer.h>
+
+#define CQSPI_NAME                     "cadence-qspi"
+#define CQSPI_MAX_CHIPSELECT           16
+
+struct cqspi_st;
+
+struct cqspi_flash_pdata {
+       struct spi_nor  nor;
+       struct cqspi_st *cqspi;
+       u32             clk_rate;
+       u32             read_delay;
+       u32             tshsl_ns;
+       u32             tsd2d_ns;
+       u32             tchsh_ns;
+       u32             tslch_ns;
+       u8              inst_width;
+       u8              addr_width;
+       u8              data_width;
+       u8              cs;
+       bool            registered;
+};
+
+struct cqspi_st {
+       struct platform_device  *pdev;
+
+       struct clk              *clk;
+       unsigned int            sclk;
+
+       void __iomem            *iobase;
+       void __iomem            *ahb_base;
+       struct completion       transfer_complete;
+       struct mutex            bus_mutex;
+
+       int                     current_cs;
+       int                     current_page_size;
+       int                     current_erase_size;
+       int                     current_addr_width;
+       unsigned long           master_ref_clk_hz;
+       bool                    is_decoded_cs;
+       u32                     fifo_depth;
+       u32                     fifo_width;
+       u32                     trigger_address;
+       struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
+};
+
+/* Operation timeout value */
+#define CQSPI_TIMEOUT_MS                       500
+#define CQSPI_READ_TIMEOUT_MS                  10
+
+/* Instruction type */
+#define CQSPI_INST_TYPE_SINGLE                 0
+#define CQSPI_INST_TYPE_DUAL                   1
+#define CQSPI_INST_TYPE_QUAD                   2
+
+#define CQSPI_DUMMY_CLKS_PER_BYTE              8
+#define CQSPI_DUMMY_BYTES_MAX                  4
+#define CQSPI_DUMMY_CLKS_MAX                   31
+
+#define CQSPI_STIG_DATA_LEN_MAX                        8
+
+/* Register map */
+#define CQSPI_REG_CONFIG                       0x00
+#define CQSPI_REG_CONFIG_ENABLE_MASK           BIT(0)
+#define CQSPI_REG_CONFIG_DECODE_MASK           BIT(9)
+#define CQSPI_REG_CONFIG_CHIPSELECT_LSB                10
+#define CQSPI_REG_CONFIG_DMA_MASK              BIT(15)
+#define CQSPI_REG_CONFIG_BAUD_LSB              19
+#define CQSPI_REG_CONFIG_IDLE_LSB              31
+#define CQSPI_REG_CONFIG_CHIPSELECT_MASK       0xF
+#define CQSPI_REG_CONFIG_BAUD_MASK             0xF
+
+#define CQSPI_REG_RD_INSTR                     0x04
+#define CQSPI_REG_RD_INSTR_OPCODE_LSB          0
+#define CQSPI_REG_RD_INSTR_TYPE_INSTR_LSB      8
+#define CQSPI_REG_RD_INSTR_TYPE_ADDR_LSB       12
+#define CQSPI_REG_RD_INSTR_TYPE_DATA_LSB       16
+#define CQSPI_REG_RD_INSTR_MODE_EN_LSB         20
+#define CQSPI_REG_RD_INSTR_DUMMY_LSB           24
+#define CQSPI_REG_RD_INSTR_TYPE_INSTR_MASK     0x3
+#define CQSPI_REG_RD_INSTR_TYPE_ADDR_MASK      0x3
+#define CQSPI_REG_RD_INSTR_TYPE_DATA_MASK      0x3
+#define CQSPI_REG_RD_INSTR_DUMMY_MASK          0x1F
+
+#define CQSPI_REG_WR_INSTR                     0x08
+#define CQSPI_REG_WR_INSTR_OPCODE_LSB          0
+#define CQSPI_REG_WR_INSTR_TYPE_ADDR_LSB       12
+#define CQSPI_REG_WR_INSTR_TYPE_DATA_LSB       16
+
+#define CQSPI_REG_DELAY                                0x0C
+#define CQSPI_REG_DELAY_TSLCH_LSB              0
+#define CQSPI_REG_DELAY_TCHSH_LSB              8
+#define CQSPI_REG_DELAY_TSD2D_LSB              16
+#define CQSPI_REG_DELAY_TSHSL_LSB              24
+#define CQSPI_REG_DELAY_TSLCH_MASK             0xFF
+#define CQSPI_REG_DELAY_TCHSH_MASK             0xFF
+#define CQSPI_REG_DELAY_TSD2D_MASK             0xFF
+#define CQSPI_REG_DELAY_TSHSL_MASK             0xFF
+
+#define CQSPI_REG_READCAPTURE                  0x10
+#define CQSPI_REG_READCAPTURE_BYPASS_LSB       0
+#define CQSPI_REG_READCAPTURE_DELAY_LSB                1
+#define CQSPI_REG_READCAPTURE_DELAY_MASK       0xF
+
+#define CQSPI_REG_SIZE                         0x14
+#define CQSPI_REG_SIZE_ADDRESS_LSB             0
+#define CQSPI_REG_SIZE_PAGE_LSB                        4
+#define CQSPI_REG_SIZE_BLOCK_LSB               16
+#define CQSPI_REG_SIZE_ADDRESS_MASK            0xF
+#define CQSPI_REG_SIZE_PAGE_MASK               0xFFF
+#define CQSPI_REG_SIZE_BLOCK_MASK              0x3F
+
+#define CQSPI_REG_SRAMPARTITION                        0x18
+#define CQSPI_REG_INDIRECTTRIGGER              0x1C
+
+#define CQSPI_REG_DMA                          0x20
+#define CQSPI_REG_DMA_SINGLE_LSB               0
+#define CQSPI_REG_DMA_BURST_LSB                        8
+#define CQSPI_REG_DMA_SINGLE_MASK              0xFF
+#define CQSPI_REG_DMA_BURST_MASK               0xFF
+
+#define CQSPI_REG_REMAP                                0x24
+#define CQSPI_REG_MODE_BIT                     0x28
+
+#define CQSPI_REG_SDRAMLEVEL                   0x2C
+#define CQSPI_REG_SDRAMLEVEL_RD_LSB            0
+#define CQSPI_REG_SDRAMLEVEL_WR_LSB            16
+#define CQSPI_REG_SDRAMLEVEL_RD_MASK           0xFFFF
+#define CQSPI_REG_SDRAMLEVEL_WR_MASK           0xFFFF
+
+#define CQSPI_REG_IRQSTATUS                    0x40
+#define CQSPI_REG_IRQMASK                      0x44
+
+#define CQSPI_REG_INDIRECTRD                   0x60
+#define CQSPI_REG_INDIRECTRD_START_MASK                BIT(0)
+#define CQSPI_REG_INDIRECTRD_CANCEL_MASK       BIT(1)
+#define CQSPI_REG_INDIRECTRD_DONE_MASK         BIT(5)
+
+#define CQSPI_REG_INDIRECTRDWATERMARK          0x64
+#define CQSPI_REG_INDIRECTRDSTARTADDR          0x68
+#define CQSPI_REG_INDIRECTRDBYTES              0x6C
+
+#define CQSPI_REG_CMDCTRL                      0x90
+#define CQSPI_REG_CMDCTRL_EXECUTE_MASK         BIT(0)
+#define CQSPI_REG_CMDCTRL_INPROGRESS_MASK      BIT(1)
+#define CQSPI_REG_CMDCTRL_WR_BYTES_LSB         12
+#define CQSPI_REG_CMDCTRL_WR_EN_LSB            15
+#define CQSPI_REG_CMDCTRL_ADD_BYTES_LSB                16
+#define CQSPI_REG_CMDCTRL_ADDR_EN_LSB          19
+#define CQSPI_REG_CMDCTRL_RD_BYTES_LSB         20
+#define CQSPI_REG_CMDCTRL_RD_EN_LSB            23
+#define CQSPI_REG_CMDCTRL_OPCODE_LSB           24
+#define CQSPI_REG_CMDCTRL_WR_BYTES_MASK                0x7
+#define CQSPI_REG_CMDCTRL_ADD_BYTES_MASK       0x3
+#define CQSPI_REG_CMDCTRL_RD_BYTES_MASK                0x7
+
+#define CQSPI_REG_INDIRECTWR                   0x70
+#define CQSPI_REG_INDIRECTWR_START_MASK                BIT(0)
+#define CQSPI_REG_INDIRECTWR_CANCEL_MASK       BIT(1)
+#define CQSPI_REG_INDIRECTWR_DONE_MASK         BIT(5)
+
+#define CQSPI_REG_INDIRECTWRWATERMARK          0x74
+#define CQSPI_REG_INDIRECTWRSTARTADDR          0x78
+#define CQSPI_REG_INDIRECTWRBYTES              0x7C
+
+#define CQSPI_REG_CMDADDRESS                   0x94
+#define CQSPI_REG_CMDREADDATALOWER             0xA0
+#define CQSPI_REG_CMDREADDATAUPPER             0xA4
+#define CQSPI_REG_CMDWRITEDATALOWER            0xA8
+#define CQSPI_REG_CMDWRITEDATAUPPER            0xAC
+
+/* Interrupt status bits */
+#define CQSPI_REG_IRQ_MODE_ERR                 BIT(0)
+#define CQSPI_REG_IRQ_UNDERFLOW                        BIT(1)
+#define CQSPI_REG_IRQ_IND_COMP                 BIT(2)
+#define CQSPI_REG_IRQ_IND_RD_REJECT            BIT(3)
+#define CQSPI_REG_IRQ_WR_PROTECTED_ERR         BIT(4)
+#define CQSPI_REG_IRQ_ILLEGAL_AHB_ERR          BIT(5)
+#define CQSPI_REG_IRQ_WATERMARK                        BIT(6)
+#define CQSPI_REG_IRQ_IND_SRAM_FULL            BIT(12)
+
+#define CQSPI_IRQ_MASK_RD              (CQSPI_REG_IRQ_WATERMARK        | \
+                                        CQSPI_REG_IRQ_IND_SRAM_FULL    | \
+                                        CQSPI_REG_IRQ_IND_COMP)
+
+#define CQSPI_IRQ_MASK_WR              (CQSPI_REG_IRQ_IND_COMP         | \
+                                        CQSPI_REG_IRQ_WATERMARK        | \
+                                        CQSPI_REG_IRQ_UNDERFLOW)
+
+#define CQSPI_IRQ_STATUS_MASK          0x1FFFF
+
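+/*
+ * Poll @reg until every bit in @mask is set (or cleared, when @clear is
+ * true), giving up after CQSPI_TIMEOUT_MS.
+ */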
+static int cqspi_wait_for_bit(void __iomem *reg, const u32 mask, bool clear)
+{
+       unsigned long end = jiffies + msecs_to_jiffies(CQSPI_TIMEOUT_MS);
+       u32 val;
+
+       while (1) {
+               val = readl(reg);
+               if (clear)
+                       val = ~val;
+               val &= mask;
+
+               if (val == mask)
+                       return 0;
+
+               if (time_after(jiffies, end))
+                       return -ETIMEDOUT;
+       }
+}
+
+static bool cqspi_is_idle(struct cqspi_st *cqspi)
+{
+       u32 reg = readl(cqspi->iobase + CQSPI_REG_CONFIG);
+
+       return reg & (1 << CQSPI_REG_CONFIG_IDLE_LSB);
+}
+
+static u32 cqspi_get_rd_sram_level(struct cqspi_st *cqspi)
+{
+       u32 reg = readl(cqspi->iobase + CQSPI_REG_SDRAMLEVEL);
+
+       reg >>= CQSPI_REG_SDRAMLEVEL_RD_LSB;
+       return reg & CQSPI_REG_SDRAMLEVEL_RD_MASK;
+}
+
+static irqreturn_t cqspi_irq_handler(int this_irq, void *dev)
+{
+       struct cqspi_st *cqspi = dev;
+       unsigned int irq_status;
+
+       /* Read interrupt status */
+       irq_status = readl(cqspi->iobase + CQSPI_REG_IRQSTATUS);
+
+       /* Clear interrupt */
+       writel(irq_status, cqspi->iobase + CQSPI_REG_IRQSTATUS);
+
+       irq_status &= CQSPI_IRQ_MASK_RD | CQSPI_IRQ_MASK_WR;
+
+       if (irq_status)
+               complete(&cqspi->transfer_complete);
+
+       return IRQ_HANDLED;
+}
+
+static unsigned int cqspi_calc_rdreg(struct spi_nor *nor, const u8 opcode)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       u32 rdreg = 0;
+
+       rdreg |= f_pdata->inst_width << CQSPI_REG_RD_INSTR_TYPE_INSTR_LSB;
+       rdreg |= f_pdata->addr_width << CQSPI_REG_RD_INSTR_TYPE_ADDR_LSB;
+       rdreg |= f_pdata->data_width << CQSPI_REG_RD_INSTR_TYPE_DATA_LSB;
+
+       return rdreg;
+}
+
+static int cqspi_wait_idle(struct cqspi_st *cqspi)
+{
+       const unsigned int poll_idle_retry = 3;
+       unsigned int count = 0;
+       unsigned long timeout;
+
+       timeout = jiffies + msecs_to_jiffies(CQSPI_TIMEOUT_MS);
+       while (1) {
+               /*
+                * Read a few times in succession to ensure the controller
+                * is indeed idle, that is, the bit does not transition
+                * low again.
+                */
+               if (cqspi_is_idle(cqspi))
+                       count++;
+               else
+                       count = 0;
+
+               if (count >= poll_idle_retry)
+                       return 0;
+
+               if (time_after(jiffies, timeout)) {
+                       /* Timeout, in busy mode. */
+                       dev_err(&cqspi->pdev->dev,
+                               "QSPI is still busy after %dms timeout.\n",
+                               CQSPI_TIMEOUT_MS);
+                       return -ETIMEDOUT;
+               }
+
+               cpu_relax();
+       }
+}
+
+static int cqspi_exec_flash_cmd(struct cqspi_st *cqspi, unsigned int reg)
+{
+       void __iomem *reg_base = cqspi->iobase;
+       int ret;
+
+       /* Write the CMDCTRL without start execution. */
+       writel(reg, reg_base + CQSPI_REG_CMDCTRL);
+       /* Start execute */
+       reg |= CQSPI_REG_CMDCTRL_EXECUTE_MASK;
+       writel(reg, reg_base + CQSPI_REG_CMDCTRL);
+
+       /* Polling for completion. */
+       ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_CMDCTRL,
+                                CQSPI_REG_CMDCTRL_INPROGRESS_MASK, 1);
+       if (ret) {
+               dev_err(&cqspi->pdev->dev,
+                       "Flash command execution timed out.\n");
+               return ret;
+       }
+
+       /* Polling QSPI idle status. */
+       return cqspi_wait_idle(cqspi);
+}
+
+static int cqspi_command_read(struct spi_nor *nor,
+                             const u8 *txbuf, const unsigned n_tx,
+                             u8 *rxbuf, const unsigned n_rx)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *reg_base = cqspi->iobase;
+       unsigned int rdreg;
+       unsigned int reg;
+       unsigned int read_len;
+       int status;
+
+       if (!n_rx || n_rx > CQSPI_STIG_DATA_LEN_MAX || !rxbuf) {
+               dev_err(nor->dev, "Invalid input argument, len %d rxbuf 0x%p\n",
+                       n_rx, rxbuf);
+               return -EINVAL;
+       }
+
+       reg = txbuf[0] << CQSPI_REG_CMDCTRL_OPCODE_LSB;
+
+       rdreg = cqspi_calc_rdreg(nor, txbuf[0]);
+       writel(rdreg, reg_base + CQSPI_REG_RD_INSTR);
+
+       reg |= (0x1 << CQSPI_REG_CMDCTRL_RD_EN_LSB);
+
+       /* 0 means 1 byte. */
+       reg |= (((n_rx - 1) & CQSPI_REG_CMDCTRL_RD_BYTES_MASK)
+               << CQSPI_REG_CMDCTRL_RD_BYTES_LSB);
+       status = cqspi_exec_flash_cmd(cqspi, reg);
+       if (status)
+               return status;
+
+       reg = readl(reg_base + CQSPI_REG_CMDREADDATALOWER);
+
+       /* Put the read value into rx_buf */
+       read_len = (n_rx > 4) ? 4 : n_rx;
+       memcpy(rxbuf, &reg, read_len);
+       rxbuf += read_len;
+
+       if (n_rx > 4) {
+               reg = readl(reg_base + CQSPI_REG_CMDREADDATAUPPER);
+
+               read_len = n_rx - read_len;
+               memcpy(rxbuf, &reg, read_len);
+       }
+
+       return 0;
+}
+
+static int cqspi_command_write(struct spi_nor *nor, const u8 opcode,
+                              const u8 *txbuf, const unsigned n_tx)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *reg_base = cqspi->iobase;
+       unsigned int reg;
+       unsigned int data;
+       int ret;
+
+       if (n_tx > 4 || (n_tx && !txbuf)) {
+               dev_err(nor->dev,
+                       "Invalid input argument, cmdlen %d txbuf 0x%p\n",
+                       n_tx, txbuf);
+               return -EINVAL;
+       }
+
+       reg = opcode << CQSPI_REG_CMDCTRL_OPCODE_LSB;
+       if (n_tx) {
+               reg |= (0x1 << CQSPI_REG_CMDCTRL_WR_EN_LSB);
+               reg |= ((n_tx - 1) & CQSPI_REG_CMDCTRL_WR_BYTES_MASK)
+                       << CQSPI_REG_CMDCTRL_WR_BYTES_LSB;
+               data = 0;
+               memcpy(&data, txbuf, n_tx);
+               writel(data, reg_base + CQSPI_REG_CMDWRITEDATALOWER);
+       }
+
+       ret = cqspi_exec_flash_cmd(cqspi, reg);
+       return ret;
+}
+
+static int cqspi_command_write_addr(struct spi_nor *nor,
+                                   const u8 opcode, const unsigned int addr)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *reg_base = cqspi->iobase;
+       unsigned int reg;
+
+       reg = opcode << CQSPI_REG_CMDCTRL_OPCODE_LSB;
+       reg |= (0x1 << CQSPI_REG_CMDCTRL_ADDR_EN_LSB);
+       reg |= ((nor->addr_width - 1) & CQSPI_REG_CMDCTRL_ADD_BYTES_MASK)
+               << CQSPI_REG_CMDCTRL_ADD_BYTES_LSB;
+
+       writel(addr, reg_base + CQSPI_REG_CMDADDRESS);
+
+       return cqspi_exec_flash_cmd(cqspi, reg);
+}
+
+static int cqspi_indirect_read_setup(struct spi_nor *nor,
+                                    const unsigned int from_addr)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *reg_base = cqspi->iobase;
+       unsigned int dummy_clk = 0;
+       unsigned int reg;
+
+       writel(from_addr, reg_base + CQSPI_REG_INDIRECTRDSTARTADDR);
+
+       reg = nor->read_opcode << CQSPI_REG_RD_INSTR_OPCODE_LSB;
+       reg |= cqspi_calc_rdreg(nor, nor->read_opcode);
+
+       /* Setup dummy clock cycles */
+       dummy_clk = nor->read_dummy;
+       if (dummy_clk > CQSPI_DUMMY_CLKS_MAX)
+               dummy_clk = CQSPI_DUMMY_CLKS_MAX;
+
+       if (dummy_clk / 8) {
+               reg |= (1 << CQSPI_REG_RD_INSTR_MODE_EN_LSB);
+               /* Set mode bits high to ensure chip doesn't enter XIP */
+               writel(0xFF, reg_base + CQSPI_REG_MODE_BIT);
+
+               /* Need to subtract the mode byte (8 clocks). */
+               if (f_pdata->inst_width != CQSPI_INST_TYPE_QUAD)
+                       dummy_clk -= 8;
+
+               if (dummy_clk)
+                       reg |= (dummy_clk & CQSPI_REG_RD_INSTR_DUMMY_MASK)
+                              << CQSPI_REG_RD_INSTR_DUMMY_LSB;
+       }
+
+       writel(reg, reg_base + CQSPI_REG_RD_INSTR);
+
+       /* Set address width */
+       reg = readl(reg_base + CQSPI_REG_SIZE);
+       reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
+       reg |= (nor->addr_width - 1);
+       writel(reg, reg_base + CQSPI_REG_SIZE);
+       return 0;
+}
+
+static int cqspi_indirect_read_execute(struct spi_nor *nor,
+                                      u8 *rxbuf, const unsigned n_rx)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *reg_base = cqspi->iobase;
+       void __iomem *ahb_base = cqspi->ahb_base;
+       unsigned int remaining = n_rx;
+       unsigned int bytes_to_read = 0;
+       int ret = 0;
+
+       writel(remaining, reg_base + CQSPI_REG_INDIRECTRDBYTES);
+
+       /* Clear all interrupts. */
+       writel(CQSPI_IRQ_STATUS_MASK, reg_base + CQSPI_REG_IRQSTATUS);
+
+       writel(CQSPI_IRQ_MASK_RD, reg_base + CQSPI_REG_IRQMASK);
+
+       reinit_completion(&cqspi->transfer_complete);
+       writel(CQSPI_REG_INDIRECTRD_START_MASK,
+              reg_base + CQSPI_REG_INDIRECTRD);
+
+       while (remaining > 0) {
+               ret = wait_for_completion_timeout(&cqspi->transfer_complete,
+                                                 msecs_to_jiffies
+                                                 (CQSPI_READ_TIMEOUT_MS));
+
+               bytes_to_read = cqspi_get_rd_sram_level(cqspi);
+
+               if (!ret && bytes_to_read == 0) {
+                       dev_err(nor->dev, "Indirect read timeout, no bytes\n");
+                       ret = -ETIMEDOUT;
+                       goto failrd;
+               }
+
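+               /*
+                * The SRAM fill level is reported in FIFO words; convert it
+                * to bytes before draining the read FIFO.
+                */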
+               while (bytes_to_read != 0) {
+                       bytes_to_read *= cqspi->fifo_width;
+                       bytes_to_read = bytes_to_read > remaining ?
+                                       remaining : bytes_to_read;
+                       readsl(ahb_base, rxbuf, DIV_ROUND_UP(bytes_to_read, 4));
+                       rxbuf += bytes_to_read;
+                       remaining -= bytes_to_read;
+                       bytes_to_read = cqspi_get_rd_sram_level(cqspi);
+               }
+
+               if (remaining > 0)
+                       reinit_completion(&cqspi->transfer_complete);
+       }
+
+       /* Check indirect done status */
+       ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTRD,
+                                CQSPI_REG_INDIRECTRD_DONE_MASK, 0);
+       if (ret) {
+               dev_err(nor->dev,
+                       "Indirect read completion error (%i)\n", ret);
+               goto failrd;
+       }
+
+       /* Disable interrupt */
+       writel(0, reg_base + CQSPI_REG_IRQMASK);
+
+       /* Clear indirect completion status */
+       writel(CQSPI_REG_INDIRECTRD_DONE_MASK, reg_base + CQSPI_REG_INDIRECTRD);
+
+       return 0;
+
+failrd:
+       /* Disable interrupt */
+       writel(0, reg_base + CQSPI_REG_IRQMASK);
+
+       /* Cancel the indirect read */
+       writel(CQSPI_REG_INDIRECTWR_CANCEL_MASK,
+              reg_base + CQSPI_REG_INDIRECTRD);
+       return ret;
+}
+
+static int cqspi_indirect_write_setup(struct spi_nor *nor,
+                                     const unsigned int to_addr)
+{
+       unsigned int reg;
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *reg_base = cqspi->iobase;
+
+       /* Set opcode. */
+       reg = nor->program_opcode << CQSPI_REG_WR_INSTR_OPCODE_LSB;
+       writel(reg, reg_base + CQSPI_REG_WR_INSTR);
+       reg = cqspi_calc_rdreg(nor, nor->program_opcode);
+       writel(reg, reg_base + CQSPI_REG_RD_INSTR);
+
+       writel(to_addr, reg_base + CQSPI_REG_INDIRECTWRSTARTADDR);
+
+       reg = readl(reg_base + CQSPI_REG_SIZE);
+       reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
+       reg |= (nor->addr_width - 1);
+       writel(reg, reg_base + CQSPI_REG_SIZE);
+       return 0;
+}
+
+static int cqspi_indirect_write_execute(struct spi_nor *nor,
+                                       const u8 *txbuf, const unsigned n_tx)
+{
+       const unsigned int page_size = nor->page_size;
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *reg_base = cqspi->iobase;
+       unsigned int remaining = n_tx;
+       unsigned int write_bytes;
+       int ret;
+
+       writel(remaining, reg_base + CQSPI_REG_INDIRECTWRBYTES);
+
+       /* Clear all interrupts. */
+       writel(CQSPI_IRQ_STATUS_MASK, reg_base + CQSPI_REG_IRQSTATUS);
+
+       writel(CQSPI_IRQ_MASK_WR, reg_base + CQSPI_REG_IRQMASK);
+
+       reinit_completion(&cqspi->transfer_complete);
+       writel(CQSPI_REG_INDIRECTWR_START_MASK,
+              reg_base + CQSPI_REG_INDIRECTWR);
+
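+       /*
+        * Push the data into the write FIFO one flash page (or final
+        * partial page) at a time, waiting for the write interrupt before
+        * queueing the next chunk.
+        */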
+       while (remaining > 0) {
+               write_bytes = remaining > page_size ? page_size : remaining;
+               writesl(cqspi->ahb_base, txbuf, DIV_ROUND_UP(write_bytes, 4));
+
+               ret = wait_for_completion_timeout(&cqspi->transfer_complete,
+                                                 msecs_to_jiffies
+                                                 (CQSPI_TIMEOUT_MS));
+               if (!ret) {
+                       dev_err(nor->dev, "Indirect write timeout\n");
+                       ret = -ETIMEDOUT;
+                       goto failwr;
+               }
+
+               txbuf += write_bytes;
+               remaining -= write_bytes;
+
+               if (remaining > 0)
+                       reinit_completion(&cqspi->transfer_complete);
+       }
+
+       /* Check indirect done status */
+       ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTWR,
+                                CQSPI_REG_INDIRECTWR_DONE_MASK, 0);
+       if (ret) {
+               dev_err(nor->dev,
+                       "Indirect write completion error (%i)\n", ret);
+               goto failwr;
+       }
+
+       /* Disable interrupt. */
+       writel(0, reg_base + CQSPI_REG_IRQMASK);
+
+       /* Clear indirect completion status */
+       writel(CQSPI_REG_INDIRECTWR_DONE_MASK, reg_base + CQSPI_REG_INDIRECTWR);
+
+       cqspi_wait_idle(cqspi);
+
+       return 0;
+
+failwr:
+       /* Disable interrupt. */
+       writel(0, reg_base + CQSPI_REG_IRQMASK);
+
+       /* Cancel the indirect write */
+       writel(CQSPI_REG_INDIRECTWR_CANCEL_MASK,
+              reg_base + CQSPI_REG_INDIRECTWR);
+       return ret;
+}
+
+static void cqspi_chipselect(struct spi_nor *nor)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *reg_base = cqspi->iobase;
+       unsigned int chip_select = f_pdata->cs;
+       unsigned int reg;
+
+       reg = readl(reg_base + CQSPI_REG_CONFIG);
+       if (cqspi->is_decoded_cs) {
+               reg |= CQSPI_REG_CONFIG_DECODE_MASK;
+       } else {
+               reg &= ~CQSPI_REG_CONFIG_DECODE_MASK;
+
+               /* Convert CS if without decoder.
+                * CS0 to 4b'1110
+                * CS1 to 4b'1101
+                * CS2 to 4b'1011
+                * CS3 to 4b'0111
+                */
+               chip_select = 0xF & ~(1 << chip_select);
+       }
+
+       reg &= ~(CQSPI_REG_CONFIG_CHIPSELECT_MASK
+                << CQSPI_REG_CONFIG_CHIPSELECT_LSB);
+       reg |= (chip_select & CQSPI_REG_CONFIG_CHIPSELECT_MASK)
+           << CQSPI_REG_CONFIG_CHIPSELECT_LSB;
+       writel(reg, reg_base + CQSPI_REG_CONFIG);
+}
+
+static void cqspi_configure_cs_and_sizes(struct spi_nor *nor)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *iobase = cqspi->iobase;
+       unsigned int reg;
+
+       /* configure page size and block size. */
+       reg = readl(iobase + CQSPI_REG_SIZE);
+       reg &= ~(CQSPI_REG_SIZE_PAGE_MASK << CQSPI_REG_SIZE_PAGE_LSB);
+       reg &= ~(CQSPI_REG_SIZE_BLOCK_MASK << CQSPI_REG_SIZE_BLOCK_LSB);
+       reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
+       reg |= (nor->page_size << CQSPI_REG_SIZE_PAGE_LSB);
+       reg |= (ilog2(nor->mtd.erasesize) << CQSPI_REG_SIZE_BLOCK_LSB);
+       reg |= (nor->addr_width - 1);
+       writel(reg, iobase + CQSPI_REG_SIZE);
+
+       /* configure the chip select */
+       cqspi_chipselect(nor);
+
+       /* Store the new configuration of the controller */
+       cqspi->current_page_size = nor->page_size;
+       cqspi->current_erase_size = nor->mtd.erasesize;
+       cqspi->current_addr_width = nor->addr_width;
+}
+
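+/*
+ * Convert a delay expressed in nanoseconds into reference clock ticks,
+ * rounding up.  For example (values assumed purely for illustration), a
+ * 400 MHz reference clock and ns_val = 50 give
+ * DIV_ROUND_UP(400000 * 50, 1000000) = 20 ticks.
+ */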
+static unsigned int calculate_ticks_for_ns(const unsigned int ref_clk_hz,
+                                          const unsigned int ns_val)
+{
+       unsigned int ticks;
+
+       ticks = ref_clk_hz / 1000;      /* kHz */
+       ticks = DIV_ROUND_UP(ticks * ns_val, 1000000);
+
+       return ticks;
+}
+
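+/*
+ * Program the per-flash chip-select timing.  The four values are the
+ * cdns,tshsl-ns, cdns,tchsh-ns, cdns,tslch-ns and cdns,tsd2d-ns device
+ * tree properties, each converted from nanoseconds to reference clock
+ * ticks before being packed into the delay register.
+ */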
+static void cqspi_delay(struct spi_nor *nor)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       void __iomem *iobase = cqspi->iobase;
+       const unsigned int ref_clk_hz = cqspi->master_ref_clk_hz;
+       unsigned int tshsl, tchsh, tslch, tsd2d;
+       unsigned int reg;
+       unsigned int tsclk;
+
+       /* calculate the number of ref ticks for one sclk tick */
+       tsclk = DIV_ROUND_UP(ref_clk_hz, cqspi->sclk);
+
+       tshsl = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tshsl_ns);
+       /* this particular value must be at least one sclk */
+       if (tshsl < tsclk)
+               tshsl = tsclk;
+
+       tchsh = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tchsh_ns);
+       tslch = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tslch_ns);
+       tsd2d = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tsd2d_ns);
+
+       reg = (tshsl & CQSPI_REG_DELAY_TSHSL_MASK)
+              << CQSPI_REG_DELAY_TSHSL_LSB;
+       reg |= (tchsh & CQSPI_REG_DELAY_TCHSH_MASK)
+               << CQSPI_REG_DELAY_TCHSH_LSB;
+       reg |= (tslch & CQSPI_REG_DELAY_TSLCH_MASK)
+               << CQSPI_REG_DELAY_TSLCH_LSB;
+       reg |= (tsd2d & CQSPI_REG_DELAY_TSD2D_MASK)
+               << CQSPI_REG_DELAY_TSD2D_LSB;
+       writel(reg, iobase + CQSPI_REG_DELAY);
+}
+
+static void cqspi_config_baudrate_div(struct cqspi_st *cqspi)
+{
+       const unsigned int ref_clk_hz = cqspi->master_ref_clk_hz;
+       void __iomem *reg_base = cqspi->iobase;
+       u32 reg, div;
+
+       /* Recalculate the baudrate divisor based on QSPI specification. */
+       div = DIV_ROUND_UP(ref_clk_hz, 2 * cqspi->sclk) - 1;
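+       /*
+        * Example (values assumed purely for illustration): with a 400 MHz
+        * reference clock and a requested SCLK of 50 MHz, the line above
+        * gives div = DIV_ROUND_UP(400, 2 * 50) - 1 = 3, which corresponds
+        * to dividing the reference clock by 2 * (div + 1) = 8, i.e. an
+        * actual SCLK of 50 MHz.  Rounding up keeps the result at or below
+        * the requested rate.
+        */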
+
+       reg = readl(reg_base + CQSPI_REG_CONFIG);
+       reg &= ~(CQSPI_REG_CONFIG_BAUD_MASK << CQSPI_REG_CONFIG_BAUD_LSB);
+       reg |= (div & CQSPI_REG_CONFIG_BAUD_MASK) << CQSPI_REG_CONFIG_BAUD_LSB;
+       writel(reg, reg_base + CQSPI_REG_CONFIG);
+}
+
+static void cqspi_readdata_capture(struct cqspi_st *cqspi,
+                                  const unsigned int bypass,
+                                  const unsigned int delay)
+{
+       void __iomem *reg_base = cqspi->iobase;
+       unsigned int reg;
+
+       reg = readl(reg_base + CQSPI_REG_READCAPTURE);
+
+       if (bypass)
+               reg |= (1 << CQSPI_REG_READCAPTURE_BYPASS_LSB);
+       else
+               reg &= ~(1 << CQSPI_REG_READCAPTURE_BYPASS_LSB);
+
+       reg &= ~(CQSPI_REG_READCAPTURE_DELAY_MASK
+                << CQSPI_REG_READCAPTURE_DELAY_LSB);
+
+       reg |= (delay & CQSPI_REG_READCAPTURE_DELAY_MASK)
+               << CQSPI_REG_READCAPTURE_DELAY_LSB;
+
+       writel(reg, reg_base + CQSPI_REG_READCAPTURE);
+}
+
+static void cqspi_controller_enable(struct cqspi_st *cqspi, bool enable)
+{
+       void __iomem *reg_base = cqspi->iobase;
+       unsigned int reg;
+
+       reg = readl(reg_base + CQSPI_REG_CONFIG);
+
+       if (enable)
+               reg |= CQSPI_REG_CONFIG_ENABLE_MASK;
+       else
+               reg &= ~CQSPI_REG_CONFIG_ENABLE_MASK;
+
+       writel(reg, reg_base + CQSPI_REG_CONFIG);
+}
+
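+/*
+ * Apply any controller reconfiguration needed for the flash about to be
+ * accessed: the chip select, geometry, baud divisor and timings are only
+ * reprogrammed when they actually change, and the controller is disabled
+ * for the duration of the update.
+ */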
+static void cqspi_configure(struct spi_nor *nor)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+       const unsigned int sclk = f_pdata->clk_rate;
+       int switch_cs = (cqspi->current_cs != f_pdata->cs);
+       int switch_ck = (cqspi->sclk != sclk);
+
+       if ((cqspi->current_page_size != nor->page_size) ||
+           (cqspi->current_erase_size != nor->mtd.erasesize) ||
+           (cqspi->current_addr_width != nor->addr_width))
+               switch_cs = 1;
+
+       if (switch_cs || switch_ck)
+               cqspi_controller_enable(cqspi, 0);
+
+       /* Switch chip select. */
+       if (switch_cs) {
+               cqspi->current_cs = f_pdata->cs;
+               cqspi_configure_cs_and_sizes(nor);
+       }
+
+       /* Setup baudrate divisor and delays */
+       if (switch_ck) {
+               cqspi->sclk = sclk;
+               cqspi_config_baudrate_div(cqspi);
+               cqspi_delay(nor);
+               cqspi_readdata_capture(cqspi, 1, f_pdata->read_delay);
+       }
+
+       if (switch_cs || switch_ck)
+               cqspi_controller_enable(cqspi, 1);
+}
+
+static int cqspi_set_protocol(struct spi_nor *nor, const int read)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+
+       f_pdata->inst_width = CQSPI_INST_TYPE_SINGLE;
+       f_pdata->addr_width = CQSPI_INST_TYPE_SINGLE;
+       f_pdata->data_width = CQSPI_INST_TYPE_SINGLE;
+
+       if (read) {
+               switch (nor->flash_read) {
+               case SPI_NOR_NORMAL:
+               case SPI_NOR_FAST:
+                       f_pdata->data_width = CQSPI_INST_TYPE_SINGLE;
+                       break;
+               case SPI_NOR_DUAL:
+                       f_pdata->data_width = CQSPI_INST_TYPE_DUAL;
+                       break;
+               case SPI_NOR_QUAD:
+                       f_pdata->data_width = CQSPI_INST_TYPE_QUAD;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+       }
+
+       cqspi_configure(nor);
+
+       return 0;
+}
+
+static ssize_t cqspi_write(struct spi_nor *nor, loff_t to,
+                          size_t len, const u_char *buf)
+{
+       int ret;
+
+       ret = cqspi_set_protocol(nor, 0);
+       if (ret)
+               return ret;
+
+       ret = cqspi_indirect_write_setup(nor, to);
+       if (ret)
+               return ret;
+
+       ret = cqspi_indirect_write_execute(nor, buf, len);
+       if (ret)
+               return ret;
+
+       return (ret < 0) ? ret : len;
+}
+
+static ssize_t cqspi_read(struct spi_nor *nor, loff_t from,
+                         size_t len, u_char *buf)
+{
+       int ret;
+
+       ret = cqspi_set_protocol(nor, 1);
+       if (ret)
+               return ret;
+
+       ret = cqspi_indirect_read_setup(nor, from);
+       if (ret)
+               return ret;
+
+       ret = cqspi_indirect_read_execute(nor, buf, len);
+       if (ret)
+               return ret;
+
+       return (ret < 0) ? ret : len;
+}
+
+static int cqspi_erase(struct spi_nor *nor, loff_t offs)
+{
+       int ret;
+
+       ret = cqspi_set_protocol(nor, 0);
+       if (ret)
+               return ret;
+
+       /* Send write enable, then erase commands. */
+       ret = nor->write_reg(nor, SPINOR_OP_WREN, NULL, 0);
+       if (ret)
+               return ret;
+
+       /* Set up command buffer. */
+       ret = cqspi_command_write_addr(nor, nor->erase_opcode, offs);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+static int cqspi_prep(struct spi_nor *nor, enum spi_nor_ops ops)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+
+       mutex_lock(&cqspi->bus_mutex);
+
+       return 0;
+}
+
+static void cqspi_unprep(struct spi_nor *nor, enum spi_nor_ops ops)
+{
+       struct cqspi_flash_pdata *f_pdata = nor->priv;
+       struct cqspi_st *cqspi = f_pdata->cqspi;
+
+       mutex_unlock(&cqspi->bus_mutex);
+}
+
+static int cqspi_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
+{
+       int ret;
+
+       ret = cqspi_set_protocol(nor, 0);
+       if (!ret)
+               ret = cqspi_command_read(nor, &opcode, 1, buf, len);
+
+       return ret;
+}
+
+static int cqspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
+{
+       int ret;
+
+       ret = cqspi_set_protocol(nor, 0);
+       if (!ret)
+               ret = cqspi_command_write(nor, opcode, buf, len);
+
+       return ret;
+}
+
+static int cqspi_of_get_flash_pdata(struct platform_device *pdev,
+                                   struct cqspi_flash_pdata *f_pdata,
+                                   struct device_node *np)
+{
+       if (of_property_read_u32(np, "cdns,read-delay", &f_pdata->read_delay)) {
+               dev_err(&pdev->dev, "couldn't determine read-delay\n");
+               return -ENXIO;
+       }
+
+       if (of_property_read_u32(np, "cdns,tshsl-ns", &f_pdata->tshsl_ns)) {
+               dev_err(&pdev->dev, "couldn't determine tshsl-ns\n");
+               return -ENXIO;
+       }
+
+       if (of_property_read_u32(np, "cdns,tsd2d-ns", &f_pdata->tsd2d_ns)) {
+               dev_err(&pdev->dev, "couldn't determine tsd2d-ns\n");
+               return -ENXIO;
+       }
+
+       if (of_property_read_u32(np, "cdns,tchsh-ns", &f_pdata->tchsh_ns)) {
+               dev_err(&pdev->dev, "couldn't determine tchsh-ns\n");
+               return -ENXIO;
+       }
+
+       if (of_property_read_u32(np, "cdns,tslch-ns", &f_pdata->tslch_ns)) {
+               dev_err(&pdev->dev, "couldn't determine tslch-ns\n");
+               return -ENXIO;
+       }
+
+       if (of_property_read_u32(np, "spi-max-frequency", &f_pdata->clk_rate)) {
+               dev_err(&pdev->dev, "couldn't determine spi-max-frequency\n");
+               return -ENXIO;
+       }
+
+       return 0;
+}
+
+static int cqspi_of_get_pdata(struct platform_device *pdev)
+{
+       struct device_node *np = pdev->dev.of_node;
+       struct cqspi_st *cqspi = platform_get_drvdata(pdev);
+
+       cqspi->is_decoded_cs = of_property_read_bool(np, "cdns,is-decoded-cs");
+
+       if (of_property_read_u32(np, "cdns,fifo-depth", &cqspi->fifo_depth)) {
+               dev_err(&pdev->dev, "couldn't determine fifo-depth\n");
+               return -ENXIO;
+       }
+
+       if (of_property_read_u32(np, "cdns,fifo-width", &cqspi->fifo_width)) {
+               dev_err(&pdev->dev, "couldn't determine fifo-width\n");
+               return -ENXIO;
+       }
+
+       if (of_property_read_u32(np, "cdns,trigger-address",
+                                &cqspi->trigger_address)) {
+               dev_err(&pdev->dev, "couldn't determine trigger-address\n");
+               return -ENXIO;
+       }
+
+       return 0;
+}
+
+static void cqspi_controller_init(struct cqspi_st *cqspi)
+{
+       cqspi_controller_enable(cqspi, 0);
+
+       /* Configure the remap address register, no remap */
+       writel(0, cqspi->iobase + CQSPI_REG_REMAP);
+
+       /* Disable all interrupts. */
+       writel(0, cqspi->iobase + CQSPI_REG_IRQMASK);
+
+       /* Configure the SRAM split to 1:1. */
+       writel(cqspi->fifo_depth / 2, cqspi->iobase + CQSPI_REG_SRAMPARTITION);
+
+       /* Load indirect trigger address. */
+       writel(cqspi->trigger_address,
+              cqspi->iobase + CQSPI_REG_INDIRECTTRIGGER);
+
+       /* Program read watermark -- 1/2 of the FIFO. */
+       writel(cqspi->fifo_depth * cqspi->fifo_width / 2,
+              cqspi->iobase + CQSPI_REG_INDIRECTRDWATERMARK);
+       /* Program write watermark -- 1/8 of the FIFO. */
+       writel(cqspi->fifo_depth * cqspi->fifo_width / 8,
+              cqspi->iobase + CQSPI_REG_INDIRECTWRWATERMARK);
+
+       cqspi_controller_enable(cqspi, 1);
+}
+
+static int cqspi_setup_flash(struct cqspi_st *cqspi, struct device_node *np)
+{
+       struct platform_device *pdev = cqspi->pdev;
+       struct device *dev = &pdev->dev;
+       struct cqspi_flash_pdata *f_pdata;
+       struct spi_nor *nor;
+       struct mtd_info *mtd;
+       unsigned int cs;
+       int i, ret;
+
+       /* Get flash device data */
+       for_each_available_child_of_node(dev->of_node, np) {
+               if (of_property_read_u32(np, "reg", &cs)) {
+                       dev_err(dev, "Couldn't determine chip select.\n");
+                       ret = -ENXIO;
+                       goto err;
+               }
+
+               if (cs >= CQSPI_MAX_CHIPSELECT) {
+                       dev_err(dev, "Chip select %d out of range.\n", cs);
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               f_pdata = &cqspi->f_pdata[cs];
+               f_pdata->cqspi = cqspi;
+               f_pdata->cs = cs;
+
+               ret = cqspi_of_get_flash_pdata(pdev, f_pdata, np);
+               if (ret)
+                       goto err;
+
+               nor = &f_pdata->nor;
+               mtd = &nor->mtd;
+
+               mtd->priv = nor;
+
+               nor->dev = dev;
+               spi_nor_set_flash_node(nor, np);
+               nor->priv = f_pdata;
+
+               nor->read_reg = cqspi_read_reg;
+               nor->write_reg = cqspi_write_reg;
+               nor->read = cqspi_read;
+               nor->write = cqspi_write;
+               nor->erase = cqspi_erase;
+               nor->prepare = cqspi_prep;
+               nor->unprepare = cqspi_unprep;
+
+               mtd->name = devm_kasprintf(dev, GFP_KERNEL, "%s.%d",
+                                          dev_name(dev), cs);
+               if (!mtd->name) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               ret = spi_nor_scan(nor, NULL, SPI_NOR_QUAD);
+               if (ret)
+                       goto err;
+
+               ret = mtd_device_register(mtd, NULL, 0);
+               if (ret)
+                       goto err;
+
+               f_pdata->registered = true;
+       }
+
+       return 0;
+
+err:
+       for (i = 0; i < CQSPI_MAX_CHIPSELECT; i++)
+               if (cqspi->f_pdata[i].registered)
+                       mtd_device_unregister(&cqspi->f_pdata[i].nor.mtd);
+       return ret;
+}
+
+static int cqspi_probe(struct platform_device *pdev)
+{
+       struct device_node *np = pdev->dev.of_node;
+       struct device *dev = &pdev->dev;
+       struct cqspi_st *cqspi;
+       struct resource *res;
+       struct resource *res_ahb;
+       int ret;
+       int irq;
+
+       cqspi = devm_kzalloc(dev, sizeof(*cqspi), GFP_KERNEL);
+       if (!cqspi)
+               return -ENOMEM;
+
+       mutex_init(&cqspi->bus_mutex);
+       cqspi->pdev = pdev;
+       platform_set_drvdata(pdev, cqspi);
+
+       /* Obtain configuration from OF. */
+       ret = cqspi_of_get_pdata(pdev);
+       if (ret) {
+               dev_err(dev, "Cannot get mandatory OF data.\n");
+               return -ENODEV;
+       }
+
+       /* Obtain QSPI clock. */
+       cqspi->clk = devm_clk_get(dev, NULL);
+       if (IS_ERR(cqspi->clk)) {
+               dev_err(dev, "Cannot claim QSPI clock.\n");
+               return PTR_ERR(cqspi->clk);
+       }
+
+       /* Obtain and remap controller address. */
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       cqspi->iobase = devm_ioremap_resource(dev, res);
+       if (IS_ERR(cqspi->iobase)) {
+               dev_err(dev, "Cannot remap controller address.\n");
+               return PTR_ERR(cqspi->iobase);
+       }
+
+       /* Obtain and remap AHB address. */
+       res_ahb = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+       cqspi->ahb_base = devm_ioremap_resource(dev, res_ahb);
+       if (IS_ERR(cqspi->ahb_base)) {
+               dev_err(dev, "Cannot remap AHB address.\n");
+               return PTR_ERR(cqspi->ahb_base);
+       }
+
+       init_completion(&cqspi->transfer_complete);
+
+       /* Obtain IRQ line. */
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0) {
+               dev_err(dev, "Cannot obtain IRQ.\n");
+               return -ENXIO;
+       }
+
+       ret = clk_prepare_enable(cqspi->clk);
+       if (ret) {
+               dev_err(dev, "Cannot enable QSPI clock.\n");
+               return ret;
+       }
+
+       cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
+
+       ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
+                              pdev->name, cqspi);
+       if (ret) {
+               dev_err(dev, "Cannot request IRQ.\n");
+               goto probe_irq_failed;
+       }
+
+       cqspi_wait_idle(cqspi);
+       cqspi_controller_init(cqspi);
+       cqspi->current_cs = -1;
+       cqspi->sclk = 0;
+
+       ret = cqspi_setup_flash(cqspi, np);
+       if (ret) {
+               dev_err(dev, "Cadence QSPI NOR probe failed %d\n", ret);
+               goto probe_setup_failed;
+       }
+
+       return ret;
+probe_setup_failed:
+       cqspi_controller_enable(cqspi, 0);
+probe_irq_failed:
+       clk_disable_unprepare(cqspi->clk);
+       return ret;
+}
+
+static int cqspi_remove(struct platform_device *pdev)
+{
+       struct cqspi_st *cqspi = platform_get_drvdata(pdev);
+       int i;
+
+       for (i = 0; i < CQSPI_MAX_CHIPSELECT; i++)
+               if (cqspi->f_pdata[i].registered)
+                       mtd_device_unregister(&cqspi->f_pdata[i].nor.mtd);
+
+       cqspi_controller_enable(cqspi, 0);
+
+       clk_disable_unprepare(cqspi->clk);
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int cqspi_suspend(struct device *dev)
+{
+       struct cqspi_st *cqspi = dev_get_drvdata(dev);
+
+       cqspi_controller_enable(cqspi, 0);
+       return 0;
+}
+
+static int cqspi_resume(struct device *dev)
+{
+       struct cqspi_st *cqspi = dev_get_drvdata(dev);
+
+       cqspi_controller_enable(cqspi, 1);
+       return 0;
+}
+
+static const struct dev_pm_ops cqspi_dev_pm_ops = {
+       .suspend = cqspi_suspend,
+       .resume = cqspi_resume,
+};
+
+#define CQSPI_DEV_PM_OPS       (&cqspi_dev_pm_ops)
+#else
+#define CQSPI_DEV_PM_OPS       NULL
+#endif
+
+static const struct of_device_id cqspi_dt_ids[] = {
+       {.compatible = "cdns,qspi-nor",},
+       { /* end of table */ }
+};
+
+MODULE_DEVICE_TABLE(of, cqspi_dt_ids);
+
+static struct platform_driver cqspi_platform_driver = {
+       .probe = cqspi_probe,
+       .remove = cqspi_remove,
+       .driver = {
+               .name = CQSPI_NAME,
+               .pm = CQSPI_DEV_PM_OPS,
+               .of_match_table = cqspi_dt_ids,
+       },
+};
+
+module_platform_driver(cqspi_platform_driver);
+
+MODULE_DESCRIPTION("Cadence QSPI Controller Driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:" CQSPI_NAME);
+MODULE_AUTHOR("Ley Foon Tan <lftan@altera.com>");
+MODULE_AUTHOR("Graham Moore <grmoore@opensource.altera.com>");
index 9ab2b51..5c82e4e 100644 (file)
@@ -618,9 +618,9 @@ static inline void fsl_qspi_invalid(struct fsl_qspi *q)
        qspi_writel(q, reg, q->iobase + QUADSPI_MCR);
 }
 
-static int fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor,
+static ssize_t fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor,
                                u8 opcode, unsigned int to, u32 *txbuf,
-                               unsigned count, size_t *retlen)
+                               unsigned count)
 {
        int ret, i, j;
        u32 tmp;
@@ -647,8 +647,8 @@ static int fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor,
        /* Trigger it */
        ret = fsl_qspi_runcmd(q, opcode, to, count);
 
-       if (ret == 0 && retlen)
-               *retlen += count;
+       if (ret == 0)
+               return count;
 
        return ret;
 }
@@ -859,7 +859,9 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
 
        } else if (len > 0) {
                ret = fsl_qspi_nor_write(q, nor, opcode, 0,
-                                       (u32 *)buf, len, NULL);
+                                       (u32 *)buf, len);
+               if (ret > 0)
+                       return 0;
        } else {
                dev_err(q->dev, "invalid cmd %d\n", opcode);
                ret = -EINVAL;
@@ -868,20 +870,20 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
        return ret;
 }
 
-static void fsl_qspi_write(struct spi_nor *nor, loff_t to,
-               size_t len, size_t *retlen, const u_char *buf)
+static ssize_t fsl_qspi_write(struct spi_nor *nor, loff_t to,
+                             size_t len, const u_char *buf)
 {
        struct fsl_qspi *q = nor->priv;
-
-       fsl_qspi_nor_write(q, nor, nor->program_opcode, to,
-                               (u32 *)buf, len, retlen);
+       ssize_t ret = fsl_qspi_nor_write(q, nor, nor->program_opcode, to,
+                                        (u32 *)buf, len);
 
        /* invalid the data in the AHB buffer. */
        fsl_qspi_invalid(q);
+       return ret;
 }
 
-static int fsl_qspi_read(struct spi_nor *nor, loff_t from,
-               size_t len, size_t *retlen, u_char *buf)
+static ssize_t fsl_qspi_read(struct spi_nor *nor, loff_t from,
+                            size_t len, u_char *buf)
 {
        struct fsl_qspi *q = nor->priv;
        u8 cmd = nor->read_opcode;
@@ -923,8 +925,7 @@ static int fsl_qspi_read(struct spi_nor *nor, loff_t from,
        memcpy(buf, q->ahb_addr + q->chip_base_addr + from - q->memmap_offs,
                len);
 
-       *retlen += len;
-       return 0;
+       return len;
 }
 
 static int fsl_qspi_erase(struct spi_nor *nor, loff_t offs)
diff --git a/drivers/mtd/spi-nor/hisi-sfc.c b/drivers/mtd/spi-nor/hisi-sfc.c
new file mode 100644 (file)
index 0000000..20378b0
--- /dev/null
@@ -0,0 +1,489 @@
+/*
+ * HiSilicon SPI NOR Flash Controller Driver
+ *
+ * Copyright (c) 2015-2016 HiSilicon Technologies Co., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/spi-nor.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+/* Hardware register offsets and field definitions */
+#define FMC_CFG                                0x00
+#define FMC_CFG_OP_MODE_MASK           BIT_MASK(0)
+#define FMC_CFG_OP_MODE_BOOT           0
+#define FMC_CFG_OP_MODE_NORMAL         1
+#define FMC_CFG_FLASH_SEL(type)                (((type) & 0x3) << 1)
+#define FMC_CFG_FLASH_SEL_MASK         0x6
+#define FMC_ECC_TYPE(type)             (((type) & 0x7) << 5)
+#define FMC_ECC_TYPE_MASK              GENMASK(7, 5)
+#define SPI_NOR_ADDR_MODE_MASK         BIT_MASK(10)
+#define SPI_NOR_ADDR_MODE_3BYTES       (0x0 << 10)
+#define SPI_NOR_ADDR_MODE_4BYTES       (0x1 << 10)
+#define FMC_GLOBAL_CFG                 0x04
+#define FMC_GLOBAL_CFG_WP_ENABLE       BIT(6)
+#define FMC_SPI_TIMING_CFG             0x08
+#define TIMING_CFG_TCSH(nr)            (((nr) & 0xf) << 8)
+#define TIMING_CFG_TCSS(nr)            (((nr) & 0xf) << 4)
+#define TIMING_CFG_TSHSL(nr)           ((nr) & 0xf)
+#define CS_HOLD_TIME                   0x6
+#define CS_SETUP_TIME                  0x6
+#define CS_DESELECT_TIME               0xf
+#define FMC_INT                                0x18
+#define FMC_INT_OP_DONE                        BIT(0)
+#define FMC_INT_CLR                    0x20
+#define FMC_CMD                                0x24
+#define FMC_CMD_CMD1(cmd)              ((cmd) & 0xff)
+#define FMC_ADDRL                      0x2c
+#define FMC_OP_CFG                     0x30
+#define OP_CFG_FM_CS(cs)               ((cs) << 11)
+#define OP_CFG_MEM_IF_TYPE(type)       (((type) & 0x7) << 7)
+#define OP_CFG_ADDR_NUM(addr)          (((addr) & 0x7) << 4)
+#define OP_CFG_DUMMY_NUM(dummy)                ((dummy) & 0xf)
+#define FMC_DATA_NUM                   0x38
+#define FMC_DATA_NUM_CNT(cnt)          ((cnt) & GENMASK(13, 0))
+#define FMC_OP                         0x3c
+#define FMC_OP_DUMMY_EN                        BIT(8)
+#define FMC_OP_CMD1_EN                 BIT(7)
+#define FMC_OP_ADDR_EN                 BIT(6)
+#define FMC_OP_WRITE_DATA_EN           BIT(5)
+#define FMC_OP_READ_DATA_EN            BIT(2)
+#define FMC_OP_READ_STATUS_EN          BIT(1)
+#define FMC_OP_REG_OP_START            BIT(0)
+#define FMC_DMA_LEN                    0x40
+#define FMC_DMA_LEN_SET(len)           ((len) & GENMASK(27, 0))
+#define FMC_DMA_SADDR_D0               0x4c
+#define HIFMC_DMA_MAX_LEN              (4096)
+#define HIFMC_DMA_MASK                 (HIFMC_DMA_MAX_LEN - 1)
+#define FMC_OP_DMA                     0x68
+#define OP_CTRL_RD_OPCODE(code)                (((code) & 0xff) << 16)
+#define OP_CTRL_WR_OPCODE(code)                (((code) & 0xff) << 8)
+#define OP_CTRL_RW_OP(op)              ((op) << 1)
+#define OP_CTRL_DMA_OP_READY           BIT(0)
+#define FMC_OP_READ                    0x0
+#define FMC_OP_WRITE                   0x1
+#define FMC_WAIT_TIMEOUT               1000000
+
+enum hifmc_iftype {
+       IF_TYPE_STD,
+       IF_TYPE_DUAL,
+       IF_TYPE_DIO,
+       IF_TYPE_QUAD,
+       IF_TYPE_QIO,
+};
+
+struct hifmc_priv {
+       u32 chipselect;
+       u32 clkrate;
+       struct hifmc_host *host;
+};
+
+#define HIFMC_MAX_CHIP_NUM             2
+struct hifmc_host {
+       struct device *dev;
+       struct mutex lock;
+
+       void __iomem *regbase;
+       void __iomem *iobase;
+       struct clk *clk;
+       void *buffer;
+       dma_addr_t dma_buffer;
+
+       struct spi_nor  *nor[HIFMC_MAX_CHIP_NUM];
+       u32 num_chip;
+};
+
+static inline int wait_op_finish(struct hifmc_host *host)
+{
+       u32 reg;
+
+       return readl_poll_timeout(host->regbase + FMC_INT, reg,
+               (reg & FMC_INT_OP_DONE), 0, FMC_WAIT_TIMEOUT);
+}
+
+static int get_if_type(enum read_mode flash_read)
+{
+       enum hifmc_iftype if_type;
+
+       switch (flash_read) {
+       case SPI_NOR_DUAL:
+               if_type = IF_TYPE_DUAL;
+               break;
+       case SPI_NOR_QUAD:
+               if_type = IF_TYPE_QUAD;
+               break;
+       case SPI_NOR_NORMAL:
+       case SPI_NOR_FAST:
+       default:
+               if_type = IF_TYPE_STD;
+               break;
+       }
+
+       return if_type;
+}
+
+static void hisi_spi_nor_init(struct hifmc_host *host)
+{
+       u32 reg;
+
+       reg = TIMING_CFG_TCSH(CS_HOLD_TIME)
+               | TIMING_CFG_TCSS(CS_SETUP_TIME)
+               | TIMING_CFG_TSHSL(CS_DESELECT_TIME);
+       writel(reg, host->regbase + FMC_SPI_TIMING_CFG);
+}
+
+static int hisi_spi_nor_prep(struct spi_nor *nor, enum spi_nor_ops ops)
+{
+       struct hifmc_priv *priv = nor->priv;
+       struct hifmc_host *host = priv->host;
+       int ret;
+
+       mutex_lock(&host->lock);
+
+       ret = clk_set_rate(host->clk, priv->clkrate);
+       if (ret)
+               goto out;
+
+       ret = clk_prepare_enable(host->clk);
+       if (ret)
+               goto out;
+
+       return 0;
+
+out:
+       mutex_unlock(&host->lock);
+       return ret;
+}
+
+static void hisi_spi_nor_unprep(struct spi_nor *nor, enum spi_nor_ops ops)
+{
+       struct hifmc_priv *priv = nor->priv;
+       struct hifmc_host *host = priv->host;
+
+       clk_disable_unprepare(host->clk);
+       mutex_unlock(&host->lock);
+}
+
+static int hisi_spi_nor_op_reg(struct spi_nor *nor,
+                               u8 opcode, int len, u8 optype)
+{
+       struct hifmc_priv *priv = nor->priv;
+       struct hifmc_host *host = priv->host;
+       u32 reg;
+
+       reg = FMC_CMD_CMD1(opcode);
+       writel(reg, host->regbase + FMC_CMD);
+
+       reg = FMC_DATA_NUM_CNT(len);
+       writel(reg, host->regbase + FMC_DATA_NUM);
+
+       reg = OP_CFG_FM_CS(priv->chipselect);
+       writel(reg, host->regbase + FMC_OP_CFG);
+
+       writel(0xff, host->regbase + FMC_INT_CLR);
+       reg = FMC_OP_CMD1_EN | FMC_OP_REG_OP_START | optype;
+       writel(reg, host->regbase + FMC_OP);
+
+       return wait_op_finish(host);
+}
+
+static int hisi_spi_nor_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf,
+               int len)
+{
+       struct hifmc_priv *priv = nor->priv;
+       struct hifmc_host *host = priv->host;
+       int ret;
+
+       ret = hisi_spi_nor_op_reg(nor, opcode, len, FMC_OP_READ_DATA_EN);
+       if (ret)
+               return ret;
+
+       memcpy_fromio(buf, host->iobase, len);
+       return 0;
+}
+
+static int hisi_spi_nor_write_reg(struct spi_nor *nor, u8 opcode,
+                               u8 *buf, int len)
+{
+       struct hifmc_priv *priv = nor->priv;
+       struct hifmc_host *host = priv->host;
+
+       if (len)
+               memcpy_toio(host->iobase, buf, len);
+
+       return hisi_spi_nor_op_reg(nor, opcode, len, FMC_OP_WRITE_DATA_EN);
+}
+
+static int hisi_spi_nor_dma_transfer(struct spi_nor *nor, loff_t start_off,
+               dma_addr_t dma_buf, size_t len, u8 op_type)
+{
+       struct hifmc_priv *priv = nor->priv;
+       struct hifmc_host *host = priv->host;
+       u8 if_type = 0;
+       u32 reg;
+
+       reg = readl(host->regbase + FMC_CFG);
+       reg &= ~(FMC_CFG_OP_MODE_MASK | SPI_NOR_ADDR_MODE_MASK);
+       reg |= FMC_CFG_OP_MODE_NORMAL;
+       reg |= (nor->addr_width == 4) ? SPI_NOR_ADDR_MODE_4BYTES
+               : SPI_NOR_ADDR_MODE_3BYTES;
+       writel(reg, host->regbase + FMC_CFG);
+
+       writel(start_off, host->regbase + FMC_ADDRL);
+       writel(dma_buf, host->regbase + FMC_DMA_SADDR_D0);
+       writel(FMC_DMA_LEN_SET(len), host->regbase + FMC_DMA_LEN);
+
+       reg = OP_CFG_FM_CS(priv->chipselect);
+       if_type = get_if_type(nor->flash_read);
+       reg |= OP_CFG_MEM_IF_TYPE(if_type);
+       if (op_type == FMC_OP_READ)
+               reg |= OP_CFG_DUMMY_NUM(nor->read_dummy >> 3);
+       writel(reg, host->regbase + FMC_OP_CFG);
+
+       writel(0xff, host->regbase + FMC_INT_CLR);
+       reg = OP_CTRL_RW_OP(op_type) | OP_CTRL_DMA_OP_READY;
+       reg |= (op_type == FMC_OP_READ)
+               ? OP_CTRL_RD_OPCODE(nor->read_opcode)
+               : OP_CTRL_WR_OPCODE(nor->program_opcode);
+       writel(reg, host->regbase + FMC_OP_DMA);
+
+       return wait_op_finish(host);
+}
+
+static ssize_t hisi_spi_nor_read(struct spi_nor *nor, loff_t from, size_t len,
+               u_char *read_buf)
+{
+       struct hifmc_priv *priv = nor->priv;
+       struct hifmc_host *host = priv->host;
+       size_t offset;
+       int ret;
+
+       for (offset = 0; offset < len; offset += HIFMC_DMA_MAX_LEN) {
+               size_t trans = min_t(size_t, HIFMC_DMA_MAX_LEN, len - offset);
+
+               ret = hisi_spi_nor_dma_transfer(nor,
+                       from + offset, host->dma_buffer, trans, FMC_OP_READ);
+               if (ret) {
+                       dev_warn(nor->dev, "DMA read timeout\n");
+                       return ret;
+               }
+               memcpy(read_buf + offset, host->buffer, trans);
+       }
+
+       return len;
+}
+
+static ssize_t hisi_spi_nor_write(struct spi_nor *nor, loff_t to,
+                       size_t len, const u_char *write_buf)
+{
+       struct hifmc_priv *priv = nor->priv;
+       struct hifmc_host *host = priv->host;
+       size_t offset;
+       int ret;
+
+       for (offset = 0; offset < len; offset += HIFMC_DMA_MAX_LEN) {
+               size_t trans = min_t(size_t, HIFMC_DMA_MAX_LEN, len - offset);
+
+               memcpy(host->buffer, write_buf + offset, trans);
+               ret = hisi_spi_nor_dma_transfer(nor,
+                       to + offset, host->dma_buffer, trans, FMC_OP_WRITE);
+               if (ret) {
+                       dev_warn(nor->dev, "DMA write timeout\n");
+                       return ret;
+               }
+       }
+
+       return len;
+}
+
+/*
+ * Get the SPI flash device information and register it as an MTD device.
+ */
+static int hisi_spi_nor_register(struct device_node *np,
+                               struct hifmc_host *host)
+{
+       struct device *dev = host->dev;
+       struct spi_nor *nor;
+       struct hifmc_priv *priv;
+       struct mtd_info *mtd;
+       int ret;
+
+       nor = devm_kzalloc(dev, sizeof(*nor), GFP_KERNEL);
+       if (!nor)
+               return -ENOMEM;
+
+       nor->dev = dev;
+       spi_nor_set_flash_node(nor, np);
+
+       priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       ret = of_property_read_u32(np, "reg", &priv->chipselect);
+       if (ret) {
+               dev_err(dev, "There's no reg property for %s\n",
+                       np->full_name);
+               return ret;
+       }
+
+       ret = of_property_read_u32(np, "spi-max-frequency",
+                       &priv->clkrate);
+       if (ret) {
+               dev_err(dev, "There's no spi-max-frequency property for %s\n",
+                       np->full_name);
+               return ret;
+       }
+       priv->host = host;
+       nor->priv = priv;
+
+       nor->prepare = hisi_spi_nor_prep;
+       nor->unprepare = hisi_spi_nor_unprep;
+       nor->read_reg = hisi_spi_nor_read_reg;
+       nor->write_reg = hisi_spi_nor_write_reg;
+       nor->read = hisi_spi_nor_read;
+       nor->write = hisi_spi_nor_write;
+       nor->erase = NULL;
+       ret = spi_nor_scan(nor, NULL, SPI_NOR_QUAD);
+       if (ret)
+               return ret;
+
+       mtd = &nor->mtd;
+       mtd->name = np->name;
+       ret = mtd_device_register(mtd, NULL, 0);
+       if (ret)
+               return ret;
+
+       host->nor[host->num_chip] = nor;
+       host->num_chip++;
+       return 0;
+}
+
+static void hisi_spi_nor_unregister_all(struct hifmc_host *host)
+{
+       int i;
+
+       for (i = 0; i < host->num_chip; i++)
+               mtd_device_unregister(&host->nor[i]->mtd);
+}
+
+static int hisi_spi_nor_register_all(struct hifmc_host *host)
+{
+       struct device *dev = host->dev;
+       struct device_node *np;
+       int ret;
+
+       for_each_available_child_of_node(dev->of_node, np) {
+               ret = hisi_spi_nor_register(np, host);
+               if (ret)
+                       goto fail;
+
+               if (host->num_chip == HIFMC_MAX_CHIP_NUM) {
+                       dev_warn(dev, "Flash device number exceeds the maximum chipselect number\n");
+                       break;
+               }
+       }
+
+       return 0;
+
+fail:
+       hisi_spi_nor_unregister_all(host);
+       return ret;
+}
+
+static int hisi_spi_nor_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct resource *res;
+       struct hifmc_host *host;
+       int ret;
+
+       host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL);
+       if (!host)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, host);
+       host->dev = dev;
+
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "control");
+       host->regbase = devm_ioremap_resource(dev, res);
+       if (IS_ERR(host->regbase))
+               return PTR_ERR(host->regbase);
+
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "memory");
+       host->iobase = devm_ioremap_resource(dev, res);
+       if (IS_ERR(host->iobase))
+               return PTR_ERR(host->iobase);
+
+       host->clk = devm_clk_get(dev, NULL);
+       if (IS_ERR(host->clk))
+               return PTR_ERR(host->clk);
+
+       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+       if (ret) {
+               dev_warn(dev, "Unable to set dma mask\n");
+               return ret;
+       }
+
+       host->buffer = dmam_alloc_coherent(dev, HIFMC_DMA_MAX_LEN,
+                       &host->dma_buffer, GFP_KERNEL);
+       if (!host->buffer)
+               return -ENOMEM;
+
+       mutex_init(&host->lock);
+       clk_prepare_enable(host->clk);
+       hisi_spi_nor_init(host);
+       ret = hisi_spi_nor_register_all(host);
+       if (ret)
+               mutex_destroy(&host->lock);
+
+       clk_disable_unprepare(host->clk);
+       return ret;
+}
+
+static int hisi_spi_nor_remove(struct platform_device *pdev)
+{
+       struct hifmc_host *host = platform_get_drvdata(pdev);
+
+       hisi_spi_nor_unregister_all(host);
+       mutex_destroy(&host->lock);
+       clk_disable_unprepare(host->clk);
+       return 0;
+}
+
+static const struct of_device_id hisi_spi_nor_dt_ids[] = {
+       { .compatible = "hisilicon,fmc-spi-nor"},
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, hisi_spi_nor_dt_ids);
+
+static struct platform_driver hisi_spi_nor_driver = {
+       .driver = {
+               .name   = "hisi-sfc",
+               .of_match_table = hisi_spi_nor_dt_ids,
+       },
+       .probe  = hisi_spi_nor_probe,
+       .remove = hisi_spi_nor_remove,
+};
+module_platform_driver(hisi_spi_nor_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("HiSilicon SPI Nor Flash Controller Driver");
index 8bed1a4..e661877 100644 (file)
@@ -21,7 +21,6 @@
 #include <linux/ioport.h>
 #include <linux/math64.h>
 #include <linux/module.h>
-#include <linux/mtd/mtd.h>
 #include <linux/mutex.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
@@ -243,8 +242,8 @@ static void mt8173_nor_set_addr(struct mt8173_nor *mt8173_nor, u32 addr)
        writeb(addr & 0xff, mt8173_nor->base + MTK_NOR_RADR3_REG);
 }
 
-static int mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length,
-                          size_t *retlen, u_char *buffer)
+static ssize_t mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length,
+                              u_char *buffer)
 {
        int i, ret;
        int addr = (int)from;
@@ -255,13 +254,13 @@ static int mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length,
        mt8173_nor_set_read_mode(mt8173_nor);
        mt8173_nor_set_addr(mt8173_nor, addr);
 
-       for (i = 0; i < length; i++, (*retlen)++) {
+       for (i = 0; i < length; i++) {
                ret = mt8173_nor_execute_cmd(mt8173_nor, MTK_NOR_PIO_READ_CMD);
                if (ret < 0)
                        return ret;
                buf[i] = readb(mt8173_nor->base + MTK_NOR_RDATA_REG);
        }
-       return 0;
+       return length;
 }
 
 static int mt8173_nor_write_single_byte(struct mt8173_nor *mt8173_nor,
@@ -297,36 +296,44 @@ static int mt8173_nor_write_buffer(struct mt8173_nor *mt8173_nor, int addr,
        return mt8173_nor_execute_cmd(mt8173_nor, MTK_NOR_WR_CMD);
 }
 
-static void mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len,
-                            size_t *retlen, const u_char *buf)
+static ssize_t mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len,
+                               const u_char *buf)
 {
        int ret;
        struct mt8173_nor *mt8173_nor = nor->priv;
+       size_t i;
 
        ret = mt8173_nor_write_buffer_enable(mt8173_nor);
-       if (ret < 0)
+       if (ret < 0) {
                dev_warn(mt8173_nor->dev, "write buffer enable failed!\n");
+               return ret;
+       }
 
-       while (len >= SFLASH_WRBUF_SIZE) {
+       for (i = 0; i + SFLASH_WRBUF_SIZE <= len; i += SFLASH_WRBUF_SIZE) {
                ret = mt8173_nor_write_buffer(mt8173_nor, to, buf);
-               if (ret < 0)
+               if (ret < 0) {
                        dev_err(mt8173_nor->dev, "write buffer failed!\n");
-               len -= SFLASH_WRBUF_SIZE;
+                       return ret;
+               }
                to += SFLASH_WRBUF_SIZE;
                buf += SFLASH_WRBUF_SIZE;
-               (*retlen) += SFLASH_WRBUF_SIZE;
        }
        ret = mt8173_nor_write_buffer_disable(mt8173_nor);
-       if (ret < 0)
+       if (ret < 0) {
                dev_warn(mt8173_nor->dev, "write buffer disable failed!\n");
+               return ret;
+       }
 
-       if (len) {
-               ret = mt8173_nor_write_single_byte(mt8173_nor, to, (int)len,
-                                                  (u8 *)buf);
-               if (ret < 0)
+       if (i < len) {
+               ret = mt8173_nor_write_single_byte(mt8173_nor, to,
+                                                  (int)(len - i), (u8 *)buf);
+               if (ret < 0) {
                        dev_err(mt8173_nor->dev, "write single byte failed!\n");
-               (*retlen) += len;
+                       return ret;
+               }
        }
+
+       return len;
 }
 
 static int mt8173_nor_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
index ae428cb..73a14f4 100644 (file)
@@ -172,8 +172,8 @@ static int nxp_spifi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
        return nxp_spifi_wait_for_cmd(spifi);
 }
 
-static int nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len,
-                         size_t *retlen, u_char *buf)
+static ssize_t nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len,
+                             u_char *buf)
 {
        struct nxp_spifi *spifi = nor->priv;
        int ret;
@@ -183,24 +183,23 @@ static int nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len,
                return ret;
 
        memcpy_fromio(buf, spifi->flash_base + from, len);
-       *retlen += len;
 
-       return 0;
+       return len;
 }
 
-static void nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len,
-                           size_t *retlen, const u_char *buf)
+static ssize_t nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len,
+                              const u_char *buf)
 {
        struct nxp_spifi *spifi = nor->priv;
        u32 cmd;
        int ret;
+       size_t i;
 
        ret = nxp_spifi_set_memory_mode_off(spifi);
        if (ret)
-               return;
+               return ret;
 
        writel(to, spifi->io_base + SPIFI_ADDR);
-       *retlen += len;
 
        cmd = SPIFI_CMD_DOUT |
              SPIFI_CMD_DATALEN(len) |
@@ -209,10 +208,14 @@ static void nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len,
              SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1);
        writel(cmd, spifi->io_base + SPIFI_CMD);
 
-       while (len--)
-               writeb(*buf++, spifi->io_base + SPIFI_DATA);
+       for (i = 0; i < len; i++)
+               writeb(buf[i], spifi->io_base + SPIFI_DATA);
+
+       ret = nxp_spifi_wait_for_cmd(spifi);
+       if (ret)
+               return ret;
 
-       nxp_spifi_wait_for_cmd(spifi);
+       return len;
 }
 
 static int nxp_spifi_erase(struct spi_nor *nor, loff_t offs)
index c52e455..d0fc165 100644 (file)
@@ -661,7 +661,7 @@ static int stm_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len)
        status_new = (status_old & ~mask & ~SR_TB) | val;
 
        /* Don't protect status register if we're fully unlocked */
-       if (lock_len == mtd->size)
+       if (lock_len == 0)
                status_new &= ~SR_SRWD;
 
        if (!use_top)
@@ -830,10 +830,26 @@ static const struct flash_info spi_nor_ids[] = {
        { "mb85rs1mt", INFO(0x047f27, 0, 128 * 1024, 1, SPI_NOR_NO_ERASE) },
 
        /* GigaDevice */
-       { "gd25q32", INFO(0xc84016, 0, 64 * 1024,  64, SECT_4K) },
-       { "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) },
-       { "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
-       { "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256, SECT_4K) },
+       {
+               "gd25q32", INFO(0xc84016, 0, 64 * 1024,  64,
+                       SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ |
+                       SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB)
+       },
+       {
+               "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128,
+                       SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ |
+                       SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB)
+       },
+       {
+               "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128,
+                       SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ |
+                       SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB)
+       },
+       {
+               "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256,
+                       SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ |
+                       SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB)
+       },
 
        /* Intel/Numonyx -- xxxs33b */
        { "160s33b",  INFO(0x898911, 0, 64 * 1024,  32, 0) },
@@ -871,6 +887,7 @@ static const struct flash_info spi_nor_ids[] = {
        { "n25q512a",    INFO(0x20bb20, 0, 64 * 1024, 1024, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) },
        { "n25q512ax3",  INFO(0x20ba20, 0, 64 * 1024, 1024, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) },
        { "n25q00",      INFO(0x20ba21, 0, 64 * 1024, 2048, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) },
+       { "n25q00a",     INFO(0x20bb21, 0, 64 * 1024, 2048, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) },
 
        /* PMC */
        { "pm25lv512",   INFO(0,        0, 32 * 1024,    2, SECT_4K_PMC) },
@@ -1031,8 +1048,25 @@ static int spi_nor_read(struct mtd_info *mtd, loff_t from, size_t len,
        if (ret)
                return ret;
 
-       ret = nor->read(nor, from, len, retlen, buf);
+       while (len) {
+               ret = nor->read(nor, from, len, buf);
+               if (ret == 0) {
+                       /* We shouldn't see 0-length reads */
+                       ret = -EIO;
+                       goto read_err;
+               }
+               if (ret < 0)
+                       goto read_err;
+
+               WARN_ON(ret > len);
+               *retlen += ret;
+               buf += ret;
+               from += ret;
+               len -= ret;
+       }
+       ret = 0;
 
+read_err:
        spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_READ);
        return ret;
 }
@@ -1060,10 +1094,14 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len,
                nor->program_opcode = SPINOR_OP_BP;
 
                /* write one byte. */
-               nor->write(nor, to, 1, retlen, buf);
+               ret = nor->write(nor, to, 1, buf);
+               if (ret < 0)
+                       goto sst_write_err;
+               WARN(ret != 1, "While writing 1 byte written %i bytes\n",
+                    (int)ret);
                ret = spi_nor_wait_till_ready(nor);
                if (ret)
-                       goto time_out;
+                       goto sst_write_err;
        }
        to += actual;
 
@@ -1072,10 +1110,14 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len,
                nor->program_opcode = SPINOR_OP_AAI_WP;
 
                /* write two bytes. */
-               nor->write(nor, to, 2, retlen, buf + actual);
+               ret = nor->write(nor, to, 2, buf + actual);
+               if (ret < 0)
+                       goto sst_write_err;
+               WARN(ret != 2, "While writing 2 bytes written %i bytes\n",
+                    (int)ret);
                ret = spi_nor_wait_till_ready(nor);
                if (ret)
-                       goto time_out;
+                       goto sst_write_err;
                to += 2;
                nor->sst_write_second = true;
        }
@@ -1084,21 +1126,26 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len,
        write_disable(nor);
        ret = spi_nor_wait_till_ready(nor);
        if (ret)
-               goto time_out;
+               goto sst_write_err;
 
        /* Write out trailing byte if it exists. */
        if (actual != len) {
                write_enable(nor);
 
                nor->program_opcode = SPINOR_OP_BP;
-               nor->write(nor, to, 1, retlen, buf + actual);
-
+               ret = nor->write(nor, to, 1, buf + actual);
+               if (ret < 0)
+                       goto sst_write_err;
+               WARN(ret != 1, "While writing 1 byte written %i bytes\n",
+                    (int)ret);
                ret = spi_nor_wait_till_ready(nor);
                if (ret)
-                       goto time_out;
+                       goto sst_write_err;
                write_disable(nor);
+               actual += 1;
        }
-time_out:
+sst_write_err:
+       *retlen += actual;
        spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE);
        return ret;
 }
@@ -1112,8 +1159,8 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
        size_t *retlen, const u_char *buf)
 {
        struct spi_nor *nor = mtd_to_spi_nor(mtd);
-       u32 page_offset, page_size, i;
-       int ret;
+       size_t page_offset, page_remain, i;
+       ssize_t ret;
 
        dev_dbg(nor->dev, "to 0x%08x, len %zd\n", (u32)to, len);
 
@@ -1121,35 +1168,37 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
        if (ret)
                return ret;
 
-       write_enable(nor);
-
-       page_offset = to & (nor->page_size - 1);
+       for (i = 0; i < len; ) {
+               ssize_t written;
 
-       /* do all the bytes fit onto one page? */
-       if (page_offset + len <= nor->page_size) {
-               nor->write(nor, to, len, retlen, buf);
-       } else {
+               page_offset = (to + i) & (nor->page_size - 1);
+               WARN_ONCE(page_offset,
+                         "Writing at offset %zu into a NOR page. Writing partial pages may decrease reliability and increase wear of NOR flash.",
+                         page_offset);
                /* the size of data remaining on the first page */
-               page_size = nor->page_size - page_offset;
-               nor->write(nor, to, page_size, retlen, buf);
-
-               /* write everything in nor->page_size chunks */
-               for (i = page_size; i < len; i += page_size) {
-                       page_size = len - i;
-                       if (page_size > nor->page_size)
-                               page_size = nor->page_size;
+               page_remain = min_t(size_t,
+                                   nor->page_size - page_offset, len - i);
 
-                       ret = spi_nor_wait_till_ready(nor);
-                       if (ret)
-                               goto write_err;
-
-                       write_enable(nor);
+               write_enable(nor);
+               ret = nor->write(nor, to + i, page_remain, buf + i);
+               if (ret < 0)
+                       goto write_err;
+               written = ret;
 
-                       nor->write(nor, to + i, page_size, retlen, buf + i);
+               ret = spi_nor_wait_till_ready(nor);
+               if (ret)
+                       goto write_err;
+               *retlen += written;
+               i += written;
+               if (written != page_remain) {
+                       dev_err(nor->dev,
+                               "While writing %zu bytes written %zd bytes\n",
+                               page_remain, written);
+                       ret = -EIO;
+                       goto write_err;
                }
        }
 
-       ret = spi_nor_wait_till_ready(nor);
 write_err:
        spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE);
        return ret;
index daf82ba..41b13d1 100644 (file)
@@ -380,8 +380,7 @@ static int ssfdcr_readsect(struct mtd_blktrans_dev *dev,
                " block_addr=%d\n", logic_sect_no, sectors_per_block, offset,
                block_address);
 
-       if (block_address >= ssfdc->map_len)
-               BUG();
+       BUG_ON(block_address >= ssfdc->map_len);
 
        block_address = ssfdc->logic_block_map[block_address];
 
index 09a4cca..f26dec8 100644 (file)
@@ -290,7 +290,7 @@ static int overwrite_test(void)
 
        while (opno < max_overwrite) {
 
-               err = rewrite_page(0);
+               err = write_page(0);
                if (err)
                        break;
 
index e708e36..6453148 100644 (file)
@@ -1251,7 +1251,7 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        struct alx_priv *alx;
        struct alx_hw *hw;
        bool phy_configured;
-       int bars, err;
+       int err;
 
        err = pci_enable_device_mem(pdev);
        if (err)
@@ -1271,11 +1271,10 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                }
        }
 
-       bars = pci_select_bars(pdev, IORESOURCE_MEM);
-       err = pci_request_selected_regions(pdev, bars, alx_drv_name);
+       err = pci_request_mem_regions(pdev, alx_drv_name);
        if (err) {
                dev_err(&pdev->dev,
-                       "pci_request_selected_regions failed(bars:%d)\n", bars);
+                       "pci_request_mem_regions failed\n");
                goto out_pci_disable;
        }
 
@@ -1401,7 +1400,7 @@ out_unmap:
 out_free_netdev:
        free_netdev(netdev);
 out_pci_release:
-       pci_release_selected_regions(pdev, bars);
+       pci_release_mem_regions(pdev);
 out_pci_disable:
        pci_disable_device(pdev);
        return err;
@@ -1420,8 +1419,7 @@ static void alx_remove(struct pci_dev *pdev)
 
        unregister_netdev(alx->dev);
        iounmap(hw->hw_addr);
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 
        pci_disable_pcie_error_reporting(pdev);
        pci_disable_device(pdev);
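
This and the following network/NVMe hunks all make the same substitution: the pci_select_bars()/pci_request_selected_regions() pair (and its release counterpart) is collapsed into single pci_request_mem_regions()/pci_release_mem_regions() calls. The helpers themselves are not part of this excerpt; judging by the call sites they are presumably thin wrappers along these lines (a sketch, not the actual include/linux/pci.h definitions):

#include <linux/pci.h>

/* Sketch: plausible wrappers equivalent to the open-coded pattern removed
 * from the drivers above. */
static inline int pci_request_mem_regions(struct pci_dev *pdev, const char *name)
{
	return pci_request_selected_regions(pdev,
			pci_select_bars(pdev, IORESOURCE_MEM), name);
}

static inline void pci_release_mem_regions(struct pci_dev *pdev)
{
	pci_release_selected_regions(pdev,
			pci_select_bars(pdev, IORESOURCE_MEM));
}

Either way, the change is mechanical for the drivers: request all memory BARs under the driver's name at probe time, release them on the error and remove paths.
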
index 41f32c0..02f4439 100644 (file)
@@ -7330,8 +7330,7 @@ err_flashmap:
 err_ioremap:
        free_netdev(netdev);
 err_alloc_etherdev:
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
        pci_disable_device(pdev);
@@ -7398,8 +7397,7 @@ static void e1000_remove(struct pci_dev *pdev)
        if ((adapter->hw.flash_address) &&
            (adapter->hw.mac.type < e1000_pch_spt))
                iounmap(adapter->hw.flash_address);
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 
        free_netdev(netdev);
 
index b8245c7..774a565 100644 (file)
@@ -1963,10 +1963,7 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                goto err_dma;
        }
 
-       err = pci_request_selected_regions(pdev,
-                                          pci_select_bars(pdev,
-                                                          IORESOURCE_MEM),
-                                          fm10k_driver_name);
+       err = pci_request_mem_regions(pdev, fm10k_driver_name);
        if (err) {
                dev_err(&pdev->dev,
                        "pci_request_selected_regions failed: %d\n", err);
@@ -2070,8 +2067,7 @@ err_sw_init:
 err_ioremap:
        free_netdev(netdev);
 err_alloc_netdev:
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
        pci_disable_device(pdev);
@@ -2119,8 +2115,7 @@ static void fm10k_remove(struct pci_dev *pdev)
 
        free_netdev(netdev);
 
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 
        pci_disable_pcie_error_reporting(pdev);
 
index 339d99b..81c99e1 100644 (file)
@@ -10710,8 +10710,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        }
 
        /* set up pci connections */
-       err = pci_request_selected_regions(pdev, pci_select_bars(pdev,
-                                          IORESOURCE_MEM), i40e_driver_name);
+       err = pci_request_mem_regions(pdev, i40e_driver_name);
        if (err) {
                dev_info(&pdev->dev,
                         "pci_request_selected_regions failed %d\n", err);
@@ -11208,8 +11207,7 @@ err_ioremap:
        kfree(pf);
 err_pf_alloc:
        pci_disable_pcie_error_reporting(pdev);
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
        pci_disable_device(pdev);
@@ -11320,8 +11318,7 @@ static void i40e_remove(struct pci_dev *pdev)
 
        iounmap(hw->hw_addr);
        kfree(pf);
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 
        pci_disable_pcie_error_reporting(pdev);
        pci_disable_device(pdev);
index 9bcba42..942a89f 100644 (file)
@@ -2324,9 +2324,7 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                }
        }
 
-       err = pci_request_selected_regions(pdev, pci_select_bars(pdev,
-                                          IORESOURCE_MEM),
-                                          igb_driver_name);
+       err = pci_request_mem_regions(pdev, igb_driver_name);
        if (err)
                goto err_pci_reg;
 
@@ -2750,8 +2748,7 @@ err_sw_init:
 err_ioremap:
        free_netdev(netdev);
 err_alloc_etherdev:
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
        pci_disable_device(pdev);
@@ -2916,8 +2913,7 @@ static void igb_remove(struct pci_dev *pdev)
        pci_iounmap(pdev, adapter->io_addr);
        if (hw->flash_address)
                iounmap(hw->flash_address);
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 
        kfree(adapter->shadow_vfta);
        free_netdev(netdev);
index 7871f53..5418c69 100644 (file)
@@ -9353,8 +9353,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                pci_using_dac = 0;
        }
 
-       err = pci_request_selected_regions(pdev, pci_select_bars(pdev,
-                                          IORESOURCE_MEM), ixgbe_driver_name);
+       err = pci_request_mem_regions(pdev, ixgbe_driver_name);
        if (err) {
                dev_err(&pdev->dev,
                        "pci_request_selected_regions failed 0x%x\n", err);
@@ -9740,8 +9739,7 @@ err_ioremap:
        disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state);
        free_netdev(netdev);
 err_alloc_etherdev:
-       pci_release_selected_regions(pdev,
-                                    pci_select_bars(pdev, IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
        if (!adapter || disable_dev)
@@ -9808,8 +9806,7 @@ static void ixgbe_remove(struct pci_dev *pdev)
 
 #endif
        iounmap(adapter->io_addr);
-       pci_release_selected_regions(pdev, pci_select_bars(pdev,
-                                    IORESOURCE_MEM));
+       pci_release_mem_regions(pdev);
 
        e_dev_info("complete\n");
 
index 4cb9b15..d7c33f9 100644 (file)
@@ -1661,14 +1661,9 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 
 static void nvme_dev_unmap(struct nvme_dev *dev)
 {
-       struct pci_dev *pdev = to_pci_dev(dev->dev);
-       int bars;
-
        if (dev->bar)
                iounmap(dev->bar);
-
-       bars = pci_select_bars(pdev, IORESOURCE_MEM);
-       pci_release_selected_regions(pdev, bars);
+       pci_release_mem_regions(to_pci_dev(dev->dev));
 }
 
 static void nvme_pci_disable(struct nvme_dev *dev)
@@ -1897,13 +1892,9 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 
 static int nvme_dev_map(struct nvme_dev *dev)
 {
-       int bars;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-       bars = pci_select_bars(pdev, IORESOURCE_MEM);
-       if (!bars)
-               return -ENODEV;
-       if (pci_request_selected_regions(pdev, bars, "nvme"))
+       if (pci_request_mem_regions(pdev, "nvme"))
                return -ENODEV;
 
        dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
@@ -1912,7 +1903,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
 
        return 0;
   release:
-       pci_release_selected_regions(pdev, bars);
+       pci_release_mem_regions(pdev);
        return -ENODEV;
 }
 
index 56389be..67f9916 100644 (file)
@@ -25,7 +25,7 @@ config PCI_MSI
           If you don't know what to do here, say Y.
 
 config PCI_MSI_IRQ_DOMAIN
-       bool
+       def_bool ARM || ARM64 || X86
        depends on PCI_MSI
        select GENERIC_MSI_IRQ_DOMAIN
 
index dd7cdbe..c288e5a 100644 (file)
@@ -91,6 +91,35 @@ void pci_bus_remove_resources(struct pci_bus *bus)
        }
 }
 
+int devm_request_pci_bus_resources(struct device *dev,
+                                  struct list_head *resources)
+{
+       struct resource_entry *win;
+       struct resource *parent, *res;
+       int err;
+
+       resource_list_for_each_entry(win, resources) {
+               res = win->res;
+               switch (resource_type(res)) {
+               case IORESOURCE_IO:
+                       parent = &ioport_resource;
+                       break;
+               case IORESOURCE_MEM:
+                       parent = &iomem_resource;
+                       break;
+               default:
+                       continue;
+               }
+
+               err = devm_request_resource(dev, parent, res);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(devm_request_pci_bus_resources);
+
 static struct pci_bus_region pci_32_bit = {0, 0xffffffffULL};
 #ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
 static struct pci_bus_region pci_64_bit = {0,
@@ -291,6 +320,7 @@ void pci_bus_add_device(struct pci_dev *dev)
        pci_fixup_device(pci_fixup_final, dev);
        pci_create_sysfs_dev_files(dev);
        pci_proc_attach_device(dev);
+       pci_bridge_d3_device_changed(dev);
 
        dev->match_driver = true;
        retval = device_attach(&dev->dev);
@@ -397,4 +427,3 @@ void pci_bus_put(struct pci_bus *bus)
                put_device(&bus->dev);
 }
 EXPORT_SYMBOL(pci_bus_put);
-
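
devm_request_pci_bus_resources(), added above, centralizes the request loop that host-controller drivers used to open-code: each bridge window is requested from ioport_resource or iomem_resource and released automatically with the device. A sketch of how a probe path could use it together with of_pci_get_host_bridge_resources() (the function name example_pcie_parse_ranges and the error handling are illustrative only; the Aardvark driver further down does essentially this):

#include <linux/of_pci.h>
#include <linux/pci.h>

/* Sketch: fill a host bridge resource list from DT and request the windows. */
static int example_pcie_parse_ranges(struct device *dev,
				     struct list_head *resources)
{
	resource_size_t iobase;
	int err;

	INIT_LIST_HEAD(resources);

	err = of_pci_get_host_bridge_resources(dev->of_node, 0, 0xff,
					       resources, &iobase);
	if (err)
		return err;

	err = devm_request_pci_bus_resources(dev, resources);
	if (err)
		pci_free_resource_list(resources);

	return err;
}
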
index f9832ad..66e0d71 100644 (file)
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/pci-ecam.h>
 #include <linux/slab.h>
 
-#include "ecam.h"
-
 /*
  * On 64-bit systems, we do a single ioremap for the whole config space
  * since we have enough virtual address range available.  On 32-bit, we
@@ -52,6 +51,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev,
        if (!cfg)
                return ERR_PTR(-ENOMEM);
 
+       cfg->parent = dev;
        cfg->ops = ops;
        cfg->busr.start = busr->start;
        cfg->busr.end = busr->end;
@@ -95,7 +95,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev,
        }
 
        if (ops->init) {
-               err = ops->init(dev, cfg);
+               err = ops->init(cfg);
                if (err)
                        goto err_exit;
        }
diff --git a/drivers/pci/ecam.h b/drivers/pci/ecam.h
deleted file mode 100644 (file)
index 9878beb..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2016 Broadcom
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation (the "GPL").
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License version 2 (GPLv2) for more details.
- *
- * You should have received a copy of the GNU General Public License
- * version 2 (GPLv2) along with this source code.
- */
-#ifndef DRIVERS_PCI_ECAM_H
-#define DRIVERS_PCI_ECAM_H
-
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-
-/*
- * struct to hold pci ops and bus shift of the config window
- * for a PCI controller.
- */
-struct pci_config_window;
-struct pci_ecam_ops {
-       unsigned int                    bus_shift;
-       struct pci_ops                  pci_ops;
-       int                             (*init)(struct device *,
-                                               struct pci_config_window *);
-};
-
-/*
- * struct to hold the mappings of a config space window. This
- * is expected to be used as sysdata for PCI controllers that
- * use ECAM.
- */
-struct pci_config_window {
-       struct resource                 res;
-       struct resource                 busr;
-       void                            *priv;
-       struct pci_ecam_ops             *ops;
-       union {
-               void __iomem            *win;   /* 64-bit single mapping */
-               void __iomem            **winp; /* 32-bit per-bus mapping */
-       };
-};
-
-/* create and free pci_config_window */
-struct pci_config_window *pci_ecam_create(struct device *dev,
-               struct resource *cfgres, struct resource *busr,
-               struct pci_ecam_ops *ops);
-void pci_ecam_free(struct pci_config_window *cfg);
-
-/* map_bus when ->sysdata is an instance of pci_config_window */
-void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn,
-                              int where);
-/* default ECAM ops */
-extern struct pci_ecam_ops pci_generic_ecam_ops;
-
-#ifdef CONFIG_PCI_HOST_GENERIC
-/* for DT-based PCI controllers that support ECAM */
-int pci_host_common_probe(struct platform_device *pdev,
-                         struct pci_ecam_ops *ops);
-#endif
-#endif
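
With ecam.h deleted, the ECAM glue moves to the new <linux/pci-ecam.h> header (included in the generic host files below), and pci_ecam_ops.init() drops its struct device * argument now that the window records its parent device (cfg->parent, set in pci_ecam_create() above). A sketch of a driver-side ops table against the new interface (example_ecam_init/example_ecam_ops are made-up names; the read/write helpers are assumed to be the usual pci_generic_config_read/write):

#include <linux/pci.h>
#include <linux/pci-ecam.h>

/* Sketch: an init callback with the new single-argument signature. */
static int example_ecam_init(struct pci_config_window *cfg)
{
	struct device *dev = cfg->parent;	/* was a separate argument before */

	dev_info(dev, "ECAM config space %pR for buses %pR\n",
		 &cfg->res, &cfg->busr);
	return 0;
}

static struct pci_ecam_ops example_ecam_ops = {
	.bus_shift	= 20,			/* standard ECAM: 1 MB per bus */
	.init		= example_ecam_init,
	.pci_ops	= {
		.map_bus	= pci_ecam_map_bus,
		.read		= pci_generic_config_read,
		.write		= pci_generic_config_write,
	},
};
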
index 5d2374e..9b485d8 100644 (file)
@@ -3,8 +3,9 @@ menu "PCI host controller drivers"
 
 config PCI_DRA7XX
        bool "TI DRA7xx PCIe controller"
-       select PCIE_DW
        depends on OF && HAS_IOMEM && TI_PIPE3
+       depends on PCI_MSI_IRQ_DOMAIN
+       select PCIE_DW
        help
         Enables support for the PCIe controller in the DRA7xx SoC.  There
         are two instances of PCIe controller in DRA7xx.  This controller can
@@ -16,11 +17,20 @@ config PCI_MVEBU
        depends on ARM
        depends on OF
 
+config PCI_AARDVARK
+       bool "Aardvark PCIe controller"
+       depends on ARCH_MVEBU && ARM64
+       depends on OF
+       depends on PCI_MSI_IRQ_DOMAIN
+       help
+        Add support for the Aardvark 64-bit PCIe Host Controller. This
+        controller is part of the South Bridge of the Marvell Armada
+        3700 SoC.
 
 config PCIE_XILINX_NWL
        bool "NWL PCIe Core"
        depends on ARCH_ZYNQMP
-       select PCI_MSI_IRQ_DOMAIN if PCI_MSI
+       depends on PCI_MSI_IRQ_DOMAIN
        help
         Say 'Y' here if you want kernel support for Xilinx
         NWL PCIe controller. The controller can act as Root Port
@@ -29,6 +39,7 @@ config PCIE_XILINX_NWL
 
 config PCIE_DW_PLAT
        bool "Platform bus based DesignWare PCIe Controller"
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW
        ---help---
         This selects the DesignWare PCIe controller support. Select this if
@@ -40,16 +51,19 @@ config PCIE_DW_PLAT
 
 config PCIE_DW
        bool
+       depends on PCI_MSI_IRQ_DOMAIN
 
 config PCI_EXYNOS
        bool "Samsung Exynos PCIe controller"
        depends on SOC_EXYNOS5440
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIEPORTBUS
        select PCIE_DW
 
 config PCI_IMX6
        bool "Freescale i.MX6 PCIe controller"
        depends on SOC_IMX6Q
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIEPORTBUS
        select PCIE_DW
 
@@ -72,8 +86,7 @@ config PCI_RCAR_GEN2
 config PCIE_RCAR
        bool "Renesas R-Car PCIe controller"
        depends on ARCH_RENESAS || (ARM && COMPILE_TEST)
-       select PCI_MSI
-       select PCI_MSI_IRQ_DOMAIN
+       depends on PCI_MSI_IRQ_DOMAIN
        help
          Say Y here if you want PCIe controller support on R-Car SoCs.
 
@@ -85,6 +98,7 @@ config PCI_HOST_GENERIC
        bool "Generic PCI host controller"
        depends on (ARM || ARM64) && OF
        select PCI_HOST_COMMON
+       select IRQ_DOMAIN
        help
          Say Y here if you want to support a simple generic PCI host
          controller, such as the one emulated by kvmtool.
@@ -92,6 +106,7 @@ config PCI_HOST_GENERIC
 config PCIE_SPEAR13XX
        bool "STMicroelectronics SPEAr PCIe controller"
        depends on ARCH_SPEAR13XX
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIEPORTBUS
        select PCIE_DW
        help
@@ -100,6 +115,7 @@ config PCIE_SPEAR13XX
 config PCI_KEYSTONE
        bool "TI Keystone PCIe controller"
        depends on ARCH_KEYSTONE
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW
        select PCIEPORTBUS
        help
@@ -120,7 +136,6 @@ config PCI_XGENE
        depends on ARCH_XGENE
        depends on OF
        select PCIEPORTBUS
-       select PCI_MSI_IRQ_DOMAIN if PCI_MSI
        help
          Say Y here if you want internal PCI support on APM X-Gene SoC.
          There are 5 internal PCIe ports available. Each port is GEN3 capable
@@ -128,7 +143,8 @@ config PCI_XGENE
 
 config PCI_XGENE_MSI
        bool "X-Gene v1 PCIe MSI feature"
-       depends on PCI_XGENE && PCI_MSI
+       depends on PCI_XGENE
+       depends on PCI_MSI_IRQ_DOMAIN
        default y
        help
          Say Y here if you want PCIe MSI support for the APM X-Gene v1 SoC.
@@ -137,6 +153,7 @@ config PCI_XGENE_MSI
 config PCI_LAYERSCAPE
        bool "Freescale Layerscape PCIe controller"
        depends on OF && (ARM || ARCH_LAYERSCAPE)
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW
        select MFD_SYSCON
        help
@@ -177,8 +194,7 @@ config PCIE_IPROC_BCMA
 config PCIE_IPROC_MSI
        bool "Broadcom iProc PCIe MSI support"
        depends on PCIE_IPROC_PLATFORM || PCIE_IPROC_BCMA
-       depends on PCI_MSI
-       select PCI_MSI_IRQ_DOMAIN
+       depends on PCI_MSI_IRQ_DOMAIN
        default ARCH_BCM_IPROC
        help
          Say Y here if you want to enable MSI support for Broadcom's iProc
@@ -195,8 +211,8 @@ config PCIE_ALTERA
 
 config PCIE_ALTERA_MSI
        bool "Altera PCIe MSI feature"
-       depends on PCIE_ALTERA && PCI_MSI
-       select PCI_MSI_IRQ_DOMAIN
+       depends on PCIE_ALTERA
+       depends on PCI_MSI_IRQ_DOMAIN
        help
          Say Y here if you want PCIe MSI support for the Altera FPGA.
          This MSI driver supports Altera MSI to GIC controller IP.
@@ -204,6 +220,7 @@ config PCIE_ALTERA_MSI
 config PCI_HISI
        depends on OF && ARM64
        bool "HiSilicon Hip05 and Hip06 SoCs PCIe controllers"
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIEPORTBUS
        select PCIE_DW
        help
@@ -213,6 +230,7 @@ config PCI_HISI
 config PCIE_QCOM
        bool "Qualcomm PCIe controller"
        depends on ARCH_QCOM && OF
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW
        select PCIEPORTBUS
        help
@@ -237,6 +255,7 @@ config PCI_HOST_THUNDER_ECAM
 config PCIE_ARMADA_8K
        bool "Marvell Armada-8K PCIe controller"
        depends on ARCH_MVEBU
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW
        select PCIEPORTBUS
        help
@@ -245,4 +264,14 @@ config PCIE_ARMADA_8K
          Designware hardware and therefore the driver re-uses the
          Designware core functions to implement the driver.
 
+config PCIE_ARTPEC6
+       bool "Axis ARTPEC-6 PCIe controller"
+       depends on MACH_ARTPEC6
+       depends on PCI_MSI_IRQ_DOMAIN
+       select PCIE_DW
+       select PCIEPORTBUS
+       help
+         Say Y here to enable PCIe controller support on Axis ARTPEC-6
+         SoCs.  This PCIe controller uses the DesignWare core.
+
 endmenu
index 9c8698e..8843410 100644 (file)
@@ -5,6 +5,7 @@ obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o
 obj-$(CONFIG_PCI_IMX6) += pci-imx6.o
 obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
+obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
 obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o
 obj-$(CONFIG_PCIE_RCAR) += pcie-rcar.o
@@ -29,3 +30,4 @@ obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o
 obj-$(CONFIG_PCI_HOST_THUNDER_ECAM) += pci-thunder-ecam.o
 obj-$(CONFIG_PCI_HOST_THUNDER_PEM) += pci-thunder-pem.o
 obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o
+obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o
diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c
new file mode 100644 (file)
index 0000000..ef9893f
--- /dev/null
@@ -0,0 +1,1001 @@
+/*
+ * Driver for the Aardvark PCIe controller, used on Marvell Armada
+ * 3700.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Author: Hezi Shahmoon <hezi.shahmoon@marvell.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+
+/* PCIe core registers */
+#define PCIE_CORE_CMD_STATUS_REG                               0x4
+#define     PCIE_CORE_CMD_IO_ACCESS_EN                         BIT(0)
+#define     PCIE_CORE_CMD_MEM_ACCESS_EN                                BIT(1)
+#define     PCIE_CORE_CMD_MEM_IO_REQ_EN                                BIT(2)
+#define PCIE_CORE_DEV_CTRL_STATS_REG                           0xc8
+#define     PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE       (0 << 4)
+#define     PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT      5
+#define     PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE             (0 << 11)
+#define     PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT     12
+#define PCIE_CORE_LINK_CTRL_STAT_REG                           0xd0
+#define     PCIE_CORE_LINK_L0S_ENTRY                           BIT(0)
+#define     PCIE_CORE_LINK_TRAINING                            BIT(5)
+#define     PCIE_CORE_LINK_WIDTH_SHIFT                         20
+#define PCIE_CORE_ERR_CAPCTL_REG                               0x118
+#define     PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX                   BIT(5)
+#define     PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN                        BIT(6)
+#define     PCIE_CORE_ERR_CAPCTL_ECRC_CHCK                     BIT(7)
+#define     PCIE_CORE_ERR_CAPCTL_ECRC_CHCK_RCV                 BIT(8)
+
+/* PIO registers base address and register offsets */
+#define PIO_BASE_ADDR                          0x4000
+#define PIO_CTRL                               (PIO_BASE_ADDR + 0x0)
+#define   PIO_CTRL_TYPE_MASK                   GENMASK(3, 0)
+#define   PIO_CTRL_ADDR_WIN_DISABLE            BIT(24)
+#define PIO_STAT                               (PIO_BASE_ADDR + 0x4)
+#define   PIO_COMPLETION_STATUS_SHIFT          7
+#define   PIO_COMPLETION_STATUS_MASK           GENMASK(9, 7)
+#define   PIO_COMPLETION_STATUS_OK             0
+#define   PIO_COMPLETION_STATUS_UR             1
+#define   PIO_COMPLETION_STATUS_CRS            2
+#define   PIO_COMPLETION_STATUS_CA             4
+#define   PIO_NON_POSTED_REQ                   BIT(0)
+#define PIO_ADDR_LS                            (PIO_BASE_ADDR + 0x8)
+#define PIO_ADDR_MS                            (PIO_BASE_ADDR + 0xc)
+#define PIO_WR_DATA                            (PIO_BASE_ADDR + 0x10)
+#define PIO_WR_DATA_STRB                       (PIO_BASE_ADDR + 0x14)
+#define PIO_RD_DATA                            (PIO_BASE_ADDR + 0x18)
+#define PIO_START                              (PIO_BASE_ADDR + 0x1c)
+#define PIO_ISR                                        (PIO_BASE_ADDR + 0x20)
+#define PIO_ISRM                               (PIO_BASE_ADDR + 0x24)
+
+/* Aardvark Control registers */
+#define CONTROL_BASE_ADDR                      0x4800
+#define PCIE_CORE_CTRL0_REG                    (CONTROL_BASE_ADDR + 0x0)
+#define     PCIE_GEN_SEL_MSK                   0x3
+#define     PCIE_GEN_SEL_SHIFT                 0x0
+#define     SPEED_GEN_1                                0
+#define     SPEED_GEN_2                                1
+#define     SPEED_GEN_3                                2
+#define     IS_RC_MSK                          1
+#define     IS_RC_SHIFT                                2
+#define     LANE_CNT_MSK                       0x18
+#define     LANE_CNT_SHIFT                     0x3
+#define     LANE_COUNT_1                       (0 << LANE_CNT_SHIFT)
+#define     LANE_COUNT_2                       (1 << LANE_CNT_SHIFT)
+#define     LANE_COUNT_4                       (2 << LANE_CNT_SHIFT)
+#define     LANE_COUNT_8                       (3 << LANE_CNT_SHIFT)
+#define     LINK_TRAINING_EN                   BIT(6)
+#define     LEGACY_INTA                                BIT(28)
+#define     LEGACY_INTB                                BIT(29)
+#define     LEGACY_INTC                                BIT(30)
+#define     LEGACY_INTD                                BIT(31)
+#define PCIE_CORE_CTRL1_REG                    (CONTROL_BASE_ADDR + 0x4)
+#define     HOT_RESET_GEN                      BIT(0)
+#define PCIE_CORE_CTRL2_REG                    (CONTROL_BASE_ADDR + 0x8)
+#define     PCIE_CORE_CTRL2_RESERVED           0x7
+#define     PCIE_CORE_CTRL2_TD_ENABLE          BIT(4)
+#define     PCIE_CORE_CTRL2_STRICT_ORDER_ENABLE        BIT(5)
+#define     PCIE_CORE_CTRL2_OB_WIN_ENABLE      BIT(6)
+#define     PCIE_CORE_CTRL2_MSI_ENABLE         BIT(10)
+#define PCIE_ISR0_REG                          (CONTROL_BASE_ADDR + 0x40)
+#define PCIE_ISR0_MASK_REG                     (CONTROL_BASE_ADDR + 0x44)
+#define     PCIE_ISR0_MSI_INT_PENDING          BIT(24)
+#define     PCIE_ISR0_INTX_ASSERT(val)         BIT(16 + (val))
+#define     PCIE_ISR0_INTX_DEASSERT(val)       BIT(20 + (val))
+#define            PCIE_ISR0_ALL_MASK                  GENMASK(26, 0)
+#define PCIE_ISR1_REG                          (CONTROL_BASE_ADDR + 0x48)
+#define PCIE_ISR1_MASK_REG                     (CONTROL_BASE_ADDR + 0x4C)
+#define     PCIE_ISR1_POWER_STATE_CHANGE       BIT(4)
+#define     PCIE_ISR1_FLUSH                    BIT(5)
+#define     PCIE_ISR1_ALL_MASK                 GENMASK(5, 4)
+#define PCIE_MSI_ADDR_LOW_REG                  (CONTROL_BASE_ADDR + 0x50)
+#define PCIE_MSI_ADDR_HIGH_REG                 (CONTROL_BASE_ADDR + 0x54)
+#define PCIE_MSI_STATUS_REG                    (CONTROL_BASE_ADDR + 0x58)
+#define PCIE_MSI_MASK_REG                      (CONTROL_BASE_ADDR + 0x5C)
+#define PCIE_MSI_PAYLOAD_REG                   (CONTROL_BASE_ADDR + 0x9C)
+
+/* PCIe window configuration */
+#define OB_WIN_BASE_ADDR                       0x4c00
+#define OB_WIN_BLOCK_SIZE                      0x20
+#define OB_WIN_REG_ADDR(win, offset)           (OB_WIN_BASE_ADDR + \
+                                                OB_WIN_BLOCK_SIZE * (win) + \
+                                                (offset))
+#define OB_WIN_MATCH_LS(win)                   OB_WIN_REG_ADDR(win, 0x00)
+#define OB_WIN_MATCH_MS(win)                   OB_WIN_REG_ADDR(win, 0x04)
+#define OB_WIN_REMAP_LS(win)                   OB_WIN_REG_ADDR(win, 0x08)
+#define OB_WIN_REMAP_MS(win)                   OB_WIN_REG_ADDR(win, 0x0c)
+#define OB_WIN_MASK_LS(win)                    OB_WIN_REG_ADDR(win, 0x10)
+#define OB_WIN_MASK_MS(win)                    OB_WIN_REG_ADDR(win, 0x14)
+#define OB_WIN_ACTIONS(win)                    OB_WIN_REG_ADDR(win, 0x18)
+
+/* PCIe window types */
+#define OB_PCIE_MEM                            0x0
+#define OB_PCIE_IO                             0x4
+
+/* LMI registers base address and register offsets */
+#define LMI_BASE_ADDR                          0x6000
+#define CFG_REG                                        (LMI_BASE_ADDR + 0x0)
+#define     LTSSM_SHIFT                                24
+#define     LTSSM_MASK                         0x3f
+#define     LTSSM_L0                           0x10
+#define     RC_BAR_CONFIG                      0x300
+
+/* PCIe core controller registers */
+#define CTRL_CORE_BASE_ADDR                    0x18000
+#define CTRL_CONFIG_REG                                (CTRL_CORE_BASE_ADDR + 0x0)
+#define     CTRL_MODE_SHIFT                    0x0
+#define     CTRL_MODE_MASK                     0x1
+#define     PCIE_CORE_MODE_DIRECT              0x0
+#define     PCIE_CORE_MODE_COMMAND             0x1
+
+/* PCIe Central Interrupts Registers */
+#define CENTRAL_INT_BASE_ADDR                  0x1b000
+#define HOST_CTRL_INT_STATUS_REG               (CENTRAL_INT_BASE_ADDR + 0x0)
+#define HOST_CTRL_INT_MASK_REG                 (CENTRAL_INT_BASE_ADDR + 0x4)
+#define     PCIE_IRQ_CMDQ_INT                  BIT(0)
+#define     PCIE_IRQ_MSI_STATUS_INT            BIT(1)
+#define     PCIE_IRQ_CMD_SENT_DONE             BIT(3)
+#define     PCIE_IRQ_DMA_INT                   BIT(4)
+#define     PCIE_IRQ_IB_DXFERDONE              BIT(5)
+#define     PCIE_IRQ_OB_DXFERDONE              BIT(6)
+#define     PCIE_IRQ_OB_RXFERDONE              BIT(7)
+#define     PCIE_IRQ_COMPQ_INT                 BIT(12)
+#define     PCIE_IRQ_DIR_RD_DDR_DET            BIT(13)
+#define     PCIE_IRQ_DIR_WR_DDR_DET            BIT(14)
+#define     PCIE_IRQ_CORE_INT                  BIT(16)
+#define     PCIE_IRQ_CORE_INT_PIO              BIT(17)
+#define     PCIE_IRQ_DPMU_INT                  BIT(18)
+#define     PCIE_IRQ_PCIE_MIS_INT              BIT(19)
+#define     PCIE_IRQ_MSI_INT1_DET              BIT(20)
+#define     PCIE_IRQ_MSI_INT2_DET              BIT(21)
+#define     PCIE_IRQ_RC_DBELL_DET              BIT(22)
+#define     PCIE_IRQ_EP_STATUS                 BIT(23)
+#define     PCIE_IRQ_ALL_MASK                  0xfff0fb
+#define     PCIE_IRQ_ENABLE_INTS_MASK          PCIE_IRQ_CORE_INT
+
+/* Transaction types */
+#define PCIE_CONFIG_RD_TYPE0                   0x8
+#define PCIE_CONFIG_RD_TYPE1                   0x9
+#define PCIE_CONFIG_WR_TYPE0                   0xa
+#define PCIE_CONFIG_WR_TYPE1                   0xb
+
+/* PCI_BDF shifts 8bit, so we need extra 4bit shift */
+#define PCIE_BDF(dev)                          (dev << 4)
+#define PCIE_CONF_BUS(bus)                     (((bus) & 0xff) << 20)
+#define PCIE_CONF_DEV(dev)                     (((dev) & 0x1f) << 15)
+#define PCIE_CONF_FUNC(fun)                    (((fun) & 0x7)  << 12)
+#define PCIE_CONF_REG(reg)                     ((reg) & 0xffc)
+#define PCIE_CONF_ADDR(bus, devfn, where)      \
+       (PCIE_CONF_BUS(bus) | PCIE_CONF_DEV(PCI_SLOT(devfn))    | \
+        PCIE_CONF_FUNC(PCI_FUNC(devfn)) | PCIE_CONF_REG(where))
+
+#define PIO_TIMEOUT_MS                 1
+
+#define LINK_WAIT_MAX_RETRIES          10
+#define LINK_WAIT_USLEEP_MIN           90000
+#define LINK_WAIT_USLEEP_MAX           100000
+
+#define LEGACY_IRQ_NUM                 4
+#define MSI_IRQ_NUM                    32
+
+struct advk_pcie {
+       struct platform_device *pdev;
+       void __iomem *base;
+       struct list_head resources;
+       struct irq_domain *irq_domain;
+       struct irq_chip irq_chip;
+       struct msi_controller msi;
+       struct irq_domain *msi_domain;
+       struct irq_chip msi_irq_chip;
+       DECLARE_BITMAP(msi_irq_in_use, MSI_IRQ_NUM);
+       struct mutex msi_used_lock;
+       u16 msi_msg;
+       int root_bus_nr;
+};
+
+static inline void advk_writel(struct advk_pcie *pcie, u32 val, u64 reg)
+{
+       writel(val, pcie->base + reg);
+}
+
+static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg)
+{
+       return readl(pcie->base + reg);
+}
+
+static int advk_pcie_link_up(struct advk_pcie *pcie)
+{
+       u32 val, ltssm_state;
+
+       val = advk_readl(pcie, CFG_REG);
+       ltssm_state = (val >> LTSSM_SHIFT) & LTSSM_MASK;
+       return ltssm_state >= LTSSM_L0;
+}
+
+static int advk_pcie_wait_for_link(struct advk_pcie *pcie)
+{
+       int retries;
+
+       /* check if the link is up or not */
+       for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
+               if (advk_pcie_link_up(pcie)) {
+                       dev_info(&pcie->pdev->dev, "link up\n");
+                       return 0;
+               }
+
+               usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
+       }
+
+       dev_err(&pcie->pdev->dev, "link never came up\n");
+
+       return -ETIMEDOUT;
+}
+
+/*
+ * Set PCIe address window register which could be used for memory
+ * mapping.
+ */
+static void advk_pcie_set_ob_win(struct advk_pcie *pcie,
+                                u32 win_num, u32 match_ms,
+                                u32 match_ls, u32 mask_ms,
+                                u32 mask_ls, u32 remap_ms,
+                                u32 remap_ls, u32 action)
+{
+       advk_writel(pcie, match_ls, OB_WIN_MATCH_LS(win_num));
+       advk_writel(pcie, match_ms, OB_WIN_MATCH_MS(win_num));
+       advk_writel(pcie, mask_ms, OB_WIN_MASK_MS(win_num));
+       advk_writel(pcie, mask_ls, OB_WIN_MASK_LS(win_num));
+       advk_writel(pcie, remap_ms, OB_WIN_REMAP_MS(win_num));
+       advk_writel(pcie, remap_ls, OB_WIN_REMAP_LS(win_num));
+       advk_writel(pcie, action, OB_WIN_ACTIONS(win_num));
+       advk_writel(pcie, match_ls | BIT(0), OB_WIN_MATCH_LS(win_num));
+}
+
+static void advk_pcie_setup_hw(struct advk_pcie *pcie)
+{
+       u32 reg;
+       int i;
+
+       /* Point PCIe unit MBUS decode windows to DRAM space */
+       for (i = 0; i < 8; i++)
+               advk_pcie_set_ob_win(pcie, i, 0, 0, 0, 0, 0, 0, 0);
+
+       /* Set to Direct mode */
+       reg = advk_readl(pcie, CTRL_CONFIG_REG);
+       reg &= ~(CTRL_MODE_MASK << CTRL_MODE_SHIFT);
+       reg |= ((PCIE_CORE_MODE_DIRECT & CTRL_MODE_MASK) << CTRL_MODE_SHIFT);
+       advk_writel(pcie, reg, CTRL_CONFIG_REG);
+
+       /* Set PCI global control register to RC mode */
+       reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
+       reg |= (IS_RC_MSK << IS_RC_SHIFT);
+       advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
+
+       /* Set Advanced Error Capabilities and Control PF0 register */
+       reg = PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX |
+               PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN |
+               PCIE_CORE_ERR_CAPCTL_ECRC_CHCK |
+               PCIE_CORE_ERR_CAPCTL_ECRC_CHCK_RCV;
+       advk_writel(pcie, reg, PCIE_CORE_ERR_CAPCTL_REG);
+
+       /* Set PCIe Device Control and Status 1 PF0 register */
+       reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE |
+               (7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) |
+               PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE |
+               PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT;
+       advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG);
+
+       /* Program PCIe Control 2 to disable strict ordering */
+       reg = PCIE_CORE_CTRL2_RESERVED |
+               PCIE_CORE_CTRL2_TD_ENABLE;
+       advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG);
+
+       /* Set GEN2 */
+       reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
+       reg &= ~PCIE_GEN_SEL_MSK;
+       reg |= SPEED_GEN_2;
+       advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
+
+       /* Set lane X1 */
+       reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
+       reg &= ~LANE_CNT_MSK;
+       reg |= LANE_COUNT_1;
+       advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
+
+       /* Enable link training */
+       reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
+       reg |= LINK_TRAINING_EN;
+       advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
+
+       /* Enable MSI */
+       reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG);
+       reg |= PCIE_CORE_CTRL2_MSI_ENABLE;
+       advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG);
+
+       /* Clear all interrupts */
+       advk_writel(pcie, PCIE_ISR0_ALL_MASK, PCIE_ISR0_REG);
+       advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_REG);
+       advk_writel(pcie, PCIE_IRQ_ALL_MASK, HOST_CTRL_INT_STATUS_REG);
+
+       /* Disable All ISR0/1 Sources */
+       reg = PCIE_ISR0_ALL_MASK;
+       reg &= ~PCIE_ISR0_MSI_INT_PENDING;
+       advk_writel(pcie, reg, PCIE_ISR0_MASK_REG);
+
+       advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_MASK_REG);
+
+       /* Unmask all MSIs */
+       advk_writel(pcie, 0, PCIE_MSI_MASK_REG);
+
+       /* Enable summary interrupt for GIC SPI source */
+       reg = PCIE_IRQ_ALL_MASK & (~PCIE_IRQ_ENABLE_INTS_MASK);
+       advk_writel(pcie, reg, HOST_CTRL_INT_MASK_REG);
+
+       reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG);
+       reg |= PCIE_CORE_CTRL2_OB_WIN_ENABLE;
+       advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG);
+
+       /* Bypass the address window mapping for PIO */
+       reg = advk_readl(pcie, PIO_CTRL);
+       reg |= PIO_CTRL_ADDR_WIN_DISABLE;
+       advk_writel(pcie, reg, PIO_CTRL);
+
+       /* Start link training */
+       reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG);
+       reg |= PCIE_CORE_LINK_TRAINING;
+       advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG);
+
+       advk_pcie_wait_for_link(pcie);
+
+       reg = PCIE_CORE_LINK_L0S_ENTRY |
+               (1 << PCIE_CORE_LINK_WIDTH_SHIFT);
+       advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG);
+
+       reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG);
+       reg |= PCIE_CORE_CMD_MEM_ACCESS_EN |
+               PCIE_CORE_CMD_IO_ACCESS_EN |
+               PCIE_CORE_CMD_MEM_IO_REQ_EN;
+       advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG);
+}
+
+static void advk_pcie_check_pio_status(struct advk_pcie *pcie)
+{
+       u32 reg;
+       unsigned int status;
+       char *strcomp_status, *str_posted;
+
+       reg = advk_readl(pcie, PIO_STAT);
+       status = (reg & PIO_COMPLETION_STATUS_MASK) >>
+               PIO_COMPLETION_STATUS_SHIFT;
+
+       if (!status)
+               return;
+
+       switch (status) {
+       case PIO_COMPLETION_STATUS_UR:
+               strcomp_status = "UR";
+               break;
+       case PIO_COMPLETION_STATUS_CRS:
+               strcomp_status = "CRS";
+               break;
+       case PIO_COMPLETION_STATUS_CA:
+               strcomp_status = "CA";
+               break;
+       default:
+               strcomp_status = "Unknown";
+               break;
+       }
+
+       if (reg & PIO_NON_POSTED_REQ)
+               str_posted = "Non-posted";
+       else
+               str_posted = "Posted";
+
+       dev_err(&pcie->pdev->dev, "%s PIO Response Status: %s, %#x @ %#x\n",
+               str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS));
+}
+
+static int advk_pcie_wait_pio(struct advk_pcie *pcie)
+{
+       unsigned long timeout;
+
+       timeout = jiffies + msecs_to_jiffies(PIO_TIMEOUT_MS);
+
+       while (time_before(jiffies, timeout)) {
+               u32 start, isr;
+
+               start = advk_readl(pcie, PIO_START);
+               isr = advk_readl(pcie, PIO_ISR);
+               if (!start && isr)
+                       return 0;
+       }
+
+       dev_err(&pcie->pdev->dev, "config read/write timed out\n");
+       return -ETIMEDOUT;
+}
+
+static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn,
+                            int where, int size, u32 *val)
+{
+       struct advk_pcie *pcie = bus->sysdata;
+       u32 reg;
+       int ret;
+
+       if (PCI_SLOT(devfn) != 0) {
+               *val = 0xffffffff;
+               return PCIBIOS_DEVICE_NOT_FOUND;
+       }
+
+       /* Start PIO */
+       advk_writel(pcie, 0, PIO_START);
+       advk_writel(pcie, 1, PIO_ISR);
+
+       /* Program the control register */
+       reg = advk_readl(pcie, PIO_CTRL);
+       reg &= ~PIO_CTRL_TYPE_MASK;
+       if (bus->number ==  pcie->root_bus_nr)
+               reg |= PCIE_CONFIG_RD_TYPE0;
+       else
+               reg |= PCIE_CONFIG_RD_TYPE1;
+       advk_writel(pcie, reg, PIO_CTRL);
+
+       /* Program the address registers */
+       reg = PCIE_BDF(devfn) | PCIE_CONF_REG(where);
+       advk_writel(pcie, reg, PIO_ADDR_LS);
+       advk_writel(pcie, 0, PIO_ADDR_MS);
+
+       /* Program the data strobe */
+       advk_writel(pcie, 0xf, PIO_WR_DATA_STRB);
+
+       /* Start the transfer */
+       advk_writel(pcie, 1, PIO_START);
+
+       ret = advk_pcie_wait_pio(pcie);
+       if (ret < 0)
+               return PCIBIOS_SET_FAILED;
+
+       advk_pcie_check_pio_status(pcie);
+
+       /* Get the read result */
+       *val = advk_readl(pcie, PIO_RD_DATA);
+       if (size == 1)
+               *val = (*val >> (8 * (where & 3))) & 0xff;
+       else if (size == 2)
+               *val = (*val >> (8 * (where & 3))) & 0xffff;
+
+       return PCIBIOS_SUCCESSFUL;
+}
+
+static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn,
+                               int where, int size, u32 val)
+{
+       struct advk_pcie *pcie = bus->sysdata;
+       u32 reg;
+       u32 data_strobe = 0x0;
+       int offset;
+       int ret;
+
+       if (PCI_SLOT(devfn) != 0)
+               return PCIBIOS_DEVICE_NOT_FOUND;
+
+       if (where % size)
+               return PCIBIOS_SET_FAILED;
+
+       /* Start PIO */
+       advk_writel(pcie, 0, PIO_START);
+       advk_writel(pcie, 1, PIO_ISR);
+
+       /* Program the control register */
+       reg = advk_readl(pcie, PIO_CTRL);
+       reg &= ~PIO_CTRL_TYPE_MASK;
+       if (bus->number == pcie->root_bus_nr)
+               reg |= PCIE_CONFIG_WR_TYPE0;
+       else
+               reg |= PCIE_CONFIG_WR_TYPE1;
+       advk_writel(pcie, reg, PIO_CTRL);
+
+       /* Program the address registers */
+       reg = PCIE_CONF_ADDR(bus->number, devfn, where);
+       advk_writel(pcie, reg, PIO_ADDR_LS);
+       advk_writel(pcie, 0, PIO_ADDR_MS);
+
+       /* Calculate the write strobe */
+       offset      = where & 0x3;
+       reg         = val << (8 * offset);
+       data_strobe = GENMASK(size - 1, 0) << offset;
+
+       /* Program the data register */
+       advk_writel(pcie, reg, PIO_WR_DATA);
+
+       /* Program the data strobe */
+       advk_writel(pcie, data_strobe, PIO_WR_DATA_STRB);
+
+       /* Start the transfer */
+       advk_writel(pcie, 1, PIO_START);
+
+       ret = advk_pcie_wait_pio(pcie);
+       if (ret < 0)
+               return PCIBIOS_SET_FAILED;
+
+       advk_pcie_check_pio_status(pcie);
+
+       return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops advk_pcie_ops = {
+       .read = advk_pcie_rd_conf,
+       .write = advk_pcie_wr_conf,
+};
+
+static int advk_pcie_alloc_msi(struct advk_pcie *pcie)
+{
+       int hwirq;
+
+       mutex_lock(&pcie->msi_used_lock);
+       hwirq = find_first_zero_bit(pcie->msi_irq_in_use, MSI_IRQ_NUM);
+       if (hwirq >= MSI_IRQ_NUM)
+               hwirq = -ENOSPC;
+       else
+               set_bit(hwirq, pcie->msi_irq_in_use);
+       mutex_unlock(&pcie->msi_used_lock);
+
+       return hwirq;
+}
+
+static void advk_pcie_free_msi(struct advk_pcie *pcie, int hwirq)
+{
+       mutex_lock(&pcie->msi_used_lock);
+       if (!test_bit(hwirq, pcie->msi_irq_in_use))
+               dev_err(&pcie->pdev->dev, "trying to free unused MSI#%d\n",
+                       hwirq);
+       else
+               clear_bit(hwirq, pcie->msi_irq_in_use);
+       mutex_unlock(&pcie->msi_used_lock);
+}
+
+static int advk_pcie_setup_msi_irq(struct msi_controller *chip,
+                                  struct pci_dev *pdev,
+                                  struct msi_desc *desc)
+{
+       struct advk_pcie *pcie = pdev->bus->sysdata;
+       struct msi_msg msg;
+       int virq, hwirq;
+       phys_addr_t msi_msg_phys;
+
+       /* We support MSI, but not MSI-X */
+       if (desc->msi_attrib.is_msix)
+               return -EINVAL;
+
+       hwirq = advk_pcie_alloc_msi(pcie);
+       if (hwirq < 0)
+               return hwirq;
+
+       virq = irq_create_mapping(pcie->msi_domain, hwirq);
+       if (!virq) {
+               advk_pcie_free_msi(pcie, hwirq);
+               return -EINVAL;
+       }
+
+       irq_set_msi_desc(virq, desc);
+
+       msi_msg_phys = virt_to_phys(&pcie->msi_msg);
+
+       msg.address_lo = lower_32_bits(msi_msg_phys);
+       msg.address_hi = upper_32_bits(msi_msg_phys);
+       msg.data = virq;
+
+       pci_write_msi_msg(virq, &msg);
+
+       return 0;
+}
+
+static void advk_pcie_teardown_msi_irq(struct msi_controller *chip,
+                                      unsigned int irq)
+{
+       struct irq_data *d = irq_get_irq_data(irq);
+       struct msi_desc *msi = irq_data_get_msi_desc(d);
+       struct advk_pcie *pcie = msi_desc_to_pci_sysdata(msi);
+       unsigned long hwirq = d->hwirq;
+
+       irq_dispose_mapping(irq);
+       advk_pcie_free_msi(pcie, hwirq);
+}
+
+static int advk_pcie_msi_map(struct irq_domain *domain,
+                            unsigned int virq, irq_hw_number_t hw)
+{
+       struct advk_pcie *pcie = domain->host_data;
+
+       irq_set_chip_and_handler(virq, &pcie->msi_irq_chip,
+                                handle_simple_irq);
+
+       return 0;
+}
+
+static const struct irq_domain_ops advk_pcie_msi_irq_ops = {
+       .map = advk_pcie_msi_map,
+};
+
+static void advk_pcie_irq_mask(struct irq_data *d)
+{
+       struct advk_pcie *pcie = d->domain->host_data;
+       irq_hw_number_t hwirq = irqd_to_hwirq(d);
+       u32 mask;
+
+       mask = advk_readl(pcie, PCIE_ISR0_MASK_REG);
+       mask |= PCIE_ISR0_INTX_ASSERT(hwirq);
+       advk_writel(pcie, mask, PCIE_ISR0_MASK_REG);
+}
+
+static void advk_pcie_irq_unmask(struct irq_data *d)
+{
+       struct advk_pcie *pcie = d->domain->host_data;
+       irq_hw_number_t hwirq = irqd_to_hwirq(d);
+       u32 mask;
+
+       mask = advk_readl(pcie, PCIE_ISR0_MASK_REG);
+       mask &= ~PCIE_ISR0_INTX_ASSERT(hwirq);
+       advk_writel(pcie, mask, PCIE_ISR0_MASK_REG);
+}
+
+static int advk_pcie_irq_map(struct irq_domain *h,
+                            unsigned int virq, irq_hw_number_t hwirq)
+{
+       struct advk_pcie *pcie = h->host_data;
+
+       advk_pcie_irq_mask(irq_get_irq_data(virq));
+       irq_set_status_flags(virq, IRQ_LEVEL);
+       irq_set_chip_and_handler(virq, &pcie->irq_chip,
+                                handle_level_irq);
+       irq_set_chip_data(virq, pcie);
+
+       return 0;
+}
+
+static const struct irq_domain_ops advk_pcie_irq_domain_ops = {
+       .map = advk_pcie_irq_map,
+       .xlate = irq_domain_xlate_onecell,
+};
+
+static int advk_pcie_init_msi_irq_domain(struct advk_pcie *pcie)
+{
+       struct device *dev = &pcie->pdev->dev;
+       struct device_node *node = dev->of_node;
+       struct irq_chip *msi_irq_chip;
+       struct msi_controller *msi;
+       phys_addr_t msi_msg_phys;
+       int ret;
+
+       msi_irq_chip = &pcie->msi_irq_chip;
+
+       msi_irq_chip->name = devm_kasprintf(dev, GFP_KERNEL, "%s-msi",
+                                           dev_name(dev));
+       if (!msi_irq_chip->name)
+               return -ENOMEM;
+
+       msi_irq_chip->irq_enable = pci_msi_unmask_irq;
+       msi_irq_chip->irq_disable = pci_msi_mask_irq;
+       msi_irq_chip->irq_mask = pci_msi_mask_irq;
+       msi_irq_chip->irq_unmask = pci_msi_unmask_irq;
+
+       msi = &pcie->msi;
+
+       msi->setup_irq = advk_pcie_setup_msi_irq;
+       msi->teardown_irq = advk_pcie_teardown_msi_irq;
+       msi->of_node = node;
+
+       mutex_init(&pcie->msi_used_lock);
+
+       msi_msg_phys = virt_to_phys(&pcie->msi_msg);
+
+       advk_writel(pcie, lower_32_bits(msi_msg_phys),
+                   PCIE_MSI_ADDR_LOW_REG);
+       advk_writel(pcie, upper_32_bits(msi_msg_phys),
+                   PCIE_MSI_ADDR_HIGH_REG);
+
+       pcie->msi_domain =
+               irq_domain_add_linear(NULL, MSI_IRQ_NUM,
+                                     &advk_pcie_msi_irq_ops, pcie);
+       if (!pcie->msi_domain)
+               return -ENOMEM;
+
+       ret = of_pci_msi_chip_add(msi);
+       if (ret < 0) {
+               irq_domain_remove(pcie->msi_domain);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void advk_pcie_remove_msi_irq_domain(struct advk_pcie *pcie)
+{
+       of_pci_msi_chip_remove(&pcie->msi);
+       irq_domain_remove(pcie->msi_domain);
+}
+
+static int advk_pcie_init_irq_domain(struct advk_pcie *pcie)
+{
+       struct device *dev = &pcie->pdev->dev;
+       struct device_node *node = dev->of_node;
+       struct device_node *pcie_intc_node;
+       struct irq_chip *irq_chip;
+
+       pcie_intc_node =  of_get_next_child(node, NULL);
+       if (!pcie_intc_node) {
+               dev_err(dev, "No PCIe Intc node found\n");
+               return -ENODEV;
+       }
+
+       irq_chip = &pcie->irq_chip;
+
+       irq_chip->name = devm_kasprintf(dev, GFP_KERNEL, "%s-irq",
+                                       dev_name(dev));
+       if (!irq_chip->name) {
+               of_node_put(pcie_intc_node);
+               return -ENOMEM;
+       }
+
+       irq_chip->irq_mask = advk_pcie_irq_mask;
+       irq_chip->irq_mask_ack = advk_pcie_irq_mask;
+       irq_chip->irq_unmask = advk_pcie_irq_unmask;
+
+       pcie->irq_domain =
+               irq_domain_add_linear(pcie_intc_node, LEGACY_IRQ_NUM,
+                                     &advk_pcie_irq_domain_ops, pcie);
+       if (!pcie->irq_domain) {
+               dev_err(dev, "Failed to get an INTx IRQ domain\n");
+               of_node_put(pcie_intc_node);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void advk_pcie_remove_irq_domain(struct advk_pcie *pcie)
+{
+       irq_domain_remove(pcie->irq_domain);
+}
+
+static void advk_pcie_handle_msi(struct advk_pcie *pcie)
+{
+       u32 msi_val, msi_mask, msi_status, msi_idx;
+       u16 msi_data;
+
+       msi_mask = advk_readl(pcie, PCIE_MSI_MASK_REG);
+       msi_val = advk_readl(pcie, PCIE_MSI_STATUS_REG);
+       msi_status = msi_val & ~msi_mask;
+
+       for (msi_idx = 0; msi_idx < MSI_IRQ_NUM; msi_idx++) {
+               if (!(BIT(msi_idx) & msi_status))
+                       continue;
+
+               advk_writel(pcie, BIT(msi_idx), PCIE_MSI_STATUS_REG);
+               msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & 0xFF;
+               generic_handle_irq(msi_data);
+       }
+
+       advk_writel(pcie, PCIE_ISR0_MSI_INT_PENDING,
+                   PCIE_ISR0_REG);
+}
+
+static void advk_pcie_handle_int(struct advk_pcie *pcie)
+{
+       u32 val, mask, status;
+       int i, virq;
+
+       val = advk_readl(pcie, PCIE_ISR0_REG);
+       mask = advk_readl(pcie, PCIE_ISR0_MASK_REG);
+       status = val & ((~mask) & PCIE_ISR0_ALL_MASK);
+
+       if (!status) {
+               advk_writel(pcie, val, PCIE_ISR0_REG);
+               return;
+       }
+
+       /* Process MSI interrupts */
+       if (status & PCIE_ISR0_MSI_INT_PENDING)
+               advk_pcie_handle_msi(pcie);
+
+       /* Process legacy interrupts */
+       for (i = 0; i < LEGACY_IRQ_NUM; i++) {
+               if (!(status & PCIE_ISR0_INTX_ASSERT(i)))
+                       continue;
+
+               advk_writel(pcie, PCIE_ISR0_INTX_ASSERT(i),
+                           PCIE_ISR0_REG);
+
+               virq = irq_find_mapping(pcie->irq_domain, i);
+               generic_handle_irq(virq);
+       }
+}
+
+static irqreturn_t advk_pcie_irq_handler(int irq, void *arg)
+{
+       struct advk_pcie *pcie = arg;
+       u32 status;
+
+       status = advk_readl(pcie, HOST_CTRL_INT_STATUS_REG);
+       if (!(status & PCIE_IRQ_CORE_INT))
+               return IRQ_NONE;
+
+       advk_pcie_handle_int(pcie);
+
+       /* Clear interrupt */
+       advk_writel(pcie, PCIE_IRQ_CORE_INT, HOST_CTRL_INT_STATUS_REG);
+
+       return IRQ_HANDLED;
+}
+
+static int advk_pcie_parse_request_of_pci_ranges(struct advk_pcie *pcie)
+{
+       int err, res_valid = 0;
+       struct device *dev = &pcie->pdev->dev;
+       struct device_node *np = dev->of_node;
+       struct resource_entry *win;
+       resource_size_t iobase;
+
+       INIT_LIST_HEAD(&pcie->resources);
+
+       err = of_pci_get_host_bridge_resources(np, 0, 0xff, &pcie->resources,
+                                              &iobase);
+       if (err)
+               return err;
+
+       err = devm_request_pci_bus_resources(dev, &pcie->resources);
+       if (err)
+               goto out_release_res;
+
+       resource_list_for_each_entry(win, &pcie->resources) {
+               struct resource *res = win->res;
+
+               switch (resource_type(res)) {
+               case IORESOURCE_IO:
+                       advk_pcie_set_ob_win(pcie, 1,
+                                            upper_32_bits(res->start),
+                                            lower_32_bits(res->start),
+                                            0, 0xF8000000, 0,
+                                            lower_32_bits(res->start),
+                                            OB_PCIE_IO);
+                       err = pci_remap_iospace(res, iobase);
+                       if (err)
+                               dev_warn(dev, "error %d: failed to map resource %pR\n",
+                                        err, res);
+                       break;
+               case IORESOURCE_MEM:
+                       advk_pcie_set_ob_win(pcie, 0,
+                                            upper_32_bits(res->start),
+                                            lower_32_bits(res->start),
+                                            0x0, 0xF8000000, 0,
+                                            lower_32_bits(res->start),
+                                            (2 << 20) | OB_PCIE_MEM);
+                       res_valid |= !(res->flags & IORESOURCE_PREFETCH);
+                       break;
+               case IORESOURCE_BUS:
+                       pcie->root_bus_nr = res->start;
+                       break;
+               }
+       }
+
+       if (!res_valid) {
+               dev_err(dev, "non-prefetchable memory resource required\n");
+               err = -EINVAL;
+               goto out_release_res;
+       }
+
+       return 0;
+
+out_release_res:
+       pci_free_resource_list(&pcie->resources);
+       return err;
+}
+
+static int advk_pcie_probe(struct platform_device *pdev)
+{
+       struct advk_pcie *pcie;
+       struct resource *res;
+       struct pci_bus *bus, *child;
+       struct msi_controller *msi;
+       struct device_node *msi_node;
+       int ret, irq;
+
+       pcie = devm_kzalloc(&pdev->dev, sizeof(struct advk_pcie),
+                           GFP_KERNEL);
+       if (!pcie)
+               return -ENOMEM;
+
+       pcie->pdev = pdev;
+       platform_set_drvdata(pdev, pcie);
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       pcie->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(pcie->base)) {
+               dev_err(&pdev->dev, "Failed to map registers\n");
+               return PTR_ERR(pcie->base);
+       }
+
+       irq = platform_get_irq(pdev, 0);
+       ret = devm_request_irq(&pdev->dev, irq, advk_pcie_irq_handler,
+                              IRQF_SHARED | IRQF_NO_THREAD, "advk-pcie",
+                              pcie);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to register interrupt\n");
+               return ret;
+       }
+
+       ret = advk_pcie_parse_request_of_pci_ranges(pcie);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to parse resources\n");
+               return ret;
+       }
+
+       advk_pcie_setup_hw(pcie);
+
+       ret = advk_pcie_init_irq_domain(pcie);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to initialize irq\n");
+               return ret;
+       }
+
+       ret = advk_pcie_init_msi_irq_domain(pcie);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to initialize irq\n");
+               advk_pcie_remove_irq_domain(pcie);
+               return ret;
+       }
+
+       msi_node = of_parse_phandle(pdev->dev.of_node, "msi-parent", 0);
+       if (msi_node)
+               msi = of_pci_find_msi_chip_by_node(msi_node);
+       else
+               msi = NULL;
+
+       bus = pci_scan_root_bus_msi(&pdev->dev, 0, &advk_pcie_ops,
+                                   pcie, &pcie->resources, &pcie->msi);
+       if (!bus) {
+               advk_pcie_remove_msi_irq_domain(pcie);
+               advk_pcie_remove_irq_domain(pcie);
+               return -ENOMEM;
+       }
+
+       pci_bus_assign_resources(bus);
+
+       list_for_each_entry(child, &bus->children, node)
+               pcie_bus_configure_settings(child);
+
+       pci_bus_add_devices(bus);
+
+       return 0;
+}
+
+static const struct of_device_id advk_pcie_of_match_table[] = {
+       { .compatible = "marvell,armada-3700-pcie", },
+       {},
+};
+
+static struct platform_driver advk_pcie_driver = {
+       .driver = {
+               .name = "advk-pcie",
+               .of_match_table = advk_pcie_of_match_table,
+               /* Driver unloading/unbinding currently not supported */
+               .suppress_bind_attrs = true,
+       },
+       .probe = advk_pcie_probe,
+};
+builtin_platform_driver(advk_pcie_driver);
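
The PIO config accessors above build the target address from bus/device/function/register fields packed into PIO_ADDR_LS. As a quick sanity check of that encoding, the same packing can be reproduced in ordinary user-space C (made-up example values; the macros mirror the PCIE_CONF_* definitions above):

#include <stdio.h>
#include <stdint.h>

/* Sketch: encode a config-space address the way the PIO engine does. */
#define CONF_BUS(b)	(((b) & 0xff) << 20)
#define CONF_DEV(d)	(((d) & 0x1f) << 15)
#define CONF_FUNC(f)	(((f) & 0x7)  << 12)
#define CONF_REG(r)	((r) & 0xffc)

int main(void)
{
	/* bus 1, device 0, function 0, register 0x10 */
	uint32_t addr = CONF_BUS(1) | CONF_DEV(0) | CONF_FUNC(0) | CONF_REG(0x10);

	printf("PIO_ADDR_LS = 0x%08x\n", addr);	/* prints 0x00100010 */
	return 0;
}

The bus number lands in bits 27:20 and the dword-aligned register offset in bits 11:2, which is what the controller's PIO address register expects.
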
index f441130..81b3949 100644 (file)
@@ -181,14 +181,14 @@ static int dra7xx_pcie_init_irq_domain(struct pcie_port *pp)
 
        if (!pcie_intc_node) {
                dev_err(dev, "No PCIe Intc node found\n");
-               return PTR_ERR(pcie_intc_node);
+               return -ENODEV;
        }
 
        pp->irq_domain = irq_domain_add_linear(pcie_intc_node, 4,
                                               &intx_domain_ops, pp);
        if (!pp->irq_domain) {
                dev_err(dev, "Failed to get a INTx IRQ domain\n");
-               return PTR_ERR(pp->irq_domain);
+               return -ENODEV;
        }
 
        return 0;
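
The dra7xx fix above addresses a classic pattern: of_get_next_child() and irq_domain_add_linear() signal failure with NULL, not with an ERR_PTR value, so returning PTR_ERR() of the NULL result silently turned the failure into a 0 ("success") return. A tiny standalone illustration (ptr_err re-implements the kernel macro's cast locally just to show the value):

#include <stdio.h>

/* PTR_ERR() is essentially a cast of the pointer value to long, so on a
 * NULL pointer it yields 0 rather than a negative errno. */
static long ptr_err(const void *ptr)
{
	return (long)ptr;
}

int main(void)
{
	void *domain = NULL;	/* what irq_domain_add_linear() returns on failure */

	printf("PTR_ERR(NULL) == %ld\n", ptr_err(domain));	/* prints 0 */
	return 0;
}

Returning -ENODEV explicitly makes the probe fail as intended instead of continuing with a NULL IRQ domain.
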
index 8cba7ab..9d9d34e 100644 (file)
 #include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_pci.h>
+#include <linux/pci-ecam.h>
 #include <linux/platform_device.h>
 
-#include "../ecam.h"
-
 static int gen_pci_parse_request_of_pci_ranges(struct device *dev,
                       struct list_head *resources, struct resource **bus_range)
 {
@@ -36,44 +35,34 @@ static int gen_pci_parse_request_of_pci_ranges(struct device *dev,
        if (err)
                return err;
 
+       err = devm_request_pci_bus_resources(dev, resources);
+       if (err)
+               return err;
+
        resource_list_for_each_entry(win, resources) {
-               struct resource *parent, *res = win->res;
+               struct resource *res = win->res;
 
                switch (resource_type(res)) {
                case IORESOURCE_IO:
-                       parent = &ioport_resource;
                        err = pci_remap_iospace(res, iobase);
-                       if (err) {
+                       if (err)
                                dev_warn(dev, "error %d: failed to map resource %pR\n",
                                         err, res);
-                               continue;
-                       }
                        break;
                case IORESOURCE_MEM:
-                       parent = &iomem_resource;
                        res_valid |= !(res->flags & IORESOURCE_PREFETCH);
                        break;
                case IORESOURCE_BUS:
                        *bus_range = res;
-               default:
-                       continue;
+                       break;
                }
-
-               err = devm_request_resource(dev, parent, res);
-               if (err)
-                       goto out_release_res;
-       }
-
-       if (!res_valid) {
-               dev_err(dev, "non-prefetchable memory resource required\n");
-               err = -EINVAL;
-               goto out_release_res;
        }
 
-       return 0;
+       if (res_valid)
+               return 0;
 
-out_release_res:
-       return err;
+       dev_err(dev, "non-prefetchable memory resource required\n");
+       return -EINVAL;
 }
 
 static void gen_pci_unmap_cfg(void *ptr)
@@ -155,7 +144,14 @@ int pci_host_common_probe(struct platform_device *pdev,
 
        pci_fixup_irqs(pci_common_swizzle, of_irq_parse_and_map_pci);
 
-       if (!pci_has_flag(PCI_PROBE_ONLY)) {
+       /*
+        * We insert PCI resources into the iomem_resource and
+        * ioport_resource trees in either pci_bus_claim_resources()
+        * or pci_bus_assign_resources().
+        */
+       if (pci_has_flag(PCI_PROBE_ONLY)) {
+               pci_bus_claim_resources(bus);
+       } else {
                pci_bus_size_bridges(bus);
                pci_bus_assign_resources(bus);
 
index 6eaceab..c05ea9d 100644 (file)
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/of_address.h>
 #include <linux/of_pci.h>
+#include <linux/pci-ecam.h>
 #include <linux/platform_device.h>
 
-#include "../ecam.h"
-
 static struct pci_ecam_ops gen_pci_cfg_cam_bus_ops = {
        .bus_shift      = 16,
        .pci_ops        = {
@@ -46,8 +45,6 @@ static const struct of_device_id gen_pci_of_match[] = {
        { },
 };
 
-MODULE_DEVICE_TABLE(of, gen_pci_of_match);
-
 static int gen_pci_probe(struct platform_device *pdev)
 {
        const struct of_device_id *of_id;
@@ -66,8 +63,4 @@ static struct platform_driver gen_pci_driver = {
        },
        .probe = gen_pci_probe,
 };
-module_platform_driver(gen_pci_driver);
-
-MODULE_DESCRIPTION("Generic PCI host driver");
-MODULE_AUTHOR("Will Deacon <will.deacon@arm.com>");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(gen_pci_driver);
index 7e9b2de..6955ffd 100644 (file)
@@ -732,16 +732,18 @@ static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
 
        pdev = msi_desc_to_pci_dev(msi);
        hbus = info->data;
-       hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
-       if (!hpdev)
+       int_desc = irq_data_get_irq_chip_data(irq_data);
+       if (!int_desc)
                return;
 
-       int_desc = irq_data_get_irq_chip_data(irq_data);
-       if (int_desc) {
-               irq_data->chip_data = NULL;
-               hv_int_desc_free(hpdev, int_desc);
+       irq_data->chip_data = NULL;
+       hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
+       if (!hpdev) {
+               kfree(int_desc);
+               return;
        }
 
+       hv_int_desc_free(hpdev, int_desc);
        put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 }
 
@@ -1657,14 +1659,16 @@ static void hv_pci_onchannelcallback(void *context)
                        continue;
                }
 
+               /* Zero length indicates there are no more packets. */
+               if (ret || !bytes_recvd)
+                       break;
+
                /*
                 * All incoming packets must be at least as large as a
                 * response.
                 */
-               if (bytes_recvd <= sizeof(struct pci_response)) {
-                       kfree(buffer);
-                       return;
-               }
+               if (bytes_recvd <= sizeof(struct pci_response))
+                       continue;
                desc = (struct vmpacket_descriptor *)buffer;
 
                switch (desc->type) {
@@ -1679,8 +1683,7 @@ static void hv_pci_onchannelcallback(void *context)
                        comp_packet->completion_func(comp_packet->compl_ctxt,
                                                     response,
                                                     bytes_recvd);
-                       kfree(buffer);
-                       return;
+                       break;
 
                case VM_PKT_DATA_INBAND:
 
@@ -1727,8 +1730,9 @@ static void hv_pci_onchannelcallback(void *context)
                                desc->type, req_id, bytes_recvd);
                        break;
                }
-               break;
        }
+
+       kfree(buffer);
 }
 
 /**
index 6b8301e..8ba2883 100644 (file)
@@ -17,7 +17,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/irqdomain.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/msi.h>
 #include <linux/of_irq.h>
 #include <linux/of.h>
@@ -360,7 +360,6 @@ static const struct of_device_id ks_pcie_of_match[] = {
        },
        { },
 };
-MODULE_DEVICE_TABLE(of, ks_pcie_of_match);
 
 static int __exit ks_pcie_remove(struct platform_device *pdev)
 {
@@ -439,9 +438,4 @@ static struct platform_driver ks_pcie_driver __refdata = {
                .of_match_table = of_match_ptr(ks_pcie_of_match),
        },
 };
-
-module_platform_driver(ks_pcie_driver);
-
-MODULE_AUTHOR("Murali Karicheri <m-karicheri2@ti.com>");
-MODULE_DESCRIPTION("Keystone PCIe host controller driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(ks_pcie_driver);
index a21e229..114ba81 100644 (file)
@@ -12,7 +12,7 @@
 
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/of_pci.h>
 #include <linux/of_platform.h>
 #include <linux/of_irq.h>
@@ -211,7 +211,6 @@ static const struct of_device_id ls_pcie_of_match[] = {
        { .compatible = "fsl,ls2085a-pcie", .data = &ls2080_drvdata },
        { },
 };
-MODULE_DEVICE_TABLE(of, ls_pcie_of_match);
 
 static int __init ls_add_pcie_port(struct pcie_port *pp,
                                   struct platform_device *pdev)
@@ -275,9 +274,4 @@ static struct platform_driver ls_pcie_driver = {
                .of_match_table = ls_pcie_of_match,
        },
 };
-
-module_platform_driver_probe(ls_pcie_driver, ls_pcie_probe);
-
-MODULE_AUTHOR("Minghuan Lian <Minghuan.Lian@freescale.com>");
-MODULE_DESCRIPTION("Freescale Layerscape PCIe host controller driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver_probe(ls_pcie_driver, ls_pcie_probe);
index 6b451df..307f81d 100644 (file)
@@ -1,6 +1,8 @@
 /*
  * PCIe driver for Marvell Armada 370 and Armada XP SoCs
  *
+ * Author: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+ *
  * This file is licensed under the terms of the GNU General Public
  * License version 2.  This program is licensed "as is" without any
  * warranty of any kind, whether express or implied.
@@ -11,7 +13,7 @@
 #include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/gpio.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/mbus.h>
 #include <linux/msi.h>
 #include <linux/slab.h>
@@ -839,25 +841,22 @@ static struct pci_ops mvebu_pcie_ops = {
 static int mvebu_pcie_setup(int nr, struct pci_sys_data *sys)
 {
        struct mvebu_pcie *pcie = sys_to_pcie(sys);
-       int i;
+       int err, i;
 
        pcie->mem.name = "PCI MEM";
        pcie->realio.name = "PCI I/O";
 
-       if (request_resource(&iomem_resource, &pcie->mem))
-               return 0;
-
-       if (resource_size(&pcie->realio) != 0) {
-               if (request_resource(&ioport_resource, &pcie->realio)) {
-                       release_resource(&pcie->mem);
-                       return 0;
-               }
+       if (resource_size(&pcie->realio) != 0)
                pci_add_resource_offset(&sys->resources, &pcie->realio,
                                        sys->io_offset);
-       }
+
        pci_add_resource_offset(&sys->resources, &pcie->mem, sys->mem_offset);
        pci_add_resource(&sys->resources, &pcie->busn);
 
+       err = devm_request_pci_bus_resources(&pcie->pdev->dev, &sys->resources);
+       if (err)
+               return 0;
+
        for (i = 0; i < pcie->nports; i++) {
                struct mvebu_pcie_port *port = &pcie->ports[i];
 
@@ -1298,7 +1297,6 @@ static const struct of_device_id mvebu_pcie_of_match_table[] = {
        { .compatible = "marvell,kirkwood-pcie", },
        {},
 };
-MODULE_DEVICE_TABLE(of, mvebu_pcie_of_match_table);
 
 static const struct dev_pm_ops mvebu_pcie_pm_ops = {
        SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(mvebu_pcie_suspend, mvebu_pcie_resume)
@@ -1314,8 +1312,4 @@ static struct platform_driver mvebu_pcie_driver = {
        },
        .probe = mvebu_pcie_probe,
 };
-module_platform_driver(mvebu_pcie_driver);
-
-MODULE_AUTHOR("Thomas Petazzoni <thomas.petazzoni@free-electrons.com>");
-MODULE_DESCRIPTION("Marvell EBU PCIe driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(mvebu_pcie_driver);
index 9980a4b..597566f 100644 (file)
@@ -4,6 +4,8 @@
  * Copyright (C) 2013 Renesas Solutions Corp.
  * Copyright (C) 2013 Cogent Embedded, Inc.
  *
+ * Author: Valentine Barshak <valentine.barshak@cogentembedded.com>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -14,7 +16,6 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_pci.h>
 #include <linux/pci.h>
@@ -97,7 +98,6 @@
 struct rcar_pci_priv {
        struct device *dev;
        void __iomem *reg;
-       struct resource io_res;
        struct resource mem_res;
        struct resource *cfg_res;
        unsigned busnr;
@@ -194,6 +194,7 @@ static int rcar_pci_setup(int nr, struct pci_sys_data *sys)
        struct rcar_pci_priv *priv = sys->private_data;
        void __iomem *reg = priv->reg;
        u32 val;
+       int ret;
 
        pm_runtime_enable(priv->dev);
        pm_runtime_get_sync(priv->dev);
@@ -273,8 +274,10 @@ static int rcar_pci_setup(int nr, struct pci_sys_data *sys)
                rcar_pci_setup_errirq(priv);
 
        /* Add PCI resources */
-       pci_add_resource(&sys->resources, &priv->io_res);
        pci_add_resource(&sys->resources, &priv->mem_res);
+       ret = devm_request_pci_bus_resources(priv->dev, &sys->resources);
+       if (ret < 0)
+               return ret;
 
        /* Setup bus number based on platform device id / of bus-range */
        sys->busnr = priv->busnr;
@@ -371,14 +374,6 @@ static int rcar_pci_probe(struct platform_device *pdev)
                return -ENOMEM;
 
        priv->mem_res = *mem_res;
-       /*
-        * The controller does not support/use port I/O,
-        * so setup a dummy port I/O region here.
-        */
-       priv->io_res.start = priv->mem_res.start;
-       priv->io_res.end = priv->mem_res.end;
-       priv->io_res.flags = IORESOURCE_IO;
-
        priv->cfg_res = cfg_res;
 
        priv->irq = platform_get_irq(pdev, 0);
@@ -421,6 +416,7 @@ static int rcar_pci_probe(struct platform_device *pdev)
        hw_private[0] = priv;
        memset(&hw, 0, sizeof(hw));
        hw.nr_controllers = ARRAY_SIZE(hw_private);
+       hw.io_optional = 1;
        hw.private_data = hw_private;
        hw.map_irq = rcar_pci_map_irq;
        hw.ops = &rcar_pci_ops;
@@ -437,8 +433,6 @@ static struct of_device_id rcar_pci_of_match[] = {
        { },
 };
 
-MODULE_DEVICE_TABLE(of, rcar_pci_of_match);
-
 static struct platform_driver rcar_pci_driver = {
        .driver = {
                .name = "pci-rcar-gen2",
@@ -447,9 +441,4 @@ static struct platform_driver rcar_pci_driver = {
        },
        .probe = rcar_pci_probe,
 };
-
-module_platform_driver(rcar_pci_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Renesas R-Car Gen2 internal PCI");
-MODULE_AUTHOR("Valentine Barshak <valentine.barshak@cogentembedded.com>");
+builtin_platform_driver(rcar_pci_driver);
index c388468..6de0757 100644 (file)
@@ -9,6 +9,8 @@
  *
  * Bits taken from arch/arm/mach-dove/pcie.c
  *
+ * Author: Thierry Reding <treding@nvidia.com>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -32,7 +34,7 @@
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/msi.h>
 #include <linux/of_address.h>
 #include <linux/of_pci.h>
 
 #define AFI_PEXBIAS_CTRL_0             0x168
 
-#define RP_VEND_XP     0x00000F00
+#define RP_VEND_XP     0x00000f00
 #define  RP_VEND_XP_DL_UP      (1 << 30)
 
-#define RP_PRIV_MISC   0x00000FE0
-#define  RP_PRIV_MISC_PRSNT_MAP_EP_PRSNT (0xE << 0)
-#define  RP_PRIV_MISC_PRSNT_MAP_EP_ABSNT (0xF << 0)
+#define RP_PRIV_MISC   0x00000fe0
+#define  RP_PRIV_MISC_PRSNT_MAP_EP_PRSNT (0xe << 0)
+#define  RP_PRIV_MISC_PRSNT_MAP_EP_ABSNT (0xf << 0)
 
 #define RP_LINK_CONTROL_STATUS                 0x00000090
 #define  RP_LINK_CONTROL_STATUS_DL_LINK_ACTIVE 0x20000000
 #define  RP_LINK_CONTROL_STATUS_LINKSTAT_MASK  0x3fff0000
 
-#define PADS_CTL_SEL           0x0000009C
+#define PADS_CTL_SEL           0x0000009c
 
-#define PADS_CTL               0x000000A0
+#define PADS_CTL               0x000000a0
 #define  PADS_CTL_IDDQ_1L      (1 << 0)
 #define  PADS_CTL_TX_DATA_EN_1L        (1 << 6)
 #define  PADS_CTL_RX_DATA_EN_1L        (1 << 10)
 
-#define PADS_PLL_CTL_TEGRA20                   0x000000B8
-#define PADS_PLL_CTL_TEGRA30                   0x000000B4
+#define PADS_PLL_CTL_TEGRA20                   0x000000b8
+#define PADS_PLL_CTL_TEGRA30                   0x000000b4
 #define  PADS_PLL_CTL_RST_B4SM                 (1 << 1)
 #define  PADS_PLL_CTL_LOCKDET                  (1 << 8)
 #define  PADS_PLL_CTL_REFCLK_MASK              (0x3 << 16)
 #define  PADS_PLL_CTL_TXCLKREF_DIV5            (1 << 20)
 #define  PADS_PLL_CTL_TXCLKREF_BUF_EN          (1 << 22)
 
-#define PADS_REFCLK_CFG0                       0x000000C8
-#define PADS_REFCLK_CFG1                       0x000000CC
-#define PADS_REFCLK_BIAS                       0x000000D0
+#define PADS_REFCLK_CFG0                       0x000000c8
+#define PADS_REFCLK_CFG1                       0x000000cc
+#define PADS_REFCLK_BIAS                       0x000000d0
 
 /*
  * Fields in PADS_REFCLK_CFG*. Those registers form an array of 16-bit
 #define PADS_REFCLK_CFG_PREDI_SHIFT            8  /* 11:8 */
 #define PADS_REFCLK_CFG_DRVI_SHIFT             12 /* 15:12 */
 
-/* Default value provided by HW engineering is 0xfa5c */
-#define PADS_REFCLK_CFG_VALUE \
-       ( \
-               (0x17 << PADS_REFCLK_CFG_TERM_SHIFT)   | \
-               (0    << PADS_REFCLK_CFG_E_TERM_SHIFT) | \
-               (0xa  << PADS_REFCLK_CFG_PREDI_SHIFT)  | \
-               (0xf  << PADS_REFCLK_CFG_DRVI_SHIFT)     \
-       )
-
 struct tegra_msi {
        struct msi_controller chip;
        DECLARE_BITMAP(used, INT_PCI_MSI_NR);
@@ -252,6 +245,8 @@ struct tegra_pcie_soc_data {
        unsigned int msi_base_shift;
        u32 pads_pll_ctl;
        u32 tx_ref_sel;
+       u32 pads_refclk_cfg0;
+       u32 pads_refclk_cfg1;
        bool has_pex_clkreq_en;
        bool has_pex_bias_ctrl;
        bool has_intr_prsnt_sense;
@@ -274,7 +269,6 @@ struct tegra_pcie {
        struct list_head buses;
        struct resource *cs;
 
-       struct resource all;
        struct resource io;
        struct resource pio;
        struct resource mem;
@@ -623,30 +617,21 @@ static int tegra_pcie_setup(int nr, struct pci_sys_data *sys)
        sys->mem_offset = pcie->offset.mem;
        sys->io_offset = pcie->offset.io;
 
-       err = devm_request_resource(pcie->dev, &pcie->all, &pcie->io);
-       if (err < 0)
-               return err;
-
-       err = devm_request_resource(pcie->dev, &ioport_resource, &pcie->pio);
-       if (err < 0)
-               return err;
-
-       err = devm_request_resource(pcie->dev, &pcie->all, &pcie->mem);
+       err = devm_request_resource(pcie->dev, &iomem_resource, &pcie->io);
        if (err < 0)
                return err;
 
-       err = devm_request_resource(pcie->dev, &pcie->all, &pcie->prefetch);
-       if (err)
-               return err;
-
        pci_add_resource_offset(&sys->resources, &pcie->pio, sys->io_offset);
        pci_add_resource_offset(&sys->resources, &pcie->mem, sys->mem_offset);
        pci_add_resource_offset(&sys->resources, &pcie->prefetch,
                                sys->mem_offset);
        pci_add_resource(&sys->resources, &pcie->busn);
 
-       pci_ioremap_io(pcie->pio.start, pcie->io.start);
+       err = devm_request_pci_bus_resources(pcie->dev, &sys->resources);
+       if (err < 0)
+               return err;
 
+       pci_remap_iospace(&pcie->pio, pcie->io.start);
        return 1;
 }
 
@@ -838,12 +823,6 @@ static int tegra_pcie_phy_enable(struct tegra_pcie *pcie)
        value |= PADS_PLL_CTL_RST_B4SM;
        pads_writel(pcie, value, soc->pads_pll_ctl);
 
-       /* Configure the reference clock driver */
-       value = PADS_REFCLK_CFG_VALUE | (PADS_REFCLK_CFG_VALUE << 16);
-       pads_writel(pcie, value, PADS_REFCLK_CFG0);
-       if (soc->num_ports > 2)
-               pads_writel(pcie, PADS_REFCLK_CFG_VALUE, PADS_REFCLK_CFG1);
-
        /* wait for the PLL to lock */
        err = tegra_pcie_pll_wait(pcie, 500);
        if (err < 0) {
@@ -927,6 +906,7 @@ static int tegra_pcie_port_phy_power_off(struct tegra_pcie_port *port)
 
 static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
 {
+       const struct tegra_pcie_soc_data *soc = pcie->soc_data;
        struct tegra_pcie_port *port;
        int err;
 
@@ -952,6 +932,12 @@ static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
                }
        }
 
+       /* Configure the reference clock driver */
+       pads_writel(pcie, soc->pads_refclk_cfg0, PADS_REFCLK_CFG0);
+
+       if (soc->num_ports > 2)
+               pads_writel(pcie, soc->pads_refclk_cfg1, PADS_REFCLK_CFG1);
+
        return 0;
 }
 
@@ -1822,12 +1808,6 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
        struct resource res;
        int err;
 
-       memset(&pcie->all, 0, sizeof(pcie->all));
-       pcie->all.flags = IORESOURCE_MEM;
-       pcie->all.name = np->full_name;
-       pcie->all.start = ~0;
-       pcie->all.end = 0;
-
        if (of_pci_range_parser_init(&parser, np)) {
                dev_err(pcie->dev, "missing \"ranges\" property\n");
                return -EINVAL;
@@ -1880,18 +1860,8 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
                        }
                        break;
                }
-
-               if (res.start <= pcie->all.start)
-                       pcie->all.start = res.start;
-
-               if (res.end >= pcie->all.end)
-                       pcie->all.end = res.end;
        }
 
-       err = devm_request_resource(pcie->dev, &iomem_resource, &pcie->all);
-       if (err < 0)
-               return err;
-
        err = of_pci_parse_bus_range(np, &pcie->busn);
        if (err < 0) {
                dev_err(pcie->dev, "failed to parse ranges property: %d\n",
@@ -2078,6 +2048,7 @@ static const struct tegra_pcie_soc_data tegra20_pcie_data = {
        .msi_base_shift = 0,
        .pads_pll_ctl = PADS_PLL_CTL_TEGRA20,
        .tx_ref_sel = PADS_PLL_CTL_TXCLKREF_DIV10,
+       .pads_refclk_cfg0 = 0xfa5cfa5c,
        .has_pex_clkreq_en = false,
        .has_pex_bias_ctrl = false,
        .has_intr_prsnt_sense = false,
@@ -2090,6 +2061,8 @@ static const struct tegra_pcie_soc_data tegra30_pcie_data = {
        .msi_base_shift = 8,
        .pads_pll_ctl = PADS_PLL_CTL_TEGRA30,
        .tx_ref_sel = PADS_PLL_CTL_TXCLKREF_BUF_EN,
+       .pads_refclk_cfg0 = 0xfa5cfa5c,
+       .pads_refclk_cfg1 = 0xfa5cfa5c,
        .has_pex_clkreq_en = true,
        .has_pex_bias_ctrl = true,
        .has_intr_prsnt_sense = true,
@@ -2102,6 +2075,7 @@ static const struct tegra_pcie_soc_data tegra124_pcie_data = {
        .msi_base_shift = 8,
        .pads_pll_ctl = PADS_PLL_CTL_TEGRA30,
        .tx_ref_sel = PADS_PLL_CTL_TXCLKREF_BUF_EN,
+       .pads_refclk_cfg0 = 0x44ac44ac,
        .has_pex_clkreq_en = true,
        .has_pex_bias_ctrl = true,
        .has_intr_prsnt_sense = true,
@@ -2115,7 +2089,6 @@ static const struct of_device_id tegra_pcie_of_match[] = {
        { .compatible = "nvidia,tegra20-pcie", .data = &tegra20_pcie_data },
        { },
 };
-MODULE_DEVICE_TABLE(of, tegra_pcie_of_match);
 
 static void *tegra_pcie_ports_seq_start(struct seq_file *s, loff_t *pos)
 {
@@ -2249,8 +2222,6 @@ static int tegra_pcie_probe(struct platform_device *pdev)
        if (err < 0)
                return err;
 
-       pcibios_min_mem = 0;
-
        err = tegra_pcie_get_resources(pcie);
        if (err < 0) {
                dev_err(&pdev->dev, "failed to request resources: %d\n", err);
@@ -2306,8 +2277,4 @@ static struct platform_driver tegra_pcie_driver = {
        },
        .probe = tegra_pcie_probe,
 };
-module_platform_driver(tegra_pcie_driver);
-
-MODULE_AUTHOR("Thierry Reding <treding@nvidia.com>");
-MODULE_DESCRIPTION("NVIDIA Tegra PCIe driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(tegra_pcie_driver);
index 540d030..d50a3dc 100644 (file)
@@ -7,14 +7,13 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/of_pci.h>
 #include <linux/of.h>
+#include <linux/pci-ecam.h>
 #include <linux/platform_device.h>
 
-#include "../ecam.h"
-
 static void set_val(u32 v, int where, int size, u32 *val)
 {
        int shift = (where & 3) * 8;
@@ -360,7 +359,6 @@ static const struct of_device_id thunder_ecam_of_match[] = {
        { .compatible = "cavium,pci-host-thunder-ecam" },
        { },
 };
-MODULE_DEVICE_TABLE(of, thunder_ecam_of_match);
 
 static int thunder_ecam_probe(struct platform_device *pdev)
 {
@@ -374,7 +372,4 @@ static struct platform_driver thunder_ecam_driver = {
        },
        .probe = thunder_ecam_probe,
 };
-module_platform_driver(thunder_ecam_driver);
-
-MODULE_DESCRIPTION("Thunder ECAM PCI host driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(thunder_ecam_driver);
index 9b8ab94..6abaf80 100644 (file)
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/of_address.h>
 #include <linux/of_pci.h>
+#include <linux/pci-ecam.h>
 #include <linux/platform_device.h>
 
-#include "../ecam.h"
-
 #define PEM_CFG_WR 0x28
 #define PEM_CFG_RD 0x30
 
@@ -285,8 +284,9 @@ static int thunder_pem_config_write(struct pci_bus *bus, unsigned int devfn,
        return pci_generic_config_write(bus, devfn, where, size, val);
 }
 
-static int thunder_pem_init(struct device *dev, struct pci_config_window *cfg)
+static int thunder_pem_init(struct pci_config_window *cfg)
 {
+       struct device *dev = cfg->parent;
        resource_size_t bar4_start;
        struct resource *res_pem;
        struct thunder_pem_pci *pem_pci;
@@ -346,7 +346,6 @@ static const struct of_device_id thunder_pem_of_match[] = {
        { .compatible = "cavium,pci-host-thunder-pem" },
        { },
 };
-MODULE_DEVICE_TABLE(of, thunder_pem_of_match);
 
 static int thunder_pem_probe(struct platform_device *pdev)
 {
@@ -360,7 +359,4 @@ static struct platform_driver thunder_pem_driver = {
        },
        .probe = thunder_pem_probe,
 };
-module_platform_driver(thunder_pem_driver);
-
-MODULE_DESCRIPTION("Thunder PEM PCIe host driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(thunder_pem_driver);
index f843a72..f234405 100644 (file)
@@ -80,21 +80,21 @@ static int versatile_pci_parse_request_of_pci_ranges(struct device *dev,
        if (err)
                return err;
 
+       err = devm_request_pci_bus_resources(dev, res);
+       if (err)
+               goto out_release_res;
+
        resource_list_for_each_entry(win, res) {
-               struct resource *parent, *res = win->res;
+               struct resource *res = win->res;
 
                switch (resource_type(res)) {
                case IORESOURCE_IO:
-                       parent = &ioport_resource;
                        err = pci_remap_iospace(res, iobase);
-                       if (err) {
+                       if (err)
                                dev_warn(dev, "error %d: failed to map resource %pR\n",
                                         err, res);
-                               continue;
-                       }
                        break;
                case IORESOURCE_MEM:
-                       parent = &iomem_resource;
                        res_valid |= !(res->flags & IORESOURCE_PREFETCH);
 
                        writel(res->start >> 28, PCI_IMAP(mem));
@@ -102,23 +102,14 @@ static int versatile_pci_parse_request_of_pci_ranges(struct device *dev,
                        mem++;
 
                        break;
-               case IORESOURCE_BUS:
-               default:
-                       continue;
                }
-
-               err = devm_request_resource(dev, parent, res);
-               if (err)
-                       goto out_release_res;
        }
 
-       if (!res_valid) {
-               dev_err(dev, "non-prefetchable memory resource required\n");
-               err = -EINVAL;
-               goto out_release_res;
-       }
+       if (res_valid)
+               return 0;
 
-       return 0;
+       dev_err(dev, "non-prefetchable memory resource required\n");
+       err = -EINVAL;
 
 out_release_res:
        pci_free_resource_list(res);
index ae00ce2..a81273c 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/io.h>
 #include <linux/jiffies.h>
 #include <linux/memblock.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
@@ -540,14 +540,20 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev)
        if (ret)
                return ret;
 
+       ret = devm_request_pci_bus_resources(&pdev->dev, &res);
+       if (ret)
+               goto error;
+
        ret = xgene_pcie_setup(port, &res, iobase);
        if (ret)
-               return ret;
+               goto error;
 
        bus = pci_create_root_bus(&pdev->dev, 0,
                                        &xgene_pcie_ops, port, &res);
-       if (!bus)
-               return -ENOMEM;
+       if (!bus) {
+               ret = -ENOMEM;
+               goto error;
+       }
 
        pci_scan_child_bus(bus);
        pci_assign_unassigned_bus_resources(bus);
@@ -555,6 +561,10 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, port);
        return 0;
+
+error:
+       pci_free_resource_list(&res);
+       return ret;
 }
 
 static const struct of_device_id xgene_pcie_match_table[] = {
@@ -569,8 +579,4 @@ static struct platform_driver xgene_pcie_driver = {
        },
        .probe = xgene_pcie_probe_bridge,
 };
-module_platform_driver(xgene_pcie_driver);
-
-MODULE_AUTHOR("Tanmay Inamdar <tinamdar@apm.com>");
-MODULE_DESCRIPTION("APM X-Gene PCIe driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(xgene_pcie_driver);
index dbac6fb..2b78376 100644 (file)
@@ -61,6 +61,8 @@
 #define TLP_LOOP                       500
 #define RP_DEVFN                       0
 
+#define LINK_UP_TIMEOUT                        5000
+
 #define INTX_NUM                       4
 
 #define DWORD_MASK                     3
@@ -81,9 +83,30 @@ struct tlp_rp_regpair_t {
        u32 reg1;
 };
 
+static inline void cra_writel(struct altera_pcie *pcie, const u32 value,
+                             const u32 reg)
+{
+       writel_relaxed(value, pcie->cra_base + reg);
+}
+
+static inline u32 cra_readl(struct altera_pcie *pcie, const u32 reg)
+{
+       return readl_relaxed(pcie->cra_base + reg);
+}
+
+static bool altera_pcie_link_is_up(struct altera_pcie *pcie)
+{
+       return !!((cra_readl(pcie, RP_LTSSM) & RP_LTSSM_MASK) == LTSSM_L0);
+}
+
 static void altera_pcie_retrain(struct pci_dev *dev)
 {
        u16 linkcap, linkstat;
+       struct altera_pcie *pcie = dev->bus->sysdata;
+       int timeout = 0;
+
+       if (!altera_pcie_link_is_up(pcie))
+               return;
 
        /*
         * Set the retrain bit if the PCIe rootport support > 2.5GB/s, but
@@ -95,9 +118,16 @@ static void altera_pcie_retrain(struct pci_dev *dev)
                return;
 
        pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &linkstat);
-       if ((linkstat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB)
+       if ((linkstat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) {
                pcie_capability_set_word(dev, PCI_EXP_LNKCTL,
                                         PCI_EXP_LNKCTL_RL);
+               while (!altera_pcie_link_is_up(pcie)) {
+                       timeout++;
+                       if (timeout > LINK_UP_TIMEOUT)
+                               break;
+                       udelay(5);
+               }
+       }
 }
 DECLARE_PCI_FIXUP_EARLY(0x1172, PCI_ANY_ID, altera_pcie_retrain);
 
@@ -120,17 +150,6 @@ static bool altera_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int  devfn,
        return false;
 }
 
-static inline void cra_writel(struct altera_pcie *pcie, const u32 value,
-                             const u32 reg)
-{
-       writel_relaxed(value, pcie->cra_base + reg);
-}
-
-static inline u32 cra_readl(struct altera_pcie *pcie, const u32 reg)
-{
-       return readl_relaxed(pcie->cra_base + reg);
-}
-
 static void tlp_write_tx(struct altera_pcie *pcie,
                         struct tlp_rp_regpair_t *tlp_rp_regdata)
 {
@@ -139,11 +158,6 @@ static void tlp_write_tx(struct altera_pcie *pcie,
        cra_writel(pcie, tlp_rp_regdata->ctrl, RP_TX_CNTRL);
 }
 
-static bool altera_pcie_link_is_up(struct altera_pcie *pcie)
-{
-       return !!((cra_readl(pcie, RP_LTSSM) & RP_LTSSM_MASK) == LTSSM_L0);
-}
-
 static bool altera_pcie_valid_config(struct altera_pcie *pcie,
                                     struct pci_bus *bus, int dev)
 {
@@ -415,11 +429,6 @@ static void altera_pcie_isr(struct irq_desc *desc)
        chained_irq_exit(chip, desc);
 }
 
-static void altera_pcie_release_of_pci_ranges(struct altera_pcie *pcie)
-{
-       pci_free_resource_list(&pcie->resources);
-}
-
 static int altera_pcie_parse_request_of_pci_ranges(struct altera_pcie *pcie)
 {
        int err, res_valid = 0;
@@ -432,33 +441,25 @@ static int altera_pcie_parse_request_of_pci_ranges(struct altera_pcie *pcie)
        if (err)
                return err;
 
+       err = devm_request_pci_bus_resources(dev, &pcie->resources);
+       if (err)
+               goto out_release_res;
+
        resource_list_for_each_entry(win, &pcie->resources) {
-               struct resource *parent, *res = win->res;
+               struct resource *res = win->res;
 
-               switch (resource_type(res)) {
-               case IORESOURCE_MEM:
-                       parent = &iomem_resource;
+               if (resource_type(res) == IORESOURCE_MEM)
                        res_valid |= !(res->flags & IORESOURCE_PREFETCH);
-                       break;
-               default:
-                       continue;
-               }
-
-               err = devm_request_resource(dev, parent, res);
-               if (err)
-                       goto out_release_res;
        }
 
-       if (!res_valid) {
-               dev_err(dev, "non-prefetchable memory resource required\n");
-               err = -EINVAL;
-               goto out_release_res;
-       }
+       if (res_valid)
+               return 0;
 
-       return 0;
+       dev_err(dev, "non-prefetchable memory resource required\n");
+       err = -EINVAL;
 
 out_release_res:
-       altera_pcie_release_of_pci_ranges(pcie);
+       pci_free_resource_list(&pcie->resources);
        return err;
 }
 
index 5572356..0f4f570 100644 (file)
@@ -5,6 +5,9 @@
  *
  * Copyright (C) 2016 Marvell Technology Group Ltd.
  *
+ * Author: Yehuda Yitshak <yehuday@marvell.com>
+ * Author: Shadi Ammouri <shadi@marvell.com>
+ *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
  * warranty of any kind, whether express or implied.
@@ -14,7 +17,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/of.h>
 #include <linux/pci.h>
 #include <linux/phy/phy.h>
@@ -244,7 +247,6 @@ static const struct of_device_id armada8k_pcie_of_match[] = {
        { .compatible = "marvell,armada8k-pcie", },
        {},
 };
-MODULE_DEVICE_TABLE(of, armada8k_pcie_of_match);
 
 static struct platform_driver armada8k_pcie_driver = {
        .probe          = armada8k_pcie_probe,
@@ -253,10 +255,4 @@ static struct platform_driver armada8k_pcie_driver = {
                .of_match_table = of_match_ptr(armada8k_pcie_of_match),
        },
 };
-
-module_platform_driver(armada8k_pcie_driver);
-
-MODULE_DESCRIPTION("Armada 8k PCIe host controller driver");
-MODULE_AUTHOR("Yehuda Yitshak <yehuday@marvell.com>");
-MODULE_AUTHOR("Shadi Ammouri <shadi@marvell.com>");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(armada8k_pcie_driver);
diff --git a/drivers/pci/host/pcie-artpec6.c b/drivers/pci/host/pcie-artpec6.c
new file mode 100644 (file)
index 0000000..16ba70b
--- /dev/null
+++ b/drivers/pci/host/pcie-artpec6.c
@@ -0,0 +1,280 @@
+/*
+ * PCIe host controller driver for Axis ARTPEC-6 SoC
+ *
+ * Author: Niklas Cassel <niklas.cassel@axis.com>
+ *
+ * Based on work done by Phil Edworthy <phil@edworthys.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/resource.h>
+#include <linux/signal.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/syscon.h>
+#include <linux/regmap.h>
+
+#include "pcie-designware.h"
+
+#define to_artpec6_pcie(x)     container_of(x, struct artpec6_pcie, pp)
+
+struct artpec6_pcie {
+       struct pcie_port        pp;
+       struct regmap           *regmap;
+       void __iomem            *phy_base;
+};
+
+/* PCIe Port Logic registers (memory-mapped) */
+#define PL_OFFSET                      0x700
+#define PCIE_PHY_DEBUG_R0              (PL_OFFSET + 0x28)
+#define PCIE_PHY_DEBUG_R1              (PL_OFFSET + 0x2c)
+
+#define MISC_CONTROL_1_OFF             (PL_OFFSET + 0x1bc)
+#define  DBI_RO_WR_EN                  1
+
+/* ARTPEC-6 specific registers */
+#define PCIECFG                                0x18
+#define  PCIECFG_DBG_OEN               (1 << 24)
+#define  PCIECFG_CORE_RESET_REQ                (1 << 21)
+#define  PCIECFG_LTSSM_ENABLE          (1 << 20)
+#define  PCIECFG_CLKREQ_B              (1 << 11)
+#define  PCIECFG_REFCLK_ENABLE         (1 << 10)
+#define  PCIECFG_PLL_ENABLE            (1 << 9)
+#define  PCIECFG_PCLK_ENABLE           (1 << 8)
+#define  PCIECFG_RISRCREN              (1 << 4)
+#define  PCIECFG_MODE_TX_DRV_EN                (1 << 3)
+#define  PCIECFG_CISRREN               (1 << 2)
+#define  PCIECFG_MACRO_ENABLE          (1 << 0)
+
+#define NOCCFG                         0x40
+#define NOCCFG_ENABLE_CLK_PCIE         (1 << 4)
+#define NOCCFG_POWER_PCIE_IDLEACK      (1 << 3)
+#define NOCCFG_POWER_PCIE_IDLE         (1 << 2)
+#define NOCCFG_POWER_PCIE_IDLEREQ      (1 << 1)
+
+#define PHY_STATUS                     0x118
+#define PHY_COSPLLLOCK                 (1 << 0)
+
+#define ARTPEC6_CPU_TO_BUS_ADDR                0x0fffffff
+
+static int artpec6_pcie_establish_link(struct pcie_port *pp)
+{
+       struct artpec6_pcie *artpec6_pcie = to_artpec6_pcie(pp);
+       u32 val;
+       unsigned int retries;
+
+       /* Hold DW core in reset */
+       regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+       val |= PCIECFG_CORE_RESET_REQ;
+       regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+
+       regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+       val |=  PCIECFG_RISRCREN |      /* Receiver term. 50 Ohm */
+               PCIECFG_MODE_TX_DRV_EN |
+               PCIECFG_CISRREN |       /* Reference clock term. 100 Ohm */
+               PCIECFG_MACRO_ENABLE;
+       val |= PCIECFG_REFCLK_ENABLE;
+       val &= ~PCIECFG_DBG_OEN;
+       val &= ~PCIECFG_CLKREQ_B;
+       regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+       usleep_range(5000, 6000);
+
+       regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+       val |= NOCCFG_ENABLE_CLK_PCIE;
+       regmap_write(artpec6_pcie->regmap, NOCCFG, val);
+       usleep_range(20, 30);
+
+       regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+       val |= PCIECFG_PCLK_ENABLE | PCIECFG_PLL_ENABLE;
+       regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+       usleep_range(6000, 7000);
+
+       regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+       val &= ~NOCCFG_POWER_PCIE_IDLEREQ;
+       regmap_write(artpec6_pcie->regmap, NOCCFG, val);
+
+       retries = 50;
+       do {
+               usleep_range(1000, 2000);
+               regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+               retries--;
+       } while (retries &&
+               (val & (NOCCFG_POWER_PCIE_IDLEACK | NOCCFG_POWER_PCIE_IDLE)));
+
+       retries = 50;
+       do {
+               usleep_range(1000, 2000);
+               val = readl(artpec6_pcie->phy_base + PHY_STATUS);
+               retries--;
+       } while (retries && !(val & PHY_COSPLLLOCK));
+
+       /* Take DW core out of reset */
+       regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+       val &= ~PCIECFG_CORE_RESET_REQ;
+       regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+       usleep_range(100, 200);
+
+       /*
+        * Enable writing to config regs. This is required as the Synopsys
+        * driver changes the class code. That register needs DBI write enable.
+        */
+       writel(DBI_RO_WR_EN, pp->dbi_base + MISC_CONTROL_1_OFF);
+
+       pp->io_base &= ARTPEC6_CPU_TO_BUS_ADDR;
+       pp->mem_base &= ARTPEC6_CPU_TO_BUS_ADDR;
+       pp->cfg0_base &= ARTPEC6_CPU_TO_BUS_ADDR;
+       pp->cfg1_base &= ARTPEC6_CPU_TO_BUS_ADDR;
+
+       /* setup root complex */
+       dw_pcie_setup_rc(pp);
+
+       /* assert LTSSM enable */
+       regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+       val |= PCIECFG_LTSSM_ENABLE;
+       regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+
+       /* check if the link is up or not */
+       if (!dw_pcie_wait_for_link(pp))
+               return 0;
+
+       dev_dbg(pp->dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n",
+               readl(pp->dbi_base + PCIE_PHY_DEBUG_R0),
+               readl(pp->dbi_base + PCIE_PHY_DEBUG_R1));
+
+       return -ETIMEDOUT;
+}
+
+static void artpec6_pcie_enable_interrupts(struct pcie_port *pp)
+{
+       if (IS_ENABLED(CONFIG_PCI_MSI))
+               dw_pcie_msi_init(pp);
+}
+
+static void artpec6_pcie_host_init(struct pcie_port *pp)
+{
+       artpec6_pcie_establish_link(pp);
+       artpec6_pcie_enable_interrupts(pp);
+}
+
+static int artpec6_pcie_link_up(struct pcie_port *pp)
+{
+       u32 rc;
+
+       /*
+        * Get status from Synopsys IP
+        * link is debug bit 36, debug register 1 starts at bit 32
+        */
+       rc = readl(pp->dbi_base + PCIE_PHY_DEBUG_R1) & (0x1 << (36 - 32));
+       if (rc)
+               return 1;
+
+       return 0;
+}
+
+static struct pcie_host_ops artpec6_pcie_host_ops = {
+       .link_up = artpec6_pcie_link_up,
+       .host_init = artpec6_pcie_host_init,
+};
+
+static irqreturn_t artpec6_pcie_msi_handler(int irq, void *arg)
+{
+       struct pcie_port *pp = arg;
+
+       return dw_handle_msi_irq(pp);
+}
+
+static int __init artpec6_add_pcie_port(struct pcie_port *pp,
+                                       struct platform_device *pdev)
+{
+       int ret;
+
+       if (IS_ENABLED(CONFIG_PCI_MSI)) {
+               pp->msi_irq = platform_get_irq_byname(pdev, "msi");
+               if (pp->msi_irq <= 0) {
+                       dev_err(&pdev->dev, "failed to get MSI irq\n");
+                       return -ENODEV;
+               }
+
+               ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+                                      artpec6_pcie_msi_handler,
+                                      IRQF_SHARED | IRQF_NO_THREAD,
+                                      "artpec6-pcie-msi", pp);
+               if (ret) {
+                       dev_err(&pdev->dev, "failed to request MSI irq\n");
+                       return ret;
+               }
+       }
+
+       pp->root_bus_nr = -1;
+       pp->ops = &artpec6_pcie_host_ops;
+
+       ret = dw_pcie_host_init(pp);
+       if (ret) {
+               dev_err(&pdev->dev, "failed to initialize host\n");
+               return ret;
+       }
+
+       return 0;
+}
+
+static int artpec6_pcie_probe(struct platform_device *pdev)
+{
+       struct artpec6_pcie *artpec6_pcie;
+       struct pcie_port *pp;
+       struct resource *dbi_base;
+       struct resource *phy_base;
+       int ret;
+
+       artpec6_pcie = devm_kzalloc(&pdev->dev, sizeof(*artpec6_pcie),
+                                   GFP_KERNEL);
+       if (!artpec6_pcie)
+               return -ENOMEM;
+
+       pp = &artpec6_pcie->pp;
+       pp->dev = &pdev->dev;
+
+       dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
+       pp->dbi_base = devm_ioremap_resource(&pdev->dev, dbi_base);
+       if (IS_ERR(pp->dbi_base))
+               return PTR_ERR(pp->dbi_base);
+
+       phy_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "phy");
+       artpec6_pcie->phy_base = devm_ioremap_resource(&pdev->dev, phy_base);
+       if (IS_ERR(artpec6_pcie->phy_base))
+               return PTR_ERR(artpec6_pcie->phy_base);
+
+       artpec6_pcie->regmap =
+               syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+                                               "axis,syscon-pcie");
+       if (IS_ERR(artpec6_pcie->regmap))
+               return PTR_ERR(artpec6_pcie->regmap);
+
+       ret = artpec6_add_pcie_port(pp, pdev);
+       if (ret < 0)
+               return ret;
+
+       platform_set_drvdata(pdev, artpec6_pcie);
+       return 0;
+}
+
+static const struct of_device_id artpec6_pcie_of_match[] = {
+       { .compatible = "axis,artpec6-pcie", },
+       {},
+};
+
+static struct platform_driver artpec6_pcie_driver = {
+       .probe = artpec6_pcie_probe,
+       .driver = {
+               .name   = "artpec6-pcie",
+               .of_match_table = artpec6_pcie_of_match,
+       },
+};
+builtin_platform_driver(artpec6_pcie_driver);
index b350099..c8079dc 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/gpio.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/of_gpio.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
@@ -121,7 +121,6 @@ static const struct of_device_id dw_plat_pcie_of_match[] = {
        { .compatible = "snps,dw-pcie", },
        {},
 };
-MODULE_DEVICE_TABLE(of, dw_plat_pcie_of_match);
 
 static struct platform_driver dw_plat_pcie_driver = {
        .driver = {
@@ -130,9 +129,4 @@ static struct platform_driver dw_plat_pcie_driver = {
        },
        .probe = dw_plat_pcie_probe,
 };
-
-module_platform_driver(dw_plat_pcie_driver);
-
-MODULE_AUTHOR("Joao Pinto <Joao.Pinto@synopsys.com>");
-MODULE_DESCRIPTION("Synopsys PCIe host controller glue platform driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(dw_plat_pcie_driver);
index aafd766..12afce1 100644 (file)
@@ -452,6 +452,10 @@ int dw_pcie_host_init(struct pcie_port *pp)
        if (ret)
                return ret;
 
+       ret = devm_request_pci_bus_resources(&pdev->dev, &res);
+       if (ret)
+               goto error;
+
        /* Get the I/O and memory ranges from DT */
        resource_list_for_each_entry(win, &res) {
                switch (resource_type(win->res)) {
@@ -461,11 +465,9 @@ int dw_pcie_host_init(struct pcie_port *pp)
                        pp->io_size = resource_size(pp->io);
                        pp->io_bus_addr = pp->io->start - win->offset;
                        ret = pci_remap_iospace(pp->io, pp->io_base);
-                       if (ret) {
+                       if (ret)
                                dev_warn(pp->dev, "error %d: failed to map resource %pR\n",
                                         ret, pp->io);
-                               continue;
-                       }
                        break;
                case IORESOURCE_MEM:
                        pp->mem = win->res;
@@ -483,8 +485,6 @@ int dw_pcie_host_init(struct pcie_port *pp)
                case IORESOURCE_BUS:
                        pp->busn = win->res;
                        break;
-               default:
-                       continue;
                }
        }
 
@@ -493,7 +493,8 @@ int dw_pcie_host_init(struct pcie_port *pp)
                                        resource_size(pp->cfg));
                if (!pp->dbi_base) {
                        dev_err(pp->dev, "error with ioremap\n");
-                       return -ENOMEM;
+                       ret = -ENOMEM;
+                       goto error;
                }
        }
 
@@ -504,7 +505,8 @@ int dw_pcie_host_init(struct pcie_port *pp)
                                                pp->cfg0_size);
                if (!pp->va_cfg0_base) {
                        dev_err(pp->dev, "error with ioremap in function\n");
-                       return -ENOMEM;
+                       ret = -ENOMEM;
+                       goto error;
                }
        }
 
@@ -513,7 +515,8 @@ int dw_pcie_host_init(struct pcie_port *pp)
                                                pp->cfg1_size);
                if (!pp->va_cfg1_base) {
                        dev_err(pp->dev, "error with ioremap\n");
-                       return -ENOMEM;
+                       ret = -ENOMEM;
+                       goto error;
                }
        }
 
@@ -528,7 +531,8 @@ int dw_pcie_host_init(struct pcie_port *pp)
                                                &dw_pcie_msi_chip);
                        if (!pp->irq_domain) {
                                dev_err(pp->dev, "irq domain init failed\n");
-                               return -ENXIO;
+                               ret = -ENXIO;
+                               goto error;
                        }
 
                        for (i = 0; i < MAX_MSI_IRQS; i++)
@@ -536,7 +540,7 @@ int dw_pcie_host_init(struct pcie_port *pp)
                } else {
                        ret = pp->ops->msi_host_init(pp, &dw_pcie_msi_chip);
                        if (ret < 0)
-                               return ret;
+                               goto error;
                }
        }
 
@@ -552,8 +556,10 @@ int dw_pcie_host_init(struct pcie_port *pp)
        } else
                bus = pci_scan_root_bus(pp->dev, pp->root_bus_nr, &dw_pcie_ops,
                                        pp, &res);
-       if (!bus)
-               return -ENOMEM;
+       if (!bus) {
+               ret = -ENOMEM;
+               goto error;
+       }
 
        if (pp->ops->scan_bus)
                pp->ops->scan_bus(pp);
@@ -571,6 +577,10 @@ int dw_pcie_host_init(struct pcie_port *pp)
 
        pci_bus_add_devices(bus);
        return 0;
+
+error:
+       pci_free_resource_list(&res);
+       return ret;
 }
 
 static int dw_pcie_rd_other_conf(struct pcie_port *pp, struct pci_bus *bus,
index 3e98d4e..7ee9dfc 100644 (file)
@@ -12,7 +12,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/interrupt.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/mfd/syscon.h>
 #include <linux/of_address.h>
 #include <linux/of_pci.h>
@@ -235,9 +235,6 @@ static const struct of_device_id hisi_pcie_of_match[] = {
        {},
 };
 
-
-MODULE_DEVICE_TABLE(of, hisi_pcie_of_match);
-
 static struct platform_driver hisi_pcie_driver = {
        .probe  = hisi_pcie_probe,
        .driver = {
@@ -245,10 +242,4 @@ static struct platform_driver hisi_pcie_driver = {
                   .of_match_table = hisi_pcie_of_match,
        },
 };
-
-module_platform_driver(hisi_pcie_driver);
-
-MODULE_AUTHOR("Zhou Wang <wangzhou1@hisilicon.com>");
-MODULE_AUTHOR("Dacai Zhu <zhudacai@hisilicon.com>");
-MODULE_AUTHOR("Gabriele Paoloni <gabriele.paoloni@huawei.com>");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(hisi_pcie_driver);
index a576aee..e167b2f 100644 (file)
@@ -462,6 +462,10 @@ int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
        if (!pcie || !pcie->dev || !pcie->base)
                return -EINVAL;
 
+       ret = devm_request_pci_bus_resources(pcie->dev, res);
+       if (ret)
+               return ret;
+
        ret = phy_init(pcie->phy);
        if (ret) {
                dev_err(pcie->dev, "unable to initialize PCIe PHY\n");
index 3509218..65db7a2 100644 (file)
@@ -7,6 +7,8 @@
  *  arch/sh/drivers/pci/ops-sh7786.c
  *  Copyright (C) 2009 - 2011  Paul Mundt
  *
+ * Author: Phil Edworthy <phil.edworthy@renesas.com>
+ *
  * This file is licensed under the terms of the GNU General Public
  * License version 2.  This program is licensed "as is" without any
  * warranty of any kind, whether express or implied.
@@ -18,7 +20,7 @@
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/msi.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
@@ -936,12 +938,6 @@ static const struct of_device_id rcar_pcie_of_match[] = {
        { .compatible = "renesas,pcie-r8a7795", .data = rcar_pcie_hw_init },
        {},
 };
-MODULE_DEVICE_TABLE(of, rcar_pcie_of_match);
-
-static void rcar_pcie_release_of_pci_ranges(struct rcar_pcie *pci)
-{
-       pci_free_resource_list(&pci->resources);
-}
 
 static int rcar_pcie_parse_request_of_pci_ranges(struct rcar_pcie *pci)
 {
@@ -955,37 +951,25 @@ static int rcar_pcie_parse_request_of_pci_ranges(struct rcar_pcie *pci)
        if (err)
                return err;
 
+       err = devm_request_pci_bus_resources(dev, &pci->resources);
+       if (err)
+               goto out_release_res;
+
        resource_list_for_each_entry(win, &pci->resources) {
-               struct resource *parent, *res = win->res;
+               struct resource *res = win->res;
 
-               switch (resource_type(res)) {
-               case IORESOURCE_IO:
-                       parent = &ioport_resource;
+               if (resource_type(res) == IORESOURCE_IO) {
                        err = pci_remap_iospace(res, iobase);
-                       if (err) {
+                       if (err)
                                dev_warn(dev, "error %d: failed to map resource %pR\n",
                                         err, res);
-                               continue;
-                       }
-                       break;
-               case IORESOURCE_MEM:
-                       parent = &iomem_resource;
-                       break;
-
-               case IORESOURCE_BUS:
-               default:
-                       continue;
                }
-
-               err = devm_request_resource(dev, parent, res);
-               if (err)
-                       goto out_release_res;
        }
 
        return 0;
 
 out_release_res:
-       rcar_pcie_release_of_pci_ranges(pci);
+       pci_free_resource_list(&pci->resources);
        return err;
 }
 
@@ -1073,8 +1057,4 @@ static struct platform_driver rcar_pcie_driver = {
        },
        .probe = rcar_pcie_probe,
 };
-module_platform_driver(rcar_pcie_driver);
-
-MODULE_AUTHOR("Phil Edworthy <phil.edworthy@renesas.com>");
-MODULE_DESCRIPTION("Renesas R-Car PCIe driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(rcar_pcie_driver);
index 3479d30..0b597d9 100644 (file)
@@ -825,27 +825,33 @@ static int nwl_pcie_probe(struct platform_device *pdev)
 
        err = of_pci_get_host_bridge_resources(node, 0, 0xff, &res, &iobase);
        if (err) {
-               pr_err("Getting bridge resources failed\n");
+               dev_err(pcie->dev, "Getting bridge resources failed\n");
                return err;
        }
 
+       err = devm_request_pci_bus_resources(pcie->dev, &res);
+       if (err)
+               goto error;
+
        err = nwl_pcie_init_irq_domain(pcie);
        if (err) {
                dev_err(pcie->dev, "Failed creating IRQ Domain\n");
-               return err;
+               goto error;
        }
 
        bus = pci_create_root_bus(&pdev->dev, pcie->root_busno,
                                  &nwl_pcie_ops, pcie, &res);
-       if (!bus)
-               return -ENOMEM;
+       if (!bus) {
+               err = -ENOMEM;
+               goto error;
+       }
 
        if (IS_ENABLED(CONFIG_PCI_MSI)) {
                err = nwl_pcie_enable_msi(pcie, bus);
                if (err < 0) {
                        dev_err(&pdev->dev,
                                "failed to enable MSI support: %d\n", err);
-                       return err;
+                       goto error;
                }
        }
        pci_scan_child_bus(bus);
@@ -855,6 +861,10 @@ static int nwl_pcie_probe(struct platform_device *pdev)
        pci_bus_add_devices(bus);
        platform_set_drvdata(pdev, pcie);
        return 0;
+
+error:
+       pci_free_resource_list(&res);
+       return err;
 }
 
 static int nwl_pcie_remove(struct platform_device *pdev)
index 65f0fe0..a30e016 100644 (file)
@@ -550,7 +550,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
        pcie_intc_node = of_get_next_child(node, NULL);
        if (!pcie_intc_node) {
                dev_err(dev, "No PCIe Intc node found\n");
-               return PTR_ERR(pcie_intc_node);
+               return -ENODEV;
        }
 
        port->irq_domain = irq_domain_add_linear(pcie_intc_node, 4,
@@ -558,7 +558,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
                                                 port);
        if (!port->irq_domain) {
                dev_err(dev, "Failed to get a INTx IRQ domain\n");
-               return PTR_ERR(port->irq_domain);
+               return -ENODEV;
        }
 
        /* Setup MSI */
@@ -569,7 +569,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
                                                         &xilinx_pcie_msi_chip);
                if (!port->irq_domain) {
                        dev_err(dev, "Failed to get a MSI IRQ domain\n");
-                       return PTR_ERR(port->irq_domain);
+                       return -ENODEV;
                }
 
                xilinx_pcie_enable_msi(port);
@@ -660,7 +660,6 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
        struct xilinx_pcie_port *port;
        struct device *dev = &pdev->dev;
        struct pci_bus *bus;
-
        int err;
        resource_size_t iobase = 0;
        LIST_HEAD(res);
@@ -694,10 +693,17 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
                dev_err(dev, "Getting bridge resources failed\n");
                return err;
        }
+
+       err = devm_request_pci_bus_resources(dev, &res);
+       if (err)
+               goto error;
+
        bus = pci_create_root_bus(&pdev->dev, 0,
                                  &xilinx_pcie_ops, port, &res);
-       if (!bus)
-               return -ENOMEM;
+       if (!bus) {
+               err = -ENOMEM;
+               goto error;
+       }
 
 #ifdef CONFIG_PCI_MSI
        xilinx_pcie_msi_chip.dev = port->dev;
@@ -712,6 +718,10 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
        platform_set_drvdata(pdev, port);
 
        return 0;
+
+error:
+       pci_free_resource_list(&res);
+       return err;
 }
 
 /**
index fa49f91..6a33ddc 100644 (file)
@@ -675,6 +675,8 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge)
        if (bridge->is_going_away)
                return;
 
+       pm_runtime_get_sync(&bridge->pci_dev->dev);
+
        list_for_each_entry(slot, &bridge->slots, node) {
                struct pci_bus *bus = slot->bus;
                struct pci_dev *dev, *tmp;
@@ -694,6 +696,8 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge)
                        disable_slot(slot);
                }
        }
+
+       pm_runtime_put(&bridge->pci_dev->dev);
 }
 
 /*
index 5c24e93..08e84d6 100644 (file)
@@ -546,6 +546,10 @@ static irqreturn_t pcie_isr(int irq, void *dev_id)
        u8 present;
        bool link;
 
+       /* Interrupts cannot originate from a controller that's asleep */
+       if (pdev->current_state == PCI_D3cold)
+               return IRQ_NONE;
+
        /*
         * In order to guarantee that all interrupt events are
         * serviced, we need to re-inspect Slot Status register after
index a080f44..a02981e 100644 (file)
@@ -4,6 +4,7 @@
  *
  * Copyright (C) 2003-2004 Intel
  * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
+ * Copyright (C) 2016 Christoph Hellwig.
  */
 
 #include <linux/err.h>
@@ -207,6 +208,12 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
        desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag);
 }
 
+static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
+{
+       return desc->mask_base +
+               desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+}
+
 /*
  * This internal function does not flush PCI writes to the device.
  * All users must ensure that they read from the device before either
@@ -217,8 +224,6 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag)
 {
        u32 mask_bits = desc->masked;
-       unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
-                                               PCI_MSIX_ENTRY_VECTOR_CTRL;
 
        if (pci_msi_ignore_mask)
                return 0;
@@ -226,7 +231,7 @@ u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag)
        mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
        if (flag)
                mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
-       writel(mask_bits, desc->mask_base + offset);
+       writel(mask_bits, pci_msix_desc_addr(desc) + PCI_MSIX_ENTRY_VECTOR_CTRL);
 
        return mask_bits;
 }
@@ -284,8 +289,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
        BUG_ON(dev->current_state != PCI_D0);
 
        if (entry->msi_attrib.is_msix) {
-               void __iomem *base = entry->mask_base +
-                       entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+               void __iomem *base = pci_msix_desc_addr(entry);
 
                msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
                msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
@@ -315,9 +319,7 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
        if (dev->current_state != PCI_D0) {
                /* Don't touch the hardware now */
        } else if (entry->msi_attrib.is_msix) {
-               void __iomem *base;
-               base = entry->mask_base +
-                       entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+               void __iomem *base = pci_msix_desc_addr(entry);
 
                writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
                writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
@@ -567,6 +569,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
        entry->msi_attrib.multi_cap     = (control & PCI_MSI_FLAGS_QMASK) >> 1;
        entry->msi_attrib.multiple      = ilog2(__roundup_pow_of_two(nvec));
        entry->nvec_used                = nvec;
+       entry->affinity                 = dev->irq_affinity;
 
        if (control & PCI_MSI_FLAGS_64BIT)
                entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
@@ -678,10 +681,18 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries)
 static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
                              struct msix_entry *entries, int nvec)
 {
+       const struct cpumask *mask = NULL;
        struct msi_desc *entry;
-       int i;
+       int cpu = -1, i;
 
        for (i = 0; i < nvec; i++) {
+               if (dev->irq_affinity) {
+                       cpu = cpumask_next(cpu, dev->irq_affinity);
+                       if (cpu >= nr_cpu_ids)
+                               cpu = cpumask_first(dev->irq_affinity);
+                       mask = cpumask_of(cpu);
+               }
+
                entry = alloc_msi_entry(&dev->dev);
                if (!entry) {
                        if (!i)
@@ -694,10 +705,14 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 
                entry->msi_attrib.is_msix       = 1;
                entry->msi_attrib.is_64         = 1;
-               entry->msi_attrib.entry_nr      = entries[i].entry;
+               if (entries)
+                       entry->msi_attrib.entry_nr = entries[i].entry;
+               else
+                       entry->msi_attrib.entry_nr = i;
                entry->msi_attrib.default_irq   = dev->irq;
                entry->mask_base                = base;
                entry->nvec_used                = 1;
+               entry->affinity                 = mask;
 
                list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
        }
@@ -712,13 +727,11 @@ static void msix_program_entries(struct pci_dev *dev,
        int i = 0;
 
        for_each_pci_msi_entry(entry, dev) {
-               int offset = entries[i].entry * PCI_MSIX_ENTRY_SIZE +
-                                               PCI_MSIX_ENTRY_VECTOR_CTRL;
-
-               entries[i].vector = entry->irq;
-               entry->masked = readl(entry->mask_base + offset);
+               if (entries)
+                       entries[i++].vector = entry->irq;
+               entry->masked = readl(pci_msix_desc_addr(entry) +
+                               PCI_MSIX_ENTRY_VECTOR_CTRL);
                msix_mask_irq(entry, 1);
-               i++;
        }
 }
 
@@ -931,7 +944,7 @@ EXPORT_SYMBOL(pci_msix_vec_count);
 /**
  * pci_enable_msix - configure device's MSI-X capability structure
  * @dev: pointer to the pci_dev data structure of MSI-X device function
- * @entries: pointer to an array of MSI-X entries
+ * @entries: pointer to an array of MSI-X entries (optional)
  * @nvec: number of MSI-X irqs requested for allocation by device driver
  *
  * Setup the MSI-X capability structure of device function with the number
@@ -951,22 +964,21 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
        if (!pci_msi_supported(dev, nvec))
                return -EINVAL;
 
-       if (!entries)
-               return -EINVAL;
-
        nr_entries = pci_msix_vec_count(dev);
        if (nr_entries < 0)
                return nr_entries;
        if (nvec > nr_entries)
                return nr_entries;
 
-       /* Check for any invalid entries */
-       for (i = 0; i < nvec; i++) {
-               if (entries[i].entry >= nr_entries)
-                       return -EINVAL;         /* invalid entry */
-               for (j = i + 1; j < nvec; j++) {
-                       if (entries[i].entry == entries[j].entry)
-                               return -EINVAL; /* duplicate entry */
+       if (entries) {
+               /* Check for any invalid entries */
+               for (i = 0; i < nvec; i++) {
+                       if (entries[i].entry >= nr_entries)
+                               return -EINVAL;         /* invalid entry */
+                       for (j = i + 1; j < nvec; j++) {
+                               if (entries[i].entry == entries[j].entry)
+                                       return -EINVAL; /* duplicate entry */
+                       }
                }
        }
        WARN_ON(!!dev->msix_enabled);
@@ -1026,19 +1038,8 @@ int pci_msi_enabled(void)
 }
 EXPORT_SYMBOL(pci_msi_enabled);
 
-/**
- * pci_enable_msi_range - configure device's MSI capability structure
- * @dev: device to configure
- * @minvec: minimal number of interrupts to configure
- * @maxvec: maximum number of interrupts to configure
- *
- * This function tries to allocate a maximum possible number of interrupts in a
- * range between @minvec and @maxvec. It returns a negative errno if an error
- * occurs. If it succeeds, it returns the actual number of interrupts allocated
- * and updates the @dev's irq member to the lowest new interrupt number;
- * the other interrupt numbers allocated to this device are consecutive.
- **/
-int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
+               unsigned int flags)
 {
        int nvec;
        int rc;
@@ -1061,25 +1062,85 @@ int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
        nvec = pci_msi_vec_count(dev);
        if (nvec < 0)
                return nvec;
-       else if (nvec < minvec)
+       if (nvec < minvec)
                return -EINVAL;
-       else if (nvec > maxvec)
+
+       if (nvec > maxvec)
                nvec = maxvec;
 
-       do {
+       for (;;) {
+               if (!(flags & PCI_IRQ_NOAFFINITY)) {
+                       dev->irq_affinity = irq_create_affinity_mask(&nvec);
+                       if (nvec < minvec)
+                               return -ENOSPC;
+               }
+
                rc = msi_capability_init(dev, nvec);
-               if (rc < 0) {
+               if (rc == 0)
+                       return nvec;
+
+               kfree(dev->irq_affinity);
+               dev->irq_affinity = NULL;
+
+               if (rc < 0)
                        return rc;
-               } else if (rc > 0) {
-                       if (rc < minvec)
+               if (rc < minvec)
+                       return -ENOSPC;
+
+               nvec = rc;
+       }
+}
+
+/**
+ * pci_enable_msi_range - configure device's MSI capability structure
+ * @dev: device to configure
+ * @minvec: minimal number of interrupts to configure
+ * @maxvec: maximum number of interrupts to configure
+ *
+ * This function tries to allocate a maximum possible number of interrupts in a
+ * range between @minvec and @maxvec. It returns a negative errno if an error
+ * occurs. If it succeeds, it returns the actual number of interrupts allocated
+ * and updates the @dev's irq member to the lowest new interrupt number;
+ * the other interrupt numbers allocated to this device are consecutive.
+ **/
+int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+{
+       return __pci_enable_msi_range(dev, minvec, maxvec, PCI_IRQ_NOAFFINITY);
+}
+EXPORT_SYMBOL(pci_enable_msi_range);
+
+static int __pci_enable_msix_range(struct pci_dev *dev,
+               struct msix_entry *entries, int minvec, int maxvec,
+               unsigned int flags)
+{
+       int nvec = maxvec;
+       int rc;
+
+       if (maxvec < minvec)
+               return -ERANGE;
+
+       for (;;) {
+               if (!(flags & PCI_IRQ_NOAFFINITY)) {
+                       dev->irq_affinity = irq_create_affinity_mask(&nvec);
+                       if (nvec < minvec)
                                return -ENOSPC;
-                       nvec = rc;
                }
-       } while (rc);
 
-       return nvec;
+               rc = pci_enable_msix(dev, entries, nvec);
+               if (rc == 0)
+                       return nvec;
+
+               kfree(dev->irq_affinity);
+               dev->irq_affinity = NULL;
+
+               if (rc < 0)
+                       return rc;
+               if (rc < minvec)
+                       return -ENOSPC;
+
+               nvec = rc;
+       }
 }
-EXPORT_SYMBOL(pci_enable_msi_range);
 
 /**
  * pci_enable_msix_range - configure device's MSI-X capability structure
@@ -1097,28 +1158,101 @@ EXPORT_SYMBOL(pci_enable_msi_range);
  * with new allocated MSI-X interrupts.
  **/
 int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
-                              int minvec, int maxvec)
+               int minvec, int maxvec)
 {
-       int nvec = maxvec;
-       int rc;
+       return __pci_enable_msix_range(dev, entries, minvec, maxvec,
+                       PCI_IRQ_NOAFFINITY);
+}
+EXPORT_SYMBOL(pci_enable_msix_range);
 
-       if (maxvec < minvec)
-               return -ERANGE;
+/**
+ * pci_alloc_irq_vectors - allocate multiple IRQs for a device
+ * @dev:               PCI device to operate on
+ * @min_vecs:          minimum number of vectors required (must be >= 1)
+ * @max_vecs:          maximum (desired) number of vectors
+ * @flags:             flags or quirks for the allocation
+ *
+ * Allocate up to @max_vecs interrupt vectors for @dev, using MSI-X or MSI
+ * vectors if available, and fall back to a single legacy vector
+ * if neither is available.  Return the number of vectors allocated
+ * (which might be smaller than @max_vecs) if successful, or a negative
+ * error code on error.  If fewer than @min_vecs interrupt vectors are
+ * available for @dev the function will fail with -ENOSPC.
+ *
+ * To get the Linux IRQ number for a given vector, suitable for passing to
+ * request_irq(), use the pci_irq_vector() helper.
+ */
+int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+               unsigned int max_vecs, unsigned int flags)
+{
+       int vecs = -ENOSPC;
 
-       do {
-               rc = pci_enable_msix(dev, entries, nvec);
-               if (rc < 0) {
-                       return rc;
-               } else if (rc > 0) {
-                       if (rc < minvec)
-                               return -ENOSPC;
-                       nvec = rc;
+       if (!(flags & PCI_IRQ_NOMSIX)) {
+               vecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs,
+                               flags);
+               if (vecs > 0)
+                       return vecs;
+       }
+
+       if (!(flags & PCI_IRQ_NOMSI)) {
+               vecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, flags);
+               if (vecs > 0)
+                       return vecs;
+       }
+
+       /* use legacy irq if allowed */
+       if (!(flags & PCI_IRQ_NOLEGACY) && min_vecs == 1)
+               return 1;
+       return vecs;
+}
+EXPORT_SYMBOL(pci_alloc_irq_vectors);
+
+/**
+ * pci_free_irq_vectors - free previously allocated IRQs for a device
+ * @dev:               PCI device to operate on
+ *
+ * Undoes the allocations and enabling in pci_alloc_irq_vectors().
+ */
+void pci_free_irq_vectors(struct pci_dev *dev)
+{
+       pci_disable_msix(dev);
+       pci_disable_msi(dev);
+}
+EXPORT_SYMBOL(pci_free_irq_vectors);
+
+/**
+ * pci_irq_vector - return Linux IRQ number of a device vector
+ * @dev: PCI device to operate on
+ * @nr: device-relative interrupt vector index (0-based).
+ */
+int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
+{
+       if (dev->msix_enabled) {
+               struct msi_desc *entry;
+               int i = 0;
+
+               for_each_pci_msi_entry(entry, dev) {
+                       if (i == nr)
+                               return entry->irq;
+                       i++;
                }
-       } while (rc);
+               WARN_ON_ONCE(1);
+               return -EINVAL;
+       }
 
-       return nvec;
+       if (dev->msi_enabled) {
+               struct msi_desc *entry = first_pci_msi_entry(dev);
+
+               if (WARN_ON_ONCE(nr >= entry->nvec_used))
+                       return -EINVAL;
+       } else {
+               if (WARN_ON_ONCE(nr > 0))
+                       return -EINVAL;
+       }
+
+       return dev->irq + nr;
 }
-EXPORT_SYMBOL(pci_enable_msix_range);
+EXPORT_SYMBOL(pci_irq_vector);
 
 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc)
 {
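
Taken together, the msi.c changes introduce pci_alloc_irq_vectors(), pci_irq_vector() and pci_free_irq_vectors() as the new driver-facing way to set up interrupts.  A hedged usage sketch (the foo_* driver names are hypothetical; a flags value of 0 means "MSI-X, MSI or legacy, with affinity spreading"):

#include <linux/interrupt.h>
#include <linux/pci.h>

static irqreturn_t foo_irq_handler(int irq, void *data)
{
	/* hypothetical per-vector handler */
	return IRQ_HANDLED;
}

static int foo_setup_irqs(struct pci_dev *pdev)
{
	int nvecs, i, err;

	/* MSI-X is tried first, then MSI, then the legacy INTx line. */
	nvecs = pci_alloc_irq_vectors(pdev, 1, 8, 0);
	if (nvecs < 0)
		return nvecs;

	for (i = 0; i < nvecs; i++) {
		/* pci_irq_vector() maps a vector index to a Linux IRQ number */
		err = request_irq(pci_irq_vector(pdev, i), foo_irq_handler,
				  0, "foo", pdev);
		if (err)
			goto err_free;
	}
	return nvecs;

err_free:
	while (--i >= 0)
		free_irq(pci_irq_vector(pdev, i), pdev);
	pci_free_irq_vectors(pdev);
	return err;
}

If only the legacy interrupt is available the allocation returns 1 and, per the code above, pci_irq_vector(pdev, 0) is simply pdev->irq.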
index d7ffd66..e39a67c 100644 (file)
@@ -777,7 +777,7 @@ static int pci_pm_suspend_noirq(struct device *dev)
 
        if (!pci_dev->state_saved) {
                pci_save_state(pci_dev);
-               if (!pci_has_subordinate(pci_dev))
+               if (pci_power_manageable(pci_dev))
                        pci_prepare_to_sleep(pci_dev);
        }
 
@@ -1144,7 +1144,6 @@ static int pci_pm_runtime_suspend(struct device *dev)
                return -ENOSYS;
 
        pci_dev->state_saved = false;
-       pci_dev->no_d3cold = false;
        error = pm->runtime_suspend(dev);
        if (error) {
                /*
@@ -1161,8 +1160,6 @@ static int pci_pm_runtime_suspend(struct device *dev)
 
                return error;
        }
-       if (!pci_dev->d3cold_allowed)
-               pci_dev->no_d3cold = true;
 
        pci_fixup_device(pci_fixup_suspend, pci_dev);
 
index d319a9c..bcd10c7 100644 (file)
@@ -406,6 +406,11 @@ static ssize_t d3cold_allowed_store(struct device *dev,
                return -EINVAL;
 
        pdev->d3cold_allowed = !!val;
+       if (pdev->d3cold_allowed)
+               pci_d3cold_enable(pdev);
+       else
+               pci_d3cold_disable(pdev);
+
        pm_runtime_resume(dev);
 
        return count;
index badbddc..aab9d51 100644 (file)
@@ -7,8 +7,10 @@
  *     Copyright 1997 -- 2000 Martin Mares <mj@ucw.cz>
  */
 
+#include <linux/acpi.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
+#include <linux/dmi.h>
 #include <linux/init.h>
 #include <linux/of.h>
 #include <linux/of_pci.h>
@@ -25,7 +27,9 @@
 #include <linux/device.h>
 #include <linux/pm_runtime.h>
 #include <linux/pci_hotplug.h>
+#include <linux/vmalloc.h>
 #include <asm/setup.h>
+#include <asm/dma.h>
 #include <linux/aer.h>
 #include "pci.h"
 
@@ -81,6 +85,9 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
 unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
 unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
 
+#define DEFAULT_HOTPLUG_BUS_SIZE       1
+unsigned long pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE;
+
 enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_DEFAULT;
 
 /*
@@ -101,6 +108,21 @@ unsigned int pcibios_max_latency = 255;
 /* If set, the PCIe ARI capability will not be used. */
 static bool pcie_ari_disabled;
 
+/* Disable bridge_d3 for all PCIe ports */
+static bool pci_bridge_d3_disable;
+/* Force bridge_d3 for all PCIe ports */
+static bool pci_bridge_d3_force;
+
+static int __init pcie_port_pm_setup(char *str)
+{
+       if (!strcmp(str, "off"))
+               pci_bridge_d3_disable = true;
+       else if (!strcmp(str, "force"))
+               pci_bridge_d3_force = true;
+       return 1;
+}
+__setup("pcie_port_pm=", pcie_port_pm_setup);
+
 /**
  * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children
  * @bus: pointer to PCI bus structure to search
@@ -2155,6 +2177,164 @@ void pci_config_pm_runtime_put(struct pci_dev *pdev)
                pm_runtime_put_sync(parent);
 }
 
+/**
+ * pci_bridge_d3_possible - Is it possible to put the bridge into D3
+ * @bridge: Bridge to check
+ *
+ * This function checks if it is possible to move the bridge to D3.
+ * Currently we only allow D3 for recent enough PCIe ports.
+ */
+static bool pci_bridge_d3_possible(struct pci_dev *bridge)
+{
+       unsigned int year;
+
+       if (!pci_is_pcie(bridge))
+               return false;
+
+       switch (pci_pcie_type(bridge)) {
+       case PCI_EXP_TYPE_ROOT_PORT:
+       case PCI_EXP_TYPE_UPSTREAM:
+       case PCI_EXP_TYPE_DOWNSTREAM:
+               if (pci_bridge_d3_disable)
+                       return false;
+               if (pci_bridge_d3_force)
+                       return true;
+
+               /*
+                * It should be safe to put PCIe ports from 2015 or newer
+                * into D3.
+                */
+               if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
+                   year >= 2015) {
+                       return true;
+               }
+               break;
+       }
+
+       return false;
+}
+
+static int pci_dev_check_d3cold(struct pci_dev *dev, void *data)
+{
+       bool *d3cold_ok = data;
+       bool no_d3cold;
+
+       /*
+        * The device needs to be allowed to go D3cold and, if it is wake
+        * capable, it must be able to signal wakeup (PME) from D3cold.
+        */
+       no_d3cold = dev->no_d3cold || !dev->d3cold_allowed ||
+               (device_may_wakeup(&dev->dev) && !pci_pme_capable(dev, PCI_D3cold)) ||
+               !pci_power_manageable(dev);
+
+       *d3cold_ok = !no_d3cold;
+
+       return no_d3cold;
+}
+
+/*
+ * pci_bridge_d3_update - Update bridge D3 capabilities
+ * @dev: PCI device which is changed
+ * @remove: Is the device being removed
+ *
+ * Update upstream bridge PM capabilities depending on whether the device's
+ * PM configuration was changed or the device is being removed.  The change
+ * is also propagated upstream.
+ */
+static void pci_bridge_d3_update(struct pci_dev *dev, bool remove)
+{
+       struct pci_dev *bridge;
+       bool d3cold_ok = true;
+
+       bridge = pci_upstream_bridge(dev);
+       if (!bridge || !pci_bridge_d3_possible(bridge))
+               return;
+
+       pci_dev_get(bridge);
+       /*
+        * If the device is removed we do not care about its D3cold
+        * capabilities.
+        */
+       if (!remove)
+               pci_dev_check_d3cold(dev, &d3cold_ok);
+
+       if (d3cold_ok) {
+               /*
+                * We need to go through all children to find out if all of
+                * them can still go to D3cold.
+                */
+               pci_walk_bus(bridge->subordinate, pci_dev_check_d3cold,
+                            &d3cold_ok);
+       }
+
+       if (bridge->bridge_d3 != d3cold_ok) {
+               bridge->bridge_d3 = d3cold_ok;
+               /* Propagate change to upstream bridges */
+               pci_bridge_d3_update(bridge, false);
+       }
+
+       pci_dev_put(bridge);
+}
+
+/**
+ * pci_bridge_d3_device_changed - Update bridge D3 capabilities on change
+ * @dev: PCI device that was changed
+ *
+ * If a device is added, or its PM configuration (such as whether it is
+ * allowed to enter D3cold) is changed, this function updates the upstream
+ * bridge's PM capabilities accordingly.
+ */
+void pci_bridge_d3_device_changed(struct pci_dev *dev)
+{
+       pci_bridge_d3_update(dev, false);
+}
+
+/**
+ * pci_bridge_d3_device_removed - Update bridge D3 capabilities on remove
+ * @dev: PCI device being removed
+ *
+ * Function updates upstream bridge PM capabilities based on other devices
+ * still left on the bus.
+ */
+void pci_bridge_d3_device_removed(struct pci_dev *dev)
+{
+       pci_bridge_d3_update(dev, true);
+}
+
+/**
+ * pci_d3cold_enable - Enable D3cold for device
+ * @dev: PCI device to handle
+ *
+ * This function can be used in drivers to enable D3cold from the device
+ * they handle.  It also updates upstream PCI bridge PM capabilities
+ * accordingly.
+ */
+void pci_d3cold_enable(struct pci_dev *dev)
+{
+       if (dev->no_d3cold) {
+               dev->no_d3cold = false;
+               pci_bridge_d3_device_changed(dev);
+       }
+}
+EXPORT_SYMBOL_GPL(pci_d3cold_enable);
+
+/**
+ * pci_d3cold_disable - Disable D3cold for device
+ * @dev: PCI device to handle
+ *
+ * This function can be used in drivers to disable D3cold from the device
+ * they handle.  It also updates upstream PCI bridge PM capabilities
+ * accordingly.
+ */
+void pci_d3cold_disable(struct pci_dev *dev)
+{
+       if (!dev->no_d3cold) {
+               dev->no_d3cold = true;
+               pci_bridge_d3_device_changed(dev);
+       }
+}
+EXPORT_SYMBOL_GPL(pci_d3cold_disable);
+
 /**
  * pci_pm_init - Initialize PM functions of given PCI device
  * @dev: PCI device to handle.
@@ -2189,6 +2369,7 @@ void pci_pm_init(struct pci_dev *dev)
        dev->pm_cap = pm;
        dev->d3_delay = PCI_PM_D3_WAIT;
        dev->d3cold_delay = PCI_PM_D3COLD_WAIT;
+       dev->bridge_d3 = pci_bridge_d3_possible(dev);
        dev->d3cold_allowed = true;
 
        dev->d1_support = false;
@@ -3165,6 +3346,23 @@ int __weak pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr)
 #endif
 }
 
+/**
+ *     pci_unmap_iospace - Unmap the memory mapped I/O space
+ *     @res: resource to be unmapped
+ *
+ *     Unmap the I/O range described by @res from the CPU virtual address space.
+ *     Only architectures that have memory mapped IO functions defined
+ *     (and the PCI_IOBASE value defined) should call this function.
+ */
+void pci_unmap_iospace(struct resource *res)
+{
+#if defined(PCI_IOBASE) && defined(CONFIG_MMU)
+       unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start;
+
+       unmap_kernel_range(vaddr, resource_size(res));
+#endif
+}
+
 static void __pci_set_master(struct pci_dev *dev, bool enable)
 {
        u16 old_cmd, cmd;
@@ -4755,6 +4953,7 @@ static DEFINE_SPINLOCK(resource_alignment_lock);
 static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev)
 {
        int seg, bus, slot, func, align_order, count;
+       unsigned short vendor, device, subsystem_vendor, subsystem_device;
        resource_size_t align = 0;
        char *p;
 
@@ -4768,28 +4967,55 @@ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev)
                } else {
                        align_order = -1;
                }
-               if (sscanf(p, "%x:%x:%x.%x%n",
-                       &seg, &bus, &slot, &func, &count) != 4) {
-                       seg = 0;
-                       if (sscanf(p, "%x:%x.%x%n",
-                                       &bus, &slot, &func, &count) != 3) {
-                               /* Invalid format */
-                               printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n",
-                                       p);
+               if (strncmp(p, "pci:", 4) == 0) {
+                       /* PCI vendor/device (subvendor/subdevice) ids are specified */
+                       p += 4;
+                       if (sscanf(p, "%hx:%hx:%hx:%hx%n",
+                               &vendor, &device, &subsystem_vendor, &subsystem_device, &count) != 4) {
+                               if (sscanf(p, "%hx:%hx%n", &vendor, &device, &count) != 2) {
+                                       printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: pci:%s\n",
+                                               p);
+                                       break;
+                               }
+                               subsystem_vendor = subsystem_device = 0;
+                       }
+                       p += count;
+                       if ((!vendor || (vendor == dev->vendor)) &&
+                               (!device || (device == dev->device)) &&
+                               (!subsystem_vendor || (subsystem_vendor == dev->subsystem_vendor)) &&
+                               (!subsystem_device || (subsystem_device == dev->subsystem_device))) {
+                               if (align_order == -1)
+                                       align = PAGE_SIZE;
+                               else
+                                       align = 1 << align_order;
+                               /* Found */
                                break;
                        }
                }
-               p += count;
-               if (seg == pci_domain_nr(dev->bus) &&
-                       bus == dev->bus->number &&
-                       slot == PCI_SLOT(dev->devfn) &&
-                       func == PCI_FUNC(dev->devfn)) {
-                       if (align_order == -1)
-                               align = PAGE_SIZE;
-                       else
-                               align = 1 << align_order;
-                       /* Found */
-                       break;
+               else {
+                       if (sscanf(p, "%x:%x:%x.%x%n",
+                               &seg, &bus, &slot, &func, &count) != 4) {
+                               seg = 0;
+                               if (sscanf(p, "%x:%x.%x%n",
+                                               &bus, &slot, &func, &count) != 3) {
+                                       /* Invalid format */
+                                       printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n",
+                                               p);
+                                       break;
+                               }
+                       }
+                       p += count;
+                       if (seg == pci_domain_nr(dev->bus) &&
+                               bus == dev->bus->number &&
+                               slot == PCI_SLOT(dev->devfn) &&
+                               func == PCI_FUNC(dev->devfn)) {
+                               if (align_order == -1)
+                                       align = PAGE_SIZE;
+                               else
+                                       align = 1 << align_order;
+                               /* Found */
+                               break;
+                       }
                }
                if (*p != ';' && *p != ',') {
                        /* End of param or invalid format */
@@ -4897,7 +5123,7 @@ static ssize_t pci_resource_alignment_store(struct bus_type *bus,
        return pci_set_resource_alignment_param(buf, count);
 }
 
-BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show,
+static BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show,
                                        pci_resource_alignment_store);
 
 static int __init pci_resource_alignment_sysfs_init(void)
@@ -4923,7 +5149,7 @@ int pci_get_new_domain_nr(void)
 }
 
 #ifdef CONFIG_PCI_DOMAINS_GENERIC
-void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent)
+static int of_pci_bus_find_domain_nr(struct device *parent)
 {
        static int use_dt_domains = -1;
        int domain = -1;
@@ -4967,7 +5193,13 @@ void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent)
                domain = -1;
        }
 
-       bus->domain_nr = domain;
+       return domain;
+}
+
+int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent)
+{
+       return acpi_disabled ? of_pci_bus_find_domain_nr(parent) :
+                              acpi_pci_bus_find_domain_nr(bus);
 }
 #endif
 #endif
@@ -5021,6 +5253,11 @@ static int __init pci_setup(char *str)
                                pci_hotplug_io_size = memparse(str + 9, &str);
                        } else if (!strncmp(str, "hpmemsize=", 10)) {
                                pci_hotplug_mem_size = memparse(str + 10, &str);
+                       } else if (!strncmp(str, "hpbussize=", 10)) {
+                               pci_hotplug_bus_size =
+                                       simple_strtoul(str + 10, &str, 0);
+                               if (pci_hotplug_bus_size > 0xff)
+                                       pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE;
                        } else if (!strncmp(str, "pcie_bus_tune_off", 17)) {
                                pcie_bus_config = PCIE_BUS_TUNE_OFF;
                        } else if (!strncmp(str, "pcie_bus_safe", 13)) {
index a814bbb..9730c47 100644 (file)
@@ -82,6 +82,8 @@ void pci_pm_init(struct pci_dev *dev);
 void pci_ea_init(struct pci_dev *dev);
 void pci_allocate_cap_save_buffers(struct pci_dev *dev);
 void pci_free_cap_save_buffers(struct pci_dev *dev);
+void pci_bridge_d3_device_changed(struct pci_dev *dev);
+void pci_bridge_d3_device_removed(struct pci_dev *dev);
 
 static inline void pci_wakeup_event(struct pci_dev *dev)
 {
@@ -94,6 +96,15 @@ static inline bool pci_has_subordinate(struct pci_dev *pci_dev)
        return !!(pci_dev->subordinate);
 }
 
+static inline bool pci_power_manageable(struct pci_dev *pci_dev)
+{
+       /*
+        * Currently we allow normal PCI devices to transition into D3, and
+        * PCI bridges only if their bridge_d3 is set.
+        */
+       return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3;
+}
+
 struct pci_vpd_ops {
        ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
        ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
index 22ca641..7fcea75 100644 (file)
@@ -83,7 +83,7 @@ config PCIE_PME
        depends on PCIEPORTBUS && PM
 
 config PCIE_DPC
-       tristate "PCIe Downstream Port Containment support"
+       bool "PCIe Downstream Port Containment support"
        depends on PCIEPORTBUS
        default n
        help
@@ -92,6 +92,3 @@ config PCIE_DPC
          will be handled by the DPC driver.  If your system doesn't
          have this capability or you do not want to use this feature,
          it is safe to answer N.
-
-         To compile this driver as a module, choose M here: the module
-         will be called pcie-dpc.
index 2dfe7fd..0ec649d 100644 (file)
@@ -139,7 +139,7 @@ static void pcie_set_clkpm_nocheck(struct pcie_link_state *link, int enable)
 static void pcie_set_clkpm(struct pcie_link_state *link, int enable)
 {
        /* Don't enable Clock PM if the link is not Clock PM capable */
-       if (!link->clkpm_capable && enable)
+       if (!link->clkpm_capable)
                enable = 0;
        /* Need nothing if the specified equals to current state */
        if (link->clkpm_enabled == enable)
index ab552f1..250f878 100644 (file)
@@ -15,8 +15,8 @@
 
 struct dpc_dev {
        struct pcie_device      *dev;
-       struct work_struct      work;
-       int                     cap_pos;
+       struct work_struct      work;
+       int                     cap_pos;
 };
 
 static void dpc_wait_link_inactive(struct pci_dev *pdev)
@@ -89,7 +89,7 @@ static int dpc_probe(struct pcie_device *dev)
        int status;
        u16 ctl, cap;
 
-       dpc = kzalloc(sizeof(*dpc), GFP_KERNEL);
+       dpc = devm_kzalloc(&dev->device, sizeof(*dpc), GFP_KERNEL);
        if (!dpc)
                return -ENOMEM;
 
@@ -98,11 +98,12 @@ static int dpc_probe(struct pcie_device *dev)
        INIT_WORK(&dpc->work, interrupt_event_handler);
        set_service_data(dev, dpc);
 
-       status = request_irq(dev->irq, dpc_irq, IRQF_SHARED, "pcie-dpc", dpc);
+       status = devm_request_irq(&dev->device, dev->irq, dpc_irq, IRQF_SHARED,
+                                 "pcie-dpc", dpc);
        if (status) {
                dev_warn(&dev->device, "request IRQ%d failed: %d\n", dev->irq,
                         status);
-               goto out;
+               return status;
        }
 
        pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap);
@@ -117,9 +118,6 @@ static int dpc_probe(struct pcie_device *dev)
                FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), (cap >> 8) & 0xf,
                FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE));
        return status;
- out:
-       kfree(dpc);
-       return status;
 }
 
 static void dpc_remove(struct pcie_device *dev)
@@ -131,14 +129,11 @@ static void dpc_remove(struct pcie_device *dev)
        pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
        ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
        pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
-
-       free_irq(dev->irq, dpc);
-       kfree(dpc);
 }
 
 static struct pcie_port_service_driver dpcdriver = {
        .name           = "dpc",
-       .port_type      = PCI_EXP_TYPE_ROOT_PORT | PCI_EXP_TYPE_DOWNSTREAM,
+       .port_type      = PCIE_ANY_PORT,
        .service        = PCIE_PORT_SERVICE_DPC,
        .probe          = dpc_probe,
        .remove         = dpc_remove,
index 32d4d0a..e9270b4 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/pm.h>
+#include <linux/pm_runtime.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/pcieport_if.h>
@@ -342,6 +343,8 @@ static int pcie_device_init(struct pci_dev *pdev, int service, int irq)
                return retval;
        }
 
+       pm_runtime_no_callbacks(device);
+
        return 0;
 }
 
index be35da2..70d7ad8 100644 (file)
@@ -93,6 +93,26 @@ static int pcie_port_resume_noirq(struct device *dev)
        return 0;
 }
 
+static int pcie_port_runtime_suspend(struct device *dev)
+{
+       return to_pci_dev(dev)->bridge_d3 ? 0 : -EBUSY;
+}
+
+static int pcie_port_runtime_resume(struct device *dev)
+{
+       return 0;
+}
+
+static int pcie_port_runtime_idle(struct device *dev)
+{
+       /*
+        * Assume the PCI core has set bridge_d3 whenever it thinks the port
+        * should be good to go to D3.  Everything else, including moving
+        * the port to D3, is handled by the PCI core.
+        */
+       return to_pci_dev(dev)->bridge_d3 ? 0 : -EBUSY;
+}
+
 static const struct dev_pm_ops pcie_portdrv_pm_ops = {
        .suspend        = pcie_port_device_suspend,
        .resume         = pcie_port_device_resume,
@@ -101,6 +121,9 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
        .poweroff       = pcie_port_device_suspend,
        .restore        = pcie_port_device_resume,
        .resume_noirq   = pcie_port_resume_noirq,
+       .runtime_suspend = pcie_port_runtime_suspend,
+       .runtime_resume = pcie_port_runtime_resume,
+       .runtime_idle   = pcie_port_runtime_idle,
 };
 
 #define PCIE_PORTDRV_PM_OPS    (&pcie_portdrv_pm_ops)
@@ -134,16 +157,39 @@ static int pcie_portdrv_probe(struct pci_dev *dev,
                return status;
 
        pci_save_state(dev);
+
        /*
-        * D3cold may not work properly on some PCIe port, so disable
-        * it by default.
+        * Prevent runtime PM if the port is advertising support for PCIe
+        * hotplug.  Otherwise the BIOS hotplug SMI code might not be able
+        * to enumerate devices behind this port properly (the port is
+        * powered down preventing all config space accesses to the
+        * subordinate devices).  We can't be sure for native PCIe hotplug
+        * either so prevent that as well.
         */
-       dev->d3cold_allowed = false;
+       if (!dev->is_hotplug_bridge) {
+               /*
+                * Keep the port resumed 100ms to make sure things like
+                * config space accesses from userspace (lspci) will not
+                * cause the port to repeatedly suspend and resume.
+                */
+               pm_runtime_set_autosuspend_delay(&dev->dev, 100);
+               pm_runtime_use_autosuspend(&dev->dev);
+               pm_runtime_mark_last_busy(&dev->dev);
+               pm_runtime_put_autosuspend(&dev->dev);
+               pm_runtime_allow(&dev->dev);
+       }
+
        return 0;
 }
 
 static void pcie_portdrv_remove(struct pci_dev *dev)
 {
+       if (!dev->is_hotplug_bridge) {
+               pm_runtime_forbid(&dev->dev);
+               pm_runtime_get_noresume(&dev->dev);
+               pm_runtime_dont_use_autosuspend(&dev->dev);
+       }
+
        pcie_port_device_remove(dev);
 }
 
index 8e3ef72..93f280d 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/aer.h>
 #include <linux/acpi.h>
 #include <linux/irqdomain.h>
+#include <linux/pm_runtime.h>
 #include "pci.h"
 
 #define CARDBUS_LATENCY_TIMER  176     /* secondary latency timer */
@@ -832,6 +833,12 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass)
        u8 primary, secondary, subordinate;
        int broken = 0;
 
+       /*
+        * Make sure the bridge is powered on to be able to access config
+        * space of devices below it.
+        */
+       pm_runtime_get_sync(&dev->dev);
+
        pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses);
        primary = buses & 0xFF;
        secondary = (buses >> 8) & 0xFF;
@@ -1012,6 +1019,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass)
 out:
        pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
 
+       pm_runtime_put(&dev->dev);
+
        return max;
 }
 EXPORT_SYMBOL(pci_scan_bridge);
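
The same bracketing appears in the acpiphp change earlier in this series: any path that touches config space behind a possibly runtime-suspended port resumes it first and lets it suspend again afterwards.  A condensed sketch of the pattern (not taken from the patch; example_* name is hypothetical):

#include <linux/pci.h>
#include <linux/pm_runtime.h>

static void example_rescan_below(struct pci_dev *bridge)
{
	pci_lock_rescan_remove();
	pm_runtime_get_sync(&bridge->dev);	/* wake the port, it may be in D3 */
	if (bridge->subordinate)
		pci_rescan_bus(bridge->subordinate);
	pm_runtime_put(&bridge->dev);		/* let it suspend again */
	pci_unlock_rescan_remove();
}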
@@ -2076,6 +2085,15 @@ unsigned int pci_scan_child_bus(struct pci_bus *bus)
                                max = pci_scan_bridge(bus, dev, max, pass);
                }
 
+       /*
+        * Make sure a hotplug bridge has at least the minimum requested
+        * number of buses.
+        */
+       if (bus->self && bus->self->is_hotplug_bridge && pci_hotplug_bus_size) {
+               if (max - bus->busn_res.start < pci_hotplug_bus_size - 1)
+                       max = bus->busn_res.start + pci_hotplug_bus_size - 1;
+       }
+
        /*
         * We've scanned the bus and so we know all about what's on
         * the other side of any bridges that may be on this bus plus
@@ -2127,7 +2145,9 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
        b->sysdata = sysdata;
        b->ops = ops;
        b->number = b->busn_res.start = bus;
-       pci_bus_assign_domain_nr(b, parent);
+#ifdef CONFIG_PCI_DOMAINS_GENERIC
+       b->domain_nr = pci_bus_find_domain_nr(b, parent);
+#endif
        b2 = pci_find_bus(pci_domain_nr(b), bus);
        if (b2) {
                /* If we already got to this bus through a different bridge, ignore it */
index 3f155e7..2408abe 100644 (file)
@@ -231,7 +231,7 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct pci_dev *dev = PDE_DATA(file_inode(file));
        struct pci_filp_private *fpriv = file->private_data;
-       int i, ret;
+       int i, ret, write_combine;
 
        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;
@@ -245,9 +245,12 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma)
        if (i >= PCI_ROM_RESOURCE)
                return -ENODEV;
 
+       if (fpriv->mmap_state == pci_mmap_mem)
+               write_combine = fpriv->write_combine;
+       else
+               write_combine = 0;
        ret = pci_mmap_page_range(dev, vma,
-                                 fpriv->mmap_state,
-                                 fpriv->write_combine);
+                                 fpriv->mmap_state, write_combine);
        if (ret < 0)
                return ret;
 
index ee72ebe..37ff015 100644 (file)
@@ -3189,13 +3189,15 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
 }
 
 /*
- * Atheros AR93xx chips do not behave after a bus reset.  The device will
- * throw a Link Down error on AER-capable systems and regardless of AER,
- * config space of the device is never accessible again and typically
- * causes the system to hang or reset when access is attempted.
+ * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset.
+ * The device will throw a Link Down error on AER-capable systems and
+ * regardless of AER, config space of the device is never accessible again
+ * and typically causes the system to hang or reset when access is attempted.
  * http://www.spinics.net/lists/linux-pci/msg34797.html
  */
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0030, quirk_no_bus_reset);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0032, quirk_no_bus_reset);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset);
 
 static void quirk_no_pm_reset(struct pci_dev *dev)
 {
@@ -3711,6 +3713,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x9172,
 /* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c59 */
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x917a,
                         quirk_dma_func1_alias);
+/* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c78 */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x9182,
+                        quirk_dma_func1_alias);
 /* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c46 */
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x91a0,
                         quirk_dma_func1_alias);
@@ -3747,6 +3752,9 @@ static const struct pci_device_id fixed_dma_alias_tbl[] = {
        { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x0285,
                         PCI_VENDOR_ID_ADAPTEC2, 0x02bb), /* Adaptec 3405 */
          .driver_data = PCI_DEVFN(1, 0) },
+       { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x0285,
+                        PCI_VENDOR_ID_ADAPTEC2, 0x02bc), /* Adaptec 3805 */
+         .driver_data = PCI_DEVFN(1, 0) },
        { 0 }
 };
 
@@ -4087,6 +4095,7 @@ static const struct pci_dev_acs_enabled {
        { PCI_VENDOR_ID_AMD, 0x7809, pci_quirk_amd_sb_acs },
        { PCI_VENDOR_ID_SOLARFLARE, 0x0903, pci_quirk_mf_endpoint_acs },
        { PCI_VENDOR_ID_SOLARFLARE, 0x0923, pci_quirk_mf_endpoint_acs },
+       { PCI_VENDOR_ID_SOLARFLARE, 0x0A03, pci_quirk_mf_endpoint_acs },
        { PCI_VENDOR_ID_INTEL, 0x10C6, pci_quirk_mf_endpoint_acs },
        { PCI_VENDOR_ID_INTEL, 0x10DB, pci_quirk_mf_endpoint_acs },
        { PCI_VENDOR_ID_INTEL, 0x10DD, pci_quirk_mf_endpoint_acs },
index 8982026..d1ef7ac 100644 (file)
@@ -96,6 +96,8 @@ static void pci_remove_bus_device(struct pci_dev *dev)
                dev->subordinate = NULL;
        }
 
+       pci_bridge_d3_device_removed(dev);
+
        pci_destroy_dev(dev);
 }
 
index d678c46..c74059e 100644 (file)
@@ -1428,6 +1428,74 @@ void pci_bus_assign_resources(const struct pci_bus *bus)
 }
 EXPORT_SYMBOL(pci_bus_assign_resources);
 
+static void pci_claim_device_resources(struct pci_dev *dev)
+{
+       int i;
+
+       for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) {
+               struct resource *r = &dev->resource[i];
+
+               if (!r->flags || r->parent)
+                       continue;
+
+               pci_claim_resource(dev, i);
+       }
+}
+
+static void pci_claim_bridge_resources(struct pci_dev *dev)
+{
+       int i;
+
+       for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) {
+               struct resource *r = &dev->resource[i];
+
+               if (!r->flags || r->parent)
+                       continue;
+
+               pci_claim_bridge_resource(dev, i);
+       }
+}
+
+static void pci_bus_allocate_dev_resources(struct pci_bus *b)
+{
+       struct pci_dev *dev;
+       struct pci_bus *child;
+
+       list_for_each_entry(dev, &b->devices, bus_list) {
+               pci_claim_device_resources(dev);
+
+               child = dev->subordinate;
+               if (child)
+                       pci_bus_allocate_dev_resources(child);
+       }
+}
+
+static void pci_bus_allocate_resources(struct pci_bus *b)
+{
+       struct pci_bus *child;
+
+       /*
+        * Carry out a depth-first search on the PCI bus
+        * tree to allocate bridge apertures. Read the
+        * programmed bridge bases and recursively claim
+        * the respective bridge resources.
+        */
+       if (b->self) {
+               pci_read_bridge_bases(b);
+               pci_claim_bridge_resources(b->self);
+       }
+
+       list_for_each_entry(child, &b->children, node)
+               pci_bus_allocate_resources(child);
+}
+
+void pci_bus_claim_resources(struct pci_bus *b)
+{
+       pci_bus_allocate_resources(b);
+       pci_bus_allocate_dev_resources(b);
+}
+EXPORT_SYMBOL(pci_bus_claim_resources);
+
 static void __pci_bridge_assign_resources(const struct pci_dev *bridge,
                                          struct list_head *add_head,
                                          struct list_head *fail_head)
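
pci_bus_claim_resources() gives host bridge drivers a claim-only alternative to pci_bus_assign_resources() when firmware or a bootloader has already programmed the BARs and bridge windows.  A hedged sketch of how a caller would use it (hypothetical names; the hunk above only guarantees the helper itself):

#include <linux/pci.h>

static struct pci_ops example_ops;	/* hypothetical config accessors */

static int example_host_probe(struct device *dev, struct list_head *resources)
{
	struct pci_bus *bus;

	bus = pci_scan_root_bus(dev, 0, &example_ops, NULL, resources);
	if (!bus)
		return -ENOMEM;

	/* Claim the firmware-assigned resources instead of reassigning them. */
	pci_bus_claim_resources(bus);

	pci_bus_add_devices(bus);
	return 0;
}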
index 0ac520d..c71df0c 100644 (file)
@@ -46,7 +46,8 @@ struct read_info_sccb {
        u64     rnmax2;                 /* 104-111 */
        u8      _pad_112[116 - 112];    /* 112-115 */
        u8      fac116;                 /* 116 */
-       u8      _pad_117[119 - 117];    /* 117-118 */
+       u8      fac117;                 /* 117 */
+       u8      _pad_118;               /* 118 */
        u8      fac119;                 /* 119 */
        u16     hcpua;                  /* 120-121 */
        u8      _pad_122[124 - 122];    /* 122-123 */
@@ -114,7 +115,12 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
        sclp.facilities = sccb->facilities;
        sclp.has_sprp = !!(sccb->fac84 & 0x02);
        sclp.has_core_type = !!(sccb->fac84 & 0x01);
+       sclp.has_gsls = !!(sccb->fac85 & 0x80);
+       sclp.has_64bscao = !!(sccb->fac116 & 0x80);
+       sclp.has_cmma = !!(sccb->fac116 & 0x40);
        sclp.has_esca = !!(sccb->fac116 & 0x08);
+       sclp.has_pfmfi = !!(sccb->fac117 & 0x40);
+       sclp.has_ibs = !!(sccb->fac117 & 0x20);
        sclp.has_hvs = !!(sccb->fac119 & 0x80);
        if (sccb->fac85 & 0x02)
                S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP;
@@ -145,6 +151,10 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
                sclp.has_siif = cpue->siif;
                sclp.has_sigpif = cpue->sigpif;
                sclp.has_sief2 = cpue->sief2;
+               sclp.has_gpere = cpue->gpere;
+               sclp.has_ib = cpue->ib;
+               sclp.has_cei = cpue->cei;
+               sclp.has_skey = cpue->skey;
                break;
        }
 
index 2553db0..f59b717 100644 (file)
@@ -26,7 +26,7 @@
 #define OCF_LENGTH_CPC_NAME 8UL
 
 static char hmc_network[OCF_LENGTH_HMC_NETWORK + 1];
-static char cpc_name[OCF_LENGTH_CPC_NAME + 1];
+static char cpc_name[OCF_LENGTH_CPC_NAME]; /* in EBCDIC */
 
 static DEFINE_SPINLOCK(sclp_ocf_lock);
 static struct work_struct sclp_ocf_change_work;
@@ -72,9 +72,8 @@ static void sclp_ocf_handler(struct evbuf_header *evbuf)
        }
        if (cpc) {
                size = min(OCF_LENGTH_CPC_NAME, (size_t) cpc->length);
+               memset(cpc_name, 0, OCF_LENGTH_CPC_NAME);
                memcpy(cpc_name, cpc + 1, size);
-               EBCASC(cpc_name, size);
-               cpc_name[size] = 0;
        }
        spin_unlock(&sclp_ocf_lock);
        schedule_work(&sclp_ocf_change_work);
@@ -85,15 +84,23 @@ static struct sclp_register sclp_ocf_event = {
        .receiver_fn = sclp_ocf_handler,
 };
 
+void sclp_ocf_cpc_name_copy(char *dst)
+{
+       spin_lock_irq(&sclp_ocf_lock);
+       memcpy(dst, cpc_name, OCF_LENGTH_CPC_NAME);
+       spin_unlock_irq(&sclp_ocf_lock);
+}
+EXPORT_SYMBOL(sclp_ocf_cpc_name_copy);
+
 static ssize_t cpc_name_show(struct kobject *kobj,
                             struct kobj_attribute *attr, char *page)
 {
-       int rc;
+       char name[OCF_LENGTH_CPC_NAME + 1];
 
-       spin_lock_irq(&sclp_ocf_lock);
-       rc = snprintf(page, PAGE_SIZE, "%s\n", cpc_name);
-       spin_unlock_irq(&sclp_ocf_lock);
-       return rc;
+       sclp_ocf_cpc_name_copy(name);
+       name[OCF_LENGTH_CPC_NAME] = 0;
+       EBCASC(name, OCF_LENGTH_CPC_NAME);
+       return snprintf(page, PAGE_SIZE, "%s\n", name);
 }
 
 static struct kobj_attribute cpc_name_attr =
index adf61b4..734a042 100644 (file)
@@ -4854,20 +4854,17 @@ static int
 lpfc_enable_pci_dev(struct lpfc_hba *phba)
 {
        struct pci_dev *pdev;
-       int bars = 0;
 
        /* Obtain PCI device reference */
        if (!phba->pcidev)
                goto out_error;
        else
                pdev = phba->pcidev;
-       /* Select PCI BARs */
-       bars = pci_select_bars(pdev, IORESOURCE_MEM);
        /* Enable PCI device */
        if (pci_enable_device_mem(pdev))
                goto out_error;
        /* Request PCI resource for the device */
-       if (pci_request_selected_regions(pdev, bars, LPFC_DRIVER_NAME))
+       if (pci_request_mem_regions(pdev, LPFC_DRIVER_NAME))
                goto out_disable_device;
        /* Set up device as PCI master and save state for EEH */
        pci_set_master(pdev);
@@ -4884,7 +4881,7 @@ out_disable_device:
        pci_disable_device(pdev);
 out_error:
        lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
-                       "1401 Failed to enable pci device, bars:x%x\n", bars);
+                       "1401 Failed to enable pci device\n");
        return -ENODEV;
 }
 
@@ -4899,17 +4896,14 @@ static void
 lpfc_disable_pci_dev(struct lpfc_hba *phba)
 {
        struct pci_dev *pdev;
-       int bars;
 
        /* Obtain PCI device reference */
        if (!phba->pcidev)
                return;
        else
                pdev = phba->pcidev;
-       /* Select PCI BARs */
-       bars = pci_select_bars(pdev, IORESOURCE_MEM);
        /* Release PCI resource and disable PCI device */
-       pci_release_selected_regions(pdev, bars);
+       pci_release_mem_regions(pdev);
        pci_disable_device(pdev);
 
        return;
@@ -9811,7 +9805,6 @@ lpfc_pci_remove_one_s3(struct pci_dev *pdev)
        struct lpfc_vport **vports;
        struct lpfc_hba   *phba = vport->phba;
        int i;
-       int bars = pci_select_bars(pdev, IORESOURCE_MEM);
 
        spin_lock_irq(&phba->hbalock);
        vport->load_flag |= FC_UNLOADING;
@@ -9886,7 +9879,7 @@ lpfc_pci_remove_one_s3(struct pci_dev *pdev)
 
        lpfc_hba_free(phba);
 
-       pci_release_selected_regions(pdev, bars);
+       pci_release_mem_regions(pdev);
        pci_disable_device(pdev);
 }
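
pci_request_mem_regions() and pci_release_mem_regions() are thin convenience wrappers around pci_request_selected_regions()/pci_release_selected_regions() for all memory BARs, which is why the bars bookkeeping above can go away.  A minimal enable/disable sketch using the pair (hypothetical names, not from the lpfc patch):

#include <linux/pci.h>

static int example_enable_hw(struct pci_dev *pdev)
{
	int err;

	err = pci_enable_device_mem(pdev);
	if (err)
		return err;

	err = pci_request_mem_regions(pdev, "example");
	if (err)
		goto err_disable;

	pci_set_master(pdev);
	return 0;

err_disable:
	pci_disable_device(pdev);
	return err;
}

static void example_disable_hw(struct pci_dev *pdev)
{
	pci_release_mem_regions(pdev);
	pci_disable_device(pdev);
}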
 
index c10972f..4fd041b 100644 (file)
@@ -387,7 +387,7 @@ static int xhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
         * need to have the registers polled during D3, so avoid D3cold.
         */
        if (xhci->quirks & XHCI_COMP_MODE_QUIRK)
-               pdev->no_d3cold = true;
+               pci_d3cold_disable(pdev);
 
        if (xhci->quirks & XHCI_PME_STUCK_QUIRK)
                xhci_pme_quirk(hcd);
index 26a9d10..d5b6f95 100644 (file)
@@ -1730,7 +1730,8 @@ enum {
        POOL_WRITE      = 2,
 };
 
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+                               s64 pool, struct ceph_string *pool_ns)
 {
        struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
        struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1738,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
        struct rb_node **p, *parent;
        struct ceph_pool_perm *perm;
        struct page **pages;
+       size_t pool_ns_len;
        int err = 0, err2 = 0, have = 0;
 
        down_read(&mdsc->pool_perm_rwsem);
@@ -1749,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
                else if (pool > perm->pool)
                        p = &(*p)->rb_right;
                else {
-                       have = perm->perm;
-                       break;
+                       int ret = ceph_compare_string(pool_ns,
+                                               perm->pool_ns,
+                                               perm->pool_ns_len);
+                       if (ret < 0)
+                               p = &(*p)->rb_left;
+                       else if (ret > 0)
+                               p = &(*p)->rb_right;
+                       else {
+                               have = perm->perm;
+                               break;
+                       }
                }
        }
        up_read(&mdsc->pool_perm_rwsem);
        if (*p)
                goto out;
 
-       dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+       if (pool_ns)
+               dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+                    pool, (int)pool_ns->len, pool_ns->str);
+       else
+               dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
 
        down_write(&mdsc->pool_perm_rwsem);
+       p = &mdsc->pool_perm_tree.rb_node;
        parent = NULL;
        while (*p) {
                parent = *p;
@@ -1769,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
                else if (pool > perm->pool)
                        p = &(*p)->rb_right;
                else {
-                       have = perm->perm;
-                       break;
+                       int ret = ceph_compare_string(pool_ns,
+                                               perm->pool_ns,
+                                               perm->pool_ns_len);
+                       if (ret < 0)
+                               p = &(*p)->rb_left;
+                       else if (ret > 0)
+                               p = &(*p)->rb_right;
+                       else {
+                               have = perm->perm;
+                               break;
+                       }
                }
        }
        if (*p) {
@@ -1788,6 +1813,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
        rd_req->r_flags = CEPH_OSD_FLAG_READ;
        osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
        rd_req->r_base_oloc.pool = pool;
+       if (pool_ns)
+               rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
        ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
 
        err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
@@ -1841,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
                goto out_unlock;
        }
 
-       perm = kmalloc(sizeof(*perm), GFP_NOFS);
+       pool_ns_len = pool_ns ? pool_ns->len : 0;
+       perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
        if (!perm) {
                err = -ENOMEM;
                goto out_unlock;
@@ -1849,6 +1877,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
        perm->pool = pool;
        perm->perm = have;
+       perm->pool_ns_len = pool_ns_len;
+       if (pool_ns_len > 0)
+               memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+       perm->pool_ns[pool_ns_len] = 0;
+
        rb_link_node(&perm->node, parent, p);
        rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
        err = 0;
@@ -1860,43 +1893,46 @@ out_unlock:
 out:
        if (!err)
                err = have;
-       dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+       if (pool_ns)
+               dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+                    pool, (int)pool_ns->len, pool_ns->str, err);
+       else
+               dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
        return err;
 }
 
 int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
 {
-       u32 pool;
+       s64 pool;
+       struct ceph_string *pool_ns;
        int ret, flags;
 
-       /* does not support pool namespace yet */
-       if (ci->i_pool_ns_len)
-               return -EIO;
-
        if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
                                NOPOOLPERM))
                return 0;
 
        spin_lock(&ci->i_ceph_lock);
        flags = ci->i_ceph_flags;
-       pool = ceph_file_layout_pg_pool(ci->i_layout);
+       pool = ci->i_layout.pool_id;
        spin_unlock(&ci->i_ceph_lock);
 check:
        if (flags & CEPH_I_POOL_PERM) {
                if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
-                       dout("ceph_pool_perm_check pool %u no read perm\n",
+                       dout("ceph_pool_perm_check pool %lld no read perm\n",
                             pool);
                        return -EPERM;
                }
                if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
-                       dout("ceph_pool_perm_check pool %u no write perm\n",
+                       dout("ceph_pool_perm_check pool %lld no write perm\n",
                             pool);
                        return -EPERM;
                }
                return 0;
        }
 
-       ret = __ceph_pool_perm_get(ci, pool);
+       pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+       ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+       ceph_put_string(pool_ns);
        if (ret < 0)
                return ret;
 
@@ -1907,10 +1943,11 @@ check:
                flags |= CEPH_I_POOL_WR;
 
        spin_lock(&ci->i_ceph_lock);
-       if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
-               ci->i_ceph_flags = flags;
+       if (pool == ci->i_layout.pool_id &&
+           pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+               ci->i_ceph_flags |= flags;
         } else {
-               pool = ceph_file_layout_pg_pool(ci->i_layout);
+               pool = ci->i_layout.pool_id;
                flags = ci->i_ceph_flags;
        }
        spin_unlock(&ci->i_ceph_lock);
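
With pool namespaces in play, the permission cache above is keyed on the pair (pool id, pool namespace): the rb-tree walk compares the id first and falls back to a string comparison only on a tie. A standalone C sketch of that compound comparison, with a simplified entry type standing in for struct ceph_pool_perm and "" standing in for the default namespace:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct perm_entry {
	int64_t pool;		/* primary key   */
	const char *pool_ns;	/* secondary key */
};

/* <0: key sorts left of the entry, >0: right, 0: match; the same ordering
 * the rb-tree walk in __ceph_pool_perm_get() relies on */
static int perm_key_cmp(int64_t pool, const char *ns,
			const struct perm_entry *e)
{
	if (pool < e->pool)
		return -1;
	if (pool > e->pool)
		return 1;
	return strcmp(ns, e->pool_ns);
}

int main(void)
{
	struct perm_entry e = { .pool = 3, .pool_ns = "ns1" };

	printf("%d\n", perm_key_cmp(3, "ns1", &e));	/* 0: cache hit */
	printf("%d\n", perm_key_cmp(3, "ns0", &e));	/* <0: go left  */
	printf("%d\n", perm_key_cmp(4, "ns1", &e));	/* >0: go right */
	return 0;
}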
index 238c55b..5bc5d37 100644 (file)
@@ -71,7 +71,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
                                              &ceph_fscache_fsid_object_def,
                                              fsc, true);
        if (!fsc->fscache)
-               pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
+               pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
 
        return 0;
 }
index 6f60d0a..99115ca 100644 (file)
  * cluster to release server state.
  */
 
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+                                struct ceph_mds_session *session,
+                                struct ceph_inode_info *ci,
+                                u64 oldest_flush_tid);
 
 /*
  * Generate readable cap strings for debugging output.
@@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
  */
 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 {
-       int want = 0;
-       int mode;
-       for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
-               if (ci->i_nr_by_mode[mode])
-                       want |= ceph_caps_for_mode(mode);
-       return want;
+       int i, bits = 0;
+       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+               if (ci->i_nr_by_mode[i])
+                       bits |= 1 << i;
+       }
+       if (bits == 0)
+               return 0;
+       return ceph_caps_for_mode(bits >> 1);
 }
 
 /*
@@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
                        u32 seq, u64 flush_tid, u64 oldest_flush_tid,
                        u32 issue_seq, u32 mseq, u64 size, u64 max_size,
                        struct timespec *mtime, struct timespec *atime,
-                       struct timespec *ctime, u64 time_warp_seq,
+                       struct timespec *ctime, u32 time_warp_seq,
                        kuid_t uid, kgid_t gid, umode_t mode,
                        u64 xattr_version,
                        struct ceph_buffer *xattrs_buf,
@@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        struct inode *inode = &ci->vfs_inode;
        u64 cap_id = cap->cap_id;
        int held, revoking, dropping, keep;
-       u64 seq, issue_seq, mseq, time_warp_seq, follows;
-       u64 size, max_size;
+       u64 follows, size, max_size;
+       u32 seq, issue_seq, mseq, time_warp_seq;
        struct timespec mtime, atime, ctime;
        int wake = 0;
        umode_t mode;
@@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        return delayed;
 }
 
+static inline int __send_flush_snap(struct inode *inode,
+                                   struct ceph_mds_session *session,
+                                   struct ceph_cap_snap *capsnap,
+                                   u32 mseq, u64 oldest_flush_tid)
+{
+       return send_cap_msg(session, ceph_vino(inode).ino, 0,
+                       CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+                       capsnap->dirty, 0, capsnap->cap_flush.tid,
+                       oldest_flush_tid, 0, mseq, capsnap->size, 0,
+                       &capsnap->mtime, &capsnap->atime,
+                       &capsnap->ctime, capsnap->time_warp_seq,
+                       capsnap->uid, capsnap->gid, capsnap->mode,
+                       capsnap->xattr_version, capsnap->xattr_blob,
+                       capsnap->follows, capsnap->inline_data);
+}
+
 /*
  * When a snapshot is taken, clients accumulate dirty metadata on
  * inodes with capabilities in ceph_cap_snaps to describe the file
@@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
- * Unless @kick is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
  * Called under i_ceph_lock.  Takes s_mutex as needed.
  */
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
-                       struct ceph_mds_session **psession,
-                       int kick)
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+                              struct ceph_mds_session *session)
                __releases(ci->i_ceph_lock)
                __acquires(ci->i_ceph_lock)
 {
        struct inode *inode = &ci->vfs_inode;
-       int mds;
+       struct ceph_mds_client *mdsc = session->s_mdsc;
        struct ceph_cap_snap *capsnap;
-       u32 mseq;
-       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-       struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
-                                                   session->s_mutex */
-       u64 next_follows = 0;  /* keep track of how far we've gotten through the
-                            i_cap_snaps list, and skip these entries next time
-                            around to avoid an infinite loop */
+       u64 oldest_flush_tid = 0;
+       u64 first_tid = 1, last_tid = 0;
 
-       if (psession)
-               session = *psession;
+       dout("__flush_snaps %p session %p\n", inode, session);
 
-       dout("__flush_snaps %p\n", inode);
-retry:
        list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
-               /* avoid an infiniute loop after retry */
-               if (capsnap->follows < next_follows)
-                       continue;
                /*
                 * we need to wait for sync writes to complete and for dirty
                 * pages to be written out.
@@ -1263,97 +1271,129 @@ retry:
                /* should be removed by ceph_try_drop_cap_snap() */
                BUG_ON(!capsnap->need_flush);
 
-               /* pick mds, take s_mutex */
-               if (ci->i_auth_cap == NULL) {
-                       dout("no auth cap (migrating?), doing nothing\n");
-                       goto out;
-               }
-
                /* only flush each capsnap once */
-               if (!kick && !list_empty(&capsnap->flushing_item)) {
-                       dout("already flushed %p, skipping\n", capsnap);
+               if (capsnap->cap_flush.tid > 0) {
+                       dout(" already flushed %p, skipping\n", capsnap);
                        continue;
                }
 
-               mds = ci->i_auth_cap->session->s_mds;
-               mseq = ci->i_auth_cap->mseq;
+               spin_lock(&mdsc->cap_dirty_lock);
+               capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+               list_add_tail(&capsnap->cap_flush.g_list,
+                             &mdsc->cap_flush_list);
+               if (oldest_flush_tid == 0)
+                       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+               if (list_empty(&ci->i_flushing_item)) {
+                       list_add_tail(&ci->i_flushing_item,
+                                     &session->s_cap_flushing);
+               }
+               spin_unlock(&mdsc->cap_dirty_lock);
+
+               list_add_tail(&capsnap->cap_flush.i_list,
+                             &ci->i_cap_flush_list);
 
-               if (session && session->s_mds != mds) {
-                       dout("oops, wrong session %p mutex\n", session);
-                       if (kick)
-                               goto out;
+               if (first_tid == 1)
+                       first_tid = capsnap->cap_flush.tid;
+               last_tid = capsnap->cap_flush.tid;
+       }
 
-                       mutex_unlock(&session->s_mutex);
-                       ceph_put_mds_session(session);
-                       session = NULL;
+       ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+       while (first_tid <= last_tid) {
+               struct ceph_cap *cap = ci->i_auth_cap;
+               struct ceph_cap_flush *cf;
+               int ret;
+
+               if (!(cap && cap->session == session)) {
+                       dout("__flush_snaps %p auth cap %p not mds%d, "
+                            "stop\n", inode, cap, session->s_mds);
+                       break;
                }
-               if (!session) {
-                       spin_unlock(&ci->i_ceph_lock);
-                       mutex_lock(&mdsc->mutex);
-                       session = __ceph_lookup_mds_session(mdsc, mds);
-                       mutex_unlock(&mdsc->mutex);
-                       if (session) {
-                               dout("inverting session/ino locks on %p\n",
-                                    session);
-                               mutex_lock(&session->s_mutex);
+
+               ret = -ENOENT;
+               list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+                       if (cf->tid >= first_tid) {
+                               ret = 0;
+                               break;
                        }
-                       /*
-                        * if session == NULL, we raced against a cap
-                        * deletion or migration.  retry, and we'll
-                        * get a better @mds value next time.
-                        */
-                       spin_lock(&ci->i_ceph_lock);
-                       goto retry;
                }
+               if (ret < 0)
+                       break;
 
-               spin_lock(&mdsc->cap_dirty_lock);
-               capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
-               spin_unlock(&mdsc->cap_dirty_lock);
+               first_tid = cf->tid + 1;
 
+               capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
                atomic_inc(&capsnap->nref);
-               if (list_empty(&capsnap->flushing_item))
-                       list_add_tail(&capsnap->flushing_item,
-                                     &session->s_cap_snaps_flushing);
                spin_unlock(&ci->i_ceph_lock);
 
-               dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
-                    inode, capsnap, capsnap->follows, capsnap->flush_tid);
-               send_cap_msg(session, ceph_vino(inode).ino, 0,
-                            CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
-                            capsnap->dirty, 0, capsnap->flush_tid, 0,
-                            0, mseq, capsnap->size, 0,
-                            &capsnap->mtime, &capsnap->atime,
-                            &capsnap->ctime, capsnap->time_warp_seq,
-                            capsnap->uid, capsnap->gid, capsnap->mode,
-                            capsnap->xattr_version, capsnap->xattr_blob,
-                            capsnap->follows, capsnap->inline_data);
-
-               next_follows = capsnap->follows + 1;
-               ceph_put_cap_snap(capsnap);
+               dout("__flush_snaps %p capsnap %p tid %llu %s\n",
+                    inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+
+               ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+                                       oldest_flush_tid);
+               if (ret < 0) {
+                       pr_err("__flush_snaps: error sending cap flushsnap, "
+                              "ino (%llx.%llx) tid %llu follows %llu\n",
+                               ceph_vinop(inode), cf->tid, capsnap->follows);
+               }
 
+               ceph_put_cap_snap(capsnap);
                spin_lock(&ci->i_ceph_lock);
-               goto retry;
        }
+}
 
-       /* we flushed them all; remove this inode from the queue */
-       spin_lock(&mdsc->snap_flush_lock);
-       list_del_init(&ci->i_snap_flush_item);
-       spin_unlock(&mdsc->snap_flush_lock);
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+                     struct ceph_mds_session **psession)
+{
+       struct inode *inode = &ci->vfs_inode;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_session *session = *psession;
+       int mds;
+       dout("ceph_flush_snaps %p\n", inode);
+retry:
+       spin_lock(&ci->i_ceph_lock);
+       if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+               dout(" no capsnap needs flush, doing nothing\n");
+               goto out;
+       }
+       if (!ci->i_auth_cap) {
+               dout(" no auth cap (migrating?), doing nothing\n");
+               goto out;
+       }
 
-out:
-       if (psession)
-               *psession = session;
-       else if (session) {
+       mds = ci->i_auth_cap->session->s_mds;
+       if (session && session->s_mds != mds) {
+               dout(" oops, wrong session %p mutex\n", session);
                mutex_unlock(&session->s_mutex);
                ceph_put_mds_session(session);
+               session = NULL;
+       }
+       if (!session) {
+               spin_unlock(&ci->i_ceph_lock);
+               mutex_lock(&mdsc->mutex);
+               session = __ceph_lookup_mds_session(mdsc, mds);
+               mutex_unlock(&mdsc->mutex);
+               if (session) {
+                       dout(" inverting session/ino locks on %p\n", session);
+                       mutex_lock(&session->s_mutex);
+               }
+               goto retry;
        }
-}
 
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
-       spin_lock(&ci->i_ceph_lock);
-       __ceph_flush_snaps(ci, NULL, 0);
+       __ceph_flush_snaps(ci, session);
+out:
        spin_unlock(&ci->i_ceph_lock);
+
+       if (psession) {
+               *psession = session;
+       } else {
+               mutex_unlock(&session->s_mutex);
+               ceph_put_mds_session(session);
+       }
+       /* we flushed them all; remove this inode from the queue */
+       spin_lock(&mdsc->snap_flush_lock);
+       list_del_init(&ci->i_snap_flush_item);
+       spin_unlock(&mdsc->snap_flush_lock);
 }
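
The reworked __ceph_flush_snaps() above first stamps each unflushed capsnap with a flush tid under cap_dirty_lock (a tid of 0 means "not queued yet"), then walks the flush list by tid so it can drop i_ceph_lock around every message send and resume where it left off. A standalone sketch of that resume-by-key loop, with a plain array standing in for the list that may change while the lock is dropped:

#include <stdint.h>
#include <stdio.h>

struct flush_entry {
	uint64_t tid;
};

static struct flush_entry entries[] = { {5}, {7}, {9} };
static const int nr_entries = 3;

int main(void)
{
	uint64_t first_tid = 1, last_tid = 9;

	while (first_tid <= last_tid) {
		struct flush_entry *found = NULL;
		int i;

		/* re-scan from the start: entries may have come or gone
		 * while the "lock" was dropped */
		for (i = 0; i < nr_entries; i++) {
			if (entries[i].tid >= first_tid) {
				found = &entries[i];
				break;
			}
		}
		if (!found)
			break;

		first_tid = found->tid + 1;	/* resume point for the next pass */
		/* ...drop the lock, send the flush for found->tid, retake it... */
		printf("flush tid %llu\n", (unsigned long long)found->tid);
	}
	return 0;
}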
 
 /*
@@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
        return dirty;
 }
 
-static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
-                                       struct ceph_cap_flush *cf)
-{
-       struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_cap_flush *other = NULL;
-
-       while (*p) {
-               parent = *p;
-               other = rb_entry(parent, struct ceph_cap_flush, i_node);
-
-               if (cf->tid < other->tid)
-                       p = &(*p)->rb_left;
-               else if (cf->tid > other->tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&cf->i_node, parent, p);
-       rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
-}
-
-static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
-                                      struct ceph_cap_flush *cf)
-{
-       struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_cap_flush *other = NULL;
-
-       while (*p) {
-               parent = *p;
-               other = rb_entry(parent, struct ceph_cap_flush, g_node);
-
-               if (cf->tid < other->tid)
-                       p = &(*p)->rb_left;
-               else if (cf->tid > other->tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&cf->g_node, parent, p);
-       rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
-}
-
 struct ceph_cap_flush *ceph_alloc_cap_flush(void)
 {
        return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
@@ -1470,15 +1464,46 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf)
 
 static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
 {
-       struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
-       if (n) {
+       if (!list_empty(&mdsc->cap_flush_list)) {
                struct ceph_cap_flush *cf =
-                       rb_entry(n, struct ceph_cap_flush, g_node);
+                       list_first_entry(&mdsc->cap_flush_list,
+                                        struct ceph_cap_flush, g_list);
                return cf->tid;
        }
        return 0;
 }
 
+/*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
+                              struct ceph_inode_info *ci,
+                              struct ceph_cap_flush *cf)
+{
+       struct ceph_cap_flush *prev;
+       bool wake = cf->wake;
+       if (mdsc) {
+               /* are there older pending cap flushes? */
+               if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+                       prev = list_prev_entry(cf, g_list);
+                       prev->wake = true;
+                       wake = false;
+               }
+               list_del(&cf->g_list);
+       } else if (ci) {
+               if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+                       prev = list_prev_entry(cf, i_list);
+                       prev->wake = true;
+                       wake = false;
+               }
+               list_del(&cf->i_list);
+       } else {
+               BUG_ON(1);
+       }
+       return wake;
+}
+
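
Flush tids are handed out in increasing order under cap_dirty_lock, so appending to the tail of a plain list keeps it sorted and rb_first() on the old trees becomes list_first_entry() above. The subtle part is the wake flag in __finish_cap_flush(): when the completed entry has a waiter but older flushes are still pending, the flag is passed back to the previous entry rather than waking anyone early. A standalone sketch of that hand-off, assuming a simplified entry type that is marked done instead of being unlinked:

#include <stdbool.h>
#include <stdio.h>

struct flush_entry {
	unsigned long long tid;
	bool wake;	/* someone waits for this tid to finish */
	bool done;
};

/* complete entry idx; return true if its waiter may be woken now */
static bool finish_flush(struct flush_entry *list, int idx)
{
	bool wake = list[idx].wake;
	int prev;

	/* closest older entry that is still pending, if any */
	for (prev = idx - 1; prev >= 0; prev--)
		if (!list[prev].done)
			break;

	if (wake && prev >= 0) {
		/* older flushes outstanding: hand the wake-up to them */
		list[prev].wake = true;
		wake = false;
	}
	list[idx].done = true;
	return wake;
}

int main(void)
{
	struct flush_entry list[] = { {10, false, false}, {11, true, false} };

	/* tid 11 completes first, but tid 10 is still pending: no wake yet */
	printf("wake after 11: %d\n", finish_flush(list, 1));
	/* tid 10 completes and has inherited the flag: wake now */
	printf("wake after 10: %d\n", finish_flush(list, 0));
	return 0;
}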
 /*
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
@@ -1486,7 +1511,7 @@ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
  * Called under i_ceph_lock.
  */
 static int __mark_caps_flushing(struct inode *inode,
-                               struct ceph_mds_session *session,
+                               struct ceph_mds_session *session, bool wake,
                                u64 *flush_tid, u64 *oldest_flush_tid)
 {
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode,
 
        swap(cf, ci->i_prealloc_cap_flush);
        cf->caps = flushing;
+       cf->wake = wake;
 
        spin_lock(&mdsc->cap_dirty_lock);
        list_del_init(&ci->i_dirty_item);
 
        cf->tid = ++mdsc->last_cap_flush_tid;
-       __add_cap_flushing_to_mdsc(mdsc, cf);
+       list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
        *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
 
        if (list_empty(&ci->i_flushing_item)) {
                list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
                mdsc->num_cap_flushing++;
-               dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
-       } else {
-               list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
-               dout(" inode %p now flushing (more) tid %llu\n",
-                    inode, cf->tid);
        }
        spin_unlock(&mdsc->cap_dirty_lock);
 
-       __add_cap_flushing_to_inode(ci, cf);
+       list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
 
        *flush_tid = cf->tid;
        return flushing;
@@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
        int mds = -1;   /* keep track of how far we've gone through i_caps list
                           to avoid an infinite loop on retry */
        struct rb_node *p;
-       int tried_invalidate = 0;
-       int delayed = 0, sent = 0, force_requeue = 0, num;
-       int queue_invalidate = 0;
-       int is_delayed = flags & CHECK_CAPS_NODELAY;
+       int delayed = 0, sent = 0, num;
+       bool is_delayed = flags & CHECK_CAPS_NODELAY;
+       bool queue_invalidate = false;
+       bool force_requeue = false;
+       bool tried_invalidate = false;
 
        /* if we are unmounting, flush any unused caps immediately. */
        if (mdsc->stopping)
@@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
        if (ci->i_ceph_flags & CEPH_I_FLUSH)
                flags |= CHECK_CAPS_FLUSH;
 
-       /* flush snaps first time around only */
-       if (!list_empty(&ci->i_cap_snaps))
-               __ceph_flush_snaps(ci, &session, 0);
        goto retry_locked;
 retry:
        spin_lock(&ci->i_ceph_lock);
@@ -1666,17 +1685,17 @@ retry_locked:
                        if (revoking & (CEPH_CAP_FILE_CACHE|
                                        CEPH_CAP_FILE_LAZYIO)) {
                                dout("check_caps queuing invalidate\n");
-                               queue_invalidate = 1;
+                               queue_invalidate = true;
                                ci->i_rdcache_revoking = ci->i_rdcache_gen;
                        } else {
                                dout("check_caps failed to invalidate pages\n");
                                /* we failed to invalidate pages.  check these
                                   caps again later. */
-                               force_requeue = 1;
+                               force_requeue = true;
                                __cap_set_timeouts(mdsc, ci);
                        }
                }
-               tried_invalidate = 1;
+               tried_invalidate = true;
                goto retry_locked;
        }
 
@@ -1720,10 +1739,15 @@ retry_locked:
                        }
                }
                /* flush anything dirty? */
-               if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
-                   ci->i_dirty_caps) {
-                       dout("flushing dirty caps\n");
-                       goto ack;
+               if (cap == ci->i_auth_cap) {
+                       if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+                               dout("flushing dirty caps\n");
+                               goto ack;
+                       }
+                       if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+                               dout("flushing snap caps\n");
+                               goto ack;
+                       }
                }
 
                /* completed revocation? going down and there are no caps? */
@@ -1782,6 +1806,26 @@ ack:
                                goto retry;
                        }
                }
+
+               /* kick flushing and flush snaps before sending normal
+                * cap message */
+               if (cap == ci->i_auth_cap &&
+                   (ci->i_ceph_flags &
+                    (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+                       if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+                               spin_lock(&mdsc->cap_dirty_lock);
+                               oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+                               spin_unlock(&mdsc->cap_dirty_lock);
+                               __kick_flushing_caps(mdsc, session, ci,
+                                                    oldest_flush_tid);
+                               ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+                       }
+                       if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+                               __ceph_flush_snaps(ci, session);
+
+                       goto retry_locked;
+               }
+
                /* take snap_rwsem after session mutex */
                if (!took_snap_rwsem) {
                        if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
@@ -1796,7 +1840,7 @@ ack:
                }
 
                if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
-                       flushing = __mark_caps_flushing(inode, session,
+                       flushing = __mark_caps_flushing(inode, session, false,
                                                        &flush_tid,
                                                        &oldest_flush_tid);
                } else {
@@ -1822,7 +1866,7 @@ ack:
         * otherwise cancel.
         */
        if (delayed && is_delayed)
-               force_requeue = 1;   /* __send_cap delayed release; requeue */
+               force_requeue = true;   /* __send_cap delayed release; requeue */
        if (!delayed && !is_delayed)
                __cap_delay_cancel(mdsc, ci);
        else if (!is_delayed || force_requeue)
@@ -1873,8 +1917,8 @@ retry:
                if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
                        goto out;
 
-               flushing = __mark_caps_flushing(inode, session, &flush_tid,
-                                               &oldest_flush_tid);
+               flushing = __mark_caps_flushing(inode, session, true,
+                                               &flush_tid, &oldest_flush_tid);
 
                /* __send_cap drops i_ceph_lock */
                delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
@@ -1887,10 +1931,11 @@ retry:
                        spin_unlock(&ci->i_ceph_lock);
                }
        } else {
-               struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
-               if (n) {
+               if (!list_empty(&ci->i_cap_flush_list)) {
                        struct ceph_cap_flush *cf =
-                               rb_entry(n, struct ceph_cap_flush, i_node);
+                               list_last_entry(&ci->i_cap_flush_list,
+                                               struct ceph_cap_flush, i_list);
+                       cf->wake = true;
                        flush_tid = cf->tid;
                }
                flushing = ci->i_flushing_caps;
@@ -1910,14 +1955,13 @@ out:
 static int caps_are_flushed(struct inode *inode, u64 flush_tid)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_cap_flush *cf;
-       struct rb_node *n;
        int ret = 1;
 
        spin_lock(&ci->i_ceph_lock);
-       n = rb_first(&ci->i_cap_flush_tree);
-       if (n) {
-               cf = rb_entry(n, struct ceph_cap_flush, i_node);
+       if (!list_empty(&ci->i_cap_flush_list)) {
+               struct ceph_cap_flush *cf =
+                       list_first_entry(&ci->i_cap_flush_list,
+                                        struct ceph_cap_flush, i_list);
                if (cf->tid <= flush_tid)
                        ret = 0;
        }
@@ -1925,53 +1969,6 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
        return ret;
 }
 
-/*
- * Wait on any unsafe replies for the given inode.  First wait on the
- * newest request, and make that the upper bound.  Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-static void sync_write_wait(struct inode *inode)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct list_head *head = &ci->i_unsafe_writes;
-       struct ceph_osd_request *req;
-       u64 last_tid;
-
-       if (!S_ISREG(inode->i_mode))
-               return;
-
-       spin_lock(&ci->i_unsafe_lock);
-       if (list_empty(head))
-               goto out;
-
-       /* set upper bound as _last_ entry in chain */
-       req = list_last_entry(head, struct ceph_osd_request,
-                             r_unsafe_item);
-       last_tid = req->r_tid;
-
-       do {
-               ceph_osdc_get_request(req);
-               spin_unlock(&ci->i_unsafe_lock);
-               dout("sync_write_wait on tid %llu (until %llu)\n",
-                    req->r_tid, last_tid);
-               wait_for_completion(&req->r_safe_completion);
-               spin_lock(&ci->i_unsafe_lock);
-               ceph_osdc_put_request(req);
-
-               /*
-                * from here on look at first entry in chain, since we
-                * only want to wait for anything older than last_tid
-                */
-               if (list_empty(head))
-                       break;
-               req = list_first_entry(head, struct ceph_osd_request,
-                                      r_unsafe_item);
-       } while (req->r_tid < last_tid);
-out:
-       spin_unlock(&ci->i_unsafe_lock);
-}
-
 /*
  * wait for any unsafe requests to complete.
  */
@@ -2024,7 +2021,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        int dirty;
 
        dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
-       sync_write_wait(inode);
+
+       ceph_sync_write_wait(inode);
 
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret < 0)
@@ -2087,87 +2085,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
        return err;
 }
 
-/*
- * After a recovering MDS goes active, we need to resend any caps
- * we were flushing.
- *
- * Caller holds session->s_mutex.
- */
-static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
-                                  struct ceph_mds_session *session)
-{
-       struct ceph_cap_snap *capsnap;
-
-       dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
-       list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
-                           flushing_item) {
-               struct ceph_inode_info *ci = capsnap->ci;
-               struct inode *inode = &ci->vfs_inode;
-               struct ceph_cap *cap;
-
-               spin_lock(&ci->i_ceph_lock);
-               cap = ci->i_auth_cap;
-               if (cap && cap->session == session) {
-                       dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
-                            cap, capsnap);
-                       __ceph_flush_snaps(ci, &session, 1);
-               } else {
-                       pr_err("%p auth cap %p not mds%d ???\n", inode,
-                              cap, session->s_mds);
-               }
-               spin_unlock(&ci->i_ceph_lock);
-       }
-}
-
-static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
-                               struct ceph_mds_session *session,
-                               struct ceph_inode_info *ci)
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+                                struct ceph_mds_session *session,
+                                struct ceph_inode_info *ci,
+                                u64 oldest_flush_tid)
+       __releases(ci->i_ceph_lock)
+       __acquires(ci->i_ceph_lock)
 {
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap *cap;
        struct ceph_cap_flush *cf;
-       struct rb_node *n;
-       int delayed = 0;
+       int ret;
        u64 first_tid = 0;
-       u64 oldest_flush_tid;
 
-       spin_lock(&mdsc->cap_dirty_lock);
-       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
-       spin_unlock(&mdsc->cap_dirty_lock);
+       list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+               if (cf->tid < first_tid)
+                       continue;
 
-       while (true) {
-               spin_lock(&ci->i_ceph_lock);
                cap = ci->i_auth_cap;
                if (!(cap && cap->session == session)) {
-                       pr_err("%p auth cap %p not mds%d ???\n", inode,
-                                       cap, session->s_mds);
-                       spin_unlock(&ci->i_ceph_lock);
+                       pr_err("%p auth cap %p not mds%d ???\n",
+                              inode, cap, session->s_mds);
                        break;
                }
 
-               for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
-                       cf = rb_entry(n, struct ceph_cap_flush, i_node);
-                       if (cf->tid >= first_tid)
-                               break;
-               }
-               if (!n) {
+               first_tid = cf->tid + 1;
+
+               if (cf->caps) {
+                       dout("kick_flushing_caps %p cap %p tid %llu %s\n",
+                            inode, cap, cf->tid, ceph_cap_string(cf->caps));
+                       ci->i_ceph_flags |= CEPH_I_NODELAY;
+                       ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+                                         __ceph_caps_used(ci),
+                                         __ceph_caps_wanted(ci),
+                                         cap->issued | cap->implemented,
+                                         cf->caps, cf->tid, oldest_flush_tid);
+                       if (ret) {
+                               pr_err("kick_flushing_caps: error sending "
+                                       "cap flush, ino (%llx.%llx) "
+                                       "tid %llu flushing %s\n",
+                                       ceph_vinop(inode), cf->tid,
+                                       ceph_cap_string(cf->caps));
+                       }
+               } else {
+                       struct ceph_cap_snap *capsnap =
+                                       container_of(cf, struct ceph_cap_snap,
+                                                   cap_flush);
+                       dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
+                            inode, capsnap, cf->tid,
+                            ceph_cap_string(capsnap->dirty));
+
+                       atomic_inc(&capsnap->nref);
                        spin_unlock(&ci->i_ceph_lock);
-                       break;
-               }
 
-               cf = rb_entry(n, struct ceph_cap_flush, i_node);
+                       ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+                                               oldest_flush_tid);
+                       if (ret < 0) {
+                               pr_err("kick_flushing_caps: error sending "
+                                       "cap flushsnap, ino (%llx.%llx) "
+                                       "tid %llu follows %llu\n",
+                                       ceph_vinop(inode), cf->tid,
+                                       capsnap->follows);
+                       }
 
-               first_tid = cf->tid + 1;
+                       ceph_put_cap_snap(capsnap);
+               }
 
-               dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
-                    cap, cf->tid, ceph_cap_string(cf->caps));
-               delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-                                     __ceph_caps_used(ci),
-                                     __ceph_caps_wanted(ci),
-                                     cap->issued | cap->implemented,
-                                     cf->caps, cf->tid, oldest_flush_tid);
+               spin_lock(&ci->i_ceph_lock);
        }
-       return delayed;
 }
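
__kick_flushing_caps() above walks a single i_cap_flush_list that mixes ordinary cap flushes and snap flushes: an entry whose caps field is 0 is the cap_flush embedded inside a ceph_cap_snap, and container_of() recovers the capsnap from the list entry. A standalone sketch of that embed-and-recover pattern with simplified stand-in types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cap_flush {
	unsigned long long tid;
	int caps;	/* 0 means "this entry belongs to a snapshot" */
};

struct cap_snap {
	unsigned long long follows;
	struct cap_flush cap_flush;	/* embedded flush entry */
};

int main(void)
{
	struct cap_snap snap = {
		.follows = 42,
		.cap_flush = { .tid = 7, .caps = 0 },
	};
	struct cap_flush *cf = &snap.cap_flush;	/* what the list walk sees */

	if (cf->caps == 0) {
		struct cap_snap *cs =
			container_of(cf, struct cap_snap, cap_flush);
		printf("snap flush, tid %llu follows %llu\n",
		       cf->tid, cs->follows);
	}
	return 0;
}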
 
 void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
@@ -2175,8 +2160,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
 {
        struct ceph_inode_info *ci;
        struct ceph_cap *cap;
+       u64 oldest_flush_tid;
 
        dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+       spin_unlock(&mdsc->cap_dirty_lock);
+
        list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
                spin_lock(&ci->i_ceph_lock);
                cap = ci->i_auth_cap;
@@ -2196,10 +2187,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
                 */
                if ((cap->issued & ci->i_flushing_caps) !=
                    ci->i_flushing_caps) {
-                       spin_unlock(&ci->i_ceph_lock);
-                       if (!__kick_flushing_caps(mdsc, session, ci))
-                               continue;
-                       spin_lock(&ci->i_ceph_lock);
+                       ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+                       __kick_flushing_caps(mdsc, session, ci,
+                                            oldest_flush_tid);
+               } else {
+                       ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
                }
 
                spin_unlock(&ci->i_ceph_lock);
@@ -2210,50 +2202,56 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                             struct ceph_mds_session *session)
 {
        struct ceph_inode_info *ci;
-
-       kick_flushing_capsnaps(mdsc, session);
+       struct ceph_cap *cap;
+       u64 oldest_flush_tid;
 
        dout("kick_flushing_caps mds%d\n", session->s_mds);
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+       spin_unlock(&mdsc->cap_dirty_lock);
+
        list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
-               int delayed = __kick_flushing_caps(mdsc, session, ci);
-               if (delayed) {
-                       spin_lock(&ci->i_ceph_lock);
-                       __cap_delay_requeue(mdsc, ci);
+               spin_lock(&ci->i_ceph_lock);
+               cap = ci->i_auth_cap;
+               if (!(cap && cap->session == session)) {
+                       pr_err("%p auth cap %p not mds%d ???\n",
+                               &ci->vfs_inode, cap, session->s_mds);
                        spin_unlock(&ci->i_ceph_lock);
+                       continue;
+               }
+               if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+                       ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+                       __kick_flushing_caps(mdsc, session, ci,
+                                            oldest_flush_tid);
                }
+               spin_unlock(&ci->i_ceph_lock);
        }
 }
 
 static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
                                     struct ceph_mds_session *session,
                                     struct inode *inode)
+       __releases(ci->i_ceph_lock)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_cap *cap;
 
-       spin_lock(&ci->i_ceph_lock);
        cap = ci->i_auth_cap;
        dout("kick_flushing_inode_caps %p flushing %s\n", inode,
             ceph_cap_string(ci->i_flushing_caps));
 
-       __ceph_flush_snaps(ci, &session, 1);
-
-       if (ci->i_flushing_caps) {
-               int delayed;
-
+       if (!list_empty(&ci->i_cap_flush_list)) {
+               u64 oldest_flush_tid;
                spin_lock(&mdsc->cap_dirty_lock);
                list_move_tail(&ci->i_flushing_item,
                               &cap->session->s_cap_flushing);
+               oldest_flush_tid = __get_oldest_flush_tid(mdsc);
                spin_unlock(&mdsc->cap_dirty_lock);
 
+               ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+               __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
                spin_unlock(&ci->i_ceph_lock);
-
-               delayed = __kick_flushing_caps(mdsc, session, ci);
-               if (delayed) {
-                       spin_lock(&ci->i_ceph_lock);
-                       __cap_delay_requeue(mdsc, ci);
-                       spin_unlock(&ci->i_ceph_lock);
-               }
        } else {
                spin_unlock(&ci->i_ceph_lock);
        }
@@ -2580,16 +2578,19 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
  * drop cap_snap that is not associated with any snapshot.
  * we don't need to send FLUSHSNAP message for it.
  */
-static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
+                                 struct ceph_cap_snap *capsnap)
 {
        if (!capsnap->need_flush &&
            !capsnap->writing && !capsnap->dirty_pages) {
-
                dout("dropping cap_snap %p follows %llu\n",
                     capsnap, capsnap->follows);
+               BUG_ON(capsnap->cap_flush.tid > 0);
                ceph_put_snap_context(capsnap->context);
+               if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
+                       ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+
                list_del(&capsnap->ci_item);
-               list_del(&capsnap->flushing_item);
                ceph_put_cap_snap(capsnap);
                return 1;
        }
@@ -2636,7 +2637,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
                                                        struct ceph_cap_snap,
                                                        ci_item);
                                capsnap->writing = 0;
-                               if (ceph_try_drop_cap_snap(capsnap))
+                               if (ceph_try_drop_cap_snap(ci, capsnap))
                                        put++;
                                else if (__ceph_finish_cap_snap(ci, capsnap))
                                        flushsnaps = 1;
@@ -2661,7 +2662,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
        if (last && !flushsnaps)
                ceph_check_caps(ci, 0, NULL);
        else if (flushsnaps)
-               ceph_flush_snaps(ci);
+               ceph_flush_snaps(ci, NULL);
        if (wake)
                wake_up_all(&ci->i_cap_wq);
        while (put-- > 0)
@@ -2679,15 +2680,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                                struct ceph_snap_context *snapc)
 {
        struct inode *inode = &ci->vfs_inode;
-       int last = 0;
-       int complete_capsnap = 0;
-       int drop_capsnap = 0;
-       int found = 0;
        struct ceph_cap_snap *capsnap = NULL;
+       int put = 0;
+       bool last = false;
+       bool found = false;
+       bool flush_snaps = false;
+       bool complete_capsnap = false;
 
        spin_lock(&ci->i_ceph_lock);
        ci->i_wrbuffer_ref -= nr;
-       last = !ci->i_wrbuffer_ref;
+       if (ci->i_wrbuffer_ref == 0) {
+               last = true;
+               put++;
+       }
 
        if (ci->i_head_snapc == snapc) {
                ci->i_wrbuffer_ref_head -= nr;
@@ -2707,15 +2712,22 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
        } else {
                list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
                        if (capsnap->context == snapc) {
-                               found = 1;
+                               found = true;
                                break;
                        }
                }
                BUG_ON(!found);
                capsnap->dirty_pages -= nr;
                if (capsnap->dirty_pages == 0) {
-                       complete_capsnap = 1;
-                       drop_capsnap = ceph_try_drop_cap_snap(capsnap);
+                       complete_capsnap = true;
+                       if (!capsnap->writing) {
+                               if (ceph_try_drop_cap_snap(ci, capsnap)) {
+                                       put++;
+                               } else {
+                                       ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+                                       flush_snaps = true;
+                               }
+                       }
                }
                dout("put_wrbuffer_cap_refs on %p cap_snap %p "
                     " snap %lld %d/%d -> %d/%d %s%s\n",
@@ -2730,12 +2742,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 
        if (last) {
                ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
-               iput(inode);
-       } else if (complete_capsnap) {
-               ceph_flush_snaps(ci);
-               wake_up_all(&ci->i_cap_wq);
+       } else if (flush_snaps) {
+               ceph_flush_snaps(ci, NULL);
        }
-       if (drop_capsnap)
+       if (complete_capsnap)
+               wake_up_all(&ci->i_cap_wq);
+       while (put-- > 0)
                iput(inode);
 }
 
@@ -2779,12 +2791,11 @@ static void invalidate_aliases(struct inode *inode)
  */
 static void handle_cap_grant(struct ceph_mds_client *mdsc,
                             struct inode *inode, struct ceph_mds_caps *grant,
-                            u64 inline_version,
-                            void *inline_data, int inline_len,
+                            struct ceph_string **pns, u64 inline_version,
+                            void *inline_data, u32 inline_len,
                             struct ceph_buffer *xattr_buf,
                             struct ceph_mds_session *session,
-                            struct ceph_cap *cap, int issued,
-                            u32 pool_ns_len)
+                            struct ceph_cap *cap, int issued)
        __releases(ci->i_ceph_lock)
        __releases(mdsc->snap_rwsem)
 {
@@ -2895,8 +2906,18 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 
        if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
                /* file layout may have changed */
-               ci->i_layout = grant->layout;
-               ci->i_pool_ns_len = pool_ns_len;
+               s64 old_pool = ci->i_layout.pool_id;
+               struct ceph_string *old_ns;
+
+               ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
+               old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+                                       lockdep_is_held(&ci->i_ceph_lock));
+               rcu_assign_pointer(ci->i_layout.pool_ns, *pns);
+
+               if (ci->i_layout.pool_id != old_pool || *pns != old_ns)
+                       ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+               *pns = old_ns;
 
                /* size/truncate_seq? */
                queue_trunc = ceph_fill_file_size(inode, issued,
@@ -2979,13 +3000,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
                        fill_inline = true;
        }
 
-       spin_unlock(&ci->i_ceph_lock);
-
        if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
-               kick_flushing_inode_caps(mdsc, session, inode);
-               up_read(&mdsc->snap_rwsem);
                if (newcaps & ~issued)
                        wake = true;
+               kick_flushing_inode_caps(mdsc, session, inode);
+               up_read(&mdsc->snap_rwsem);
+       } else {
+               spin_unlock(&ci->i_ceph_lock);
        }
 
        if (fill_inline)
@@ -3029,23 +3050,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
-       struct ceph_cap_flush *cf;
-       struct rb_node *n;
+       struct ceph_cap_flush *cf, *tmp_cf;
        LIST_HEAD(to_remove);
        unsigned seq = le32_to_cpu(m->seq);
        int dirty = le32_to_cpu(m->dirty);
        int cleaned = 0;
-       int drop = 0;
+       bool drop = false;
+       bool wake_ci = false;
+       bool wake_mdsc = false;
 
-       n = rb_first(&ci->i_cap_flush_tree);
-       while (n) {
-               cf = rb_entry(n, struct ceph_cap_flush, i_node);
-               n = rb_next(&cf->i_node);
+       list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
                if (cf->tid == flush_tid)
                        cleaned = cf->caps;
+               if (cf->caps == 0) /* capsnap */
+                       continue;
                if (cf->tid <= flush_tid) {
-                       rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
-                       list_add_tail(&cf->list, &to_remove);
+                       if (__finish_cap_flush(NULL, ci, cf))
+                               wake_ci = true;
+                       list_add_tail(&cf->i_list, &to_remove);
                } else {
                        cleaned &= ~cf->caps;
                        if (!cleaned)
@@ -3066,31 +3088,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 
        spin_lock(&mdsc->cap_dirty_lock);
 
-       if (!list_empty(&to_remove)) {
-               list_for_each_entry(cf, &to_remove, list)
-                       rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
-
-               n = rb_first(&mdsc->cap_flush_tree);
-               cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
-               if (!cf || cf->tid > flush_tid)
-                       wake_up_all(&mdsc->cap_flushing_wq);
+       list_for_each_entry(cf, &to_remove, i_list) {
+               if (__finish_cap_flush(mdsc, NULL, cf))
+                       wake_mdsc = true;
        }
 
        if (ci->i_flushing_caps == 0) {
-               list_del_init(&ci->i_flushing_item);
-               if (!list_empty(&session->s_cap_flushing))
-                       dout(" mds%d still flushing cap on %p\n",
-                            session->s_mds,
-                            &list_entry(session->s_cap_flushing.next,
-                                        struct ceph_inode_info,
-                                        i_flushing_item)->vfs_inode);
+               if (list_empty(&ci->i_cap_flush_list)) {
+                       list_del_init(&ci->i_flushing_item);
+                       if (!list_empty(&session->s_cap_flushing)) {
+                               dout(" mds%d still flushing cap on %p\n",
+                                    session->s_mds,
+                                    &list_first_entry(&session->s_cap_flushing,
+                                               struct ceph_inode_info,
+                                               i_flushing_item)->vfs_inode);
+                       }
+               }
                mdsc->num_cap_flushing--;
                dout(" inode %p now !flushing\n", inode);
 
                if (ci->i_dirty_caps == 0) {
                        dout(" inode %p now clean\n", inode);
                        BUG_ON(!list_empty(&ci->i_dirty_item));
-                       drop = 1;
+                       drop = true;
                        if (ci->i_wr_ref == 0 &&
                            ci->i_wrbuffer_ref_head == 0) {
                                BUG_ON(!ci->i_head_snapc);
@@ -3102,17 +3122,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                }
        }
        spin_unlock(&mdsc->cap_dirty_lock);
-       wake_up_all(&ci->i_cap_wq);
 
 out:
        spin_unlock(&ci->i_ceph_lock);
 
        while (!list_empty(&to_remove)) {
                cf = list_first_entry(&to_remove,
-                                     struct ceph_cap_flush, list);
-               list_del(&cf->list);
+                                     struct ceph_cap_flush, i_list);
+               list_del(&cf->i_list);
                ceph_free_cap_flush(cf);
        }
+
+       if (wake_ci)
+               wake_up_all(&ci->i_cap_wq);
+       if (wake_mdsc)
+               wake_up_all(&mdsc->cap_flushing_wq);
        if (drop)
                iput(inode);
 }
@@ -3131,7 +3155,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        u64 follows = le64_to_cpu(m->snap_follows);
        struct ceph_cap_snap *capsnap;
-       int drop = 0;
+       bool flushed = false;
+       bool wake_ci = false;
+       bool wake_mdsc = false;
 
        dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
             inode, ci, session->s_mds, follows);
@@ -3139,30 +3165,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
        spin_lock(&ci->i_ceph_lock);
        list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
                if (capsnap->follows == follows) {
-                       if (capsnap->flush_tid != flush_tid) {
+                       if (capsnap->cap_flush.tid != flush_tid) {
                                dout(" cap_snap %p follows %lld tid %lld !="
                                     " %lld\n", capsnap, follows,
-                                    flush_tid, capsnap->flush_tid);
+                                    flush_tid, capsnap->cap_flush.tid);
                                break;
                        }
-                       WARN_ON(capsnap->dirty_pages || capsnap->writing);
-                       dout(" removing %p cap_snap %p follows %lld\n",
-                            inode, capsnap, follows);
-                       ceph_put_snap_context(capsnap->context);
-                       list_del(&capsnap->ci_item);
-                       list_del(&capsnap->flushing_item);
-                       ceph_put_cap_snap(capsnap);
-                       wake_up_all(&mdsc->cap_flushing_wq);
-                       drop = 1;
+                       flushed = true;
                        break;
                } else {
                        dout(" skipping cap_snap %p follows %lld\n",
                             capsnap, capsnap->follows);
                }
        }
+       if (flushed) {
+               WARN_ON(capsnap->dirty_pages || capsnap->writing);
+               dout(" removing %p cap_snap %p follows %lld\n",
+                    inode, capsnap, follows);
+               list_del(&capsnap->ci_item);
+               if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
+                       wake_ci = true;
+
+               spin_lock(&mdsc->cap_dirty_lock);
+
+               if (list_empty(&ci->i_cap_flush_list))
+                       list_del_init(&ci->i_flushing_item);
+
+               if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
+                       wake_mdsc = true;
+
+               spin_unlock(&mdsc->cap_dirty_lock);
+       }
        spin_unlock(&ci->i_ceph_lock);
-       if (drop)
+       if (flushed) {
+               ceph_put_snap_context(capsnap->context);
+               ceph_put_cap_snap(capsnap);
+               if (wake_ci)
+                       wake_up_all(&ci->i_cap_wq);
+               if (wake_mdsc)
+                       wake_up_all(&mdsc->cap_flushing_wq);
                iput(inode);
+       }
 }
 
 /*
@@ -3267,7 +3310,8 @@ retry:
                        tcap->implemented |= issued;
                        if (cap == ci->i_auth_cap)
                                ci->i_auth_cap = tcap;
-                       if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+                       if (!list_empty(&ci->i_cap_flush_list) &&
+                           ci->i_auth_cap == tcap) {
                                spin_lock(&mdsc->cap_dirty_lock);
                                list_move_tail(&ci->i_flushing_item,
                                               &tcap->session->s_cap_flushing);
@@ -3420,20 +3464,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        struct ceph_cap *cap;
        struct ceph_mds_caps *h;
        struct ceph_mds_cap_peer *peer = NULL;
-       struct ceph_snap_realm *realm;
+       struct ceph_snap_realm *realm = NULL;
+       struct ceph_string *pool_ns = NULL;
        int mds = session->s_mds;
        int op, issued;
        u32 seq, mseq;
        struct ceph_vino vino;
-       u64 cap_id;
-       u64 size, max_size;
        u64 tid;
        u64 inline_version = 0;
        void *inline_data = NULL;
        u32  inline_len = 0;
        void *snaptrace;
        size_t snaptrace_len;
-       u32 pool_ns_len = 0;
        void *p, *end;
 
        dout("handle_caps from mds%d\n", mds);
@@ -3447,11 +3489,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        op = le32_to_cpu(h->op);
        vino.ino = le64_to_cpu(h->ino);
        vino.snap = CEPH_NOSNAP;
-       cap_id = le64_to_cpu(h->cap_id);
        seq = le32_to_cpu(h->seq);
        mseq = le32_to_cpu(h->migrate_seq);
-       size = le64_to_cpu(h->size);
-       max_size = le64_to_cpu(h->max_size);
 
        snaptrace = h + 1;
        snaptrace_len = le32_to_cpu(h->snap_trace_len);
@@ -3490,6 +3529,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                u64 flush_tid;
                u32 caller_uid, caller_gid;
                u32 osd_epoch_barrier;
+               u32 pool_ns_len;
                /* version >= 5 */
                ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
                /* version >= 6 */
@@ -3499,6 +3539,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                ceph_decode_32_safe(&p, end, caller_gid, bad);
                /* version >= 8 */
                ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+               if (pool_ns_len > 0) {
+                       ceph_decode_need(&p, end, pool_ns_len, bad);
+                       pool_ns = ceph_find_or_create_string(p, pool_ns_len);
+                       p += pool_ns_len;
+               }
        }
 
        /* lookup ino */
@@ -3519,7 +3564,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                        cap = ceph_get_cap(mdsc, NULL);
                        cap->cap_ino = vino.ino;
                        cap->queue_release = 1;
-                       cap->cap_id = cap_id;
+                       cap->cap_id = le64_to_cpu(h->cap_id);
                        cap->mseq = mseq;
                        cap->seq = seq;
                        spin_lock(&session->s_cap_lock);
@@ -3554,10 +3599,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                }
                handle_cap_import(mdsc, inode, h, peer, session,
                                  &cap, &issued);
-               handle_cap_grant(mdsc, inode, h,
+               handle_cap_grant(mdsc, inode, h, &pool_ns,
                                 inline_version, inline_data, inline_len,
-                                msg->middle, session, cap, issued,
-                                pool_ns_len);
+                                msg->middle, session, cap, issued);
                if (realm)
                        ceph_put_snap_realm(mdsc, realm);
                goto done_unlocked;
@@ -3579,10 +3623,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        case CEPH_CAP_OP_GRANT:
                __ceph_caps_issued(ci, &issued);
                issued |= __ceph_caps_dirty(ci);
-               handle_cap_grant(mdsc, inode, h,
+               handle_cap_grant(mdsc, inode, h, &pool_ns,
                                 inline_version, inline_data, inline_len,
-                                msg->middle, session, cap, issued,
-                                pool_ns_len);
+                                msg->middle, session, cap, issued);
                goto done_unlocked;
 
        case CEPH_CAP_OP_FLUSH_ACK:
@@ -3613,6 +3656,7 @@ done:
        mutex_unlock(&session->s_mutex);
 done_unlocked:
        iput(inode);
+       ceph_put_string(pool_ns);
        return;
 
 bad:
@@ -3673,6 +3717,16 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
        dout("flush_dirty_caps done\n");
 }
 
+void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+{
+       int i;
+       int bits = (fmode << 1) | 1;
+       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+               if (bits & (1 << i))
+                       ci->i_nr_by_mode[i]++;
+       }
+}
+
 /*
  * Drop open file reference.  If we were the last open file,
  * we may need to release capabilities to the MDS (or schedule
@@ -3680,15 +3734,20 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
  */
 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
 {
-       struct inode *inode = &ci->vfs_inode;
-       int last = 0;
-
+       int i, last = 0;
+       int bits = (fmode << 1) | 1;
        spin_lock(&ci->i_ceph_lock);
-       dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
-            ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
-       BUG_ON(ci->i_nr_by_mode[fmode] == 0);
-       if (--ci->i_nr_by_mode[fmode] == 0)
-               last++;
+       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+               if (bits & (1 << i)) {
+                       BUG_ON(ci->i_nr_by_mode[i] == 0);
+                       if (--ci->i_nr_by_mode[i] == 0)
+                               last++;
+               }
+       }
+       dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
+            &ci->vfs_inode, fmode,
+            ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
+            ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
        spin_unlock(&ci->i_ceph_lock);
 
        if (last && ci->i_vino.snap == CEPH_NOSNAP)
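
The caps hunks above replace the single per-fmode open counter with one counter per mode bit: (fmode << 1) | 1 always bumps slot 0 and then one slot per RD/WR/LAZY bit held, and ffs() maps a single mode flag back to its slot (as the lazyio ioctl hunk further down does). A minimal userspace sketch of that accounting, assuming the usual ceph mode values (RD=1, WR=2, LAZY=4, four counters) rather than the real kernel structures:

/* Userspace sketch of the per-bit i_nr_by_mode[] accounting above.
 * Mode values are assumptions matching include/linux/ceph/ceph_fs.h:
 * RD=1, WR=2, LAZY=4; slot 0 counts every open (the "pin" count). */
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define MODE_RD   1
#define MODE_WR   2
#define MODE_LAZY 4
#define MODE_BITS 4

static int nr_by_mode[MODE_BITS];

static void get_fmode(int fmode)
{
	int bits = (fmode << 1) | 1;	/* bit 0 = pin, bit i = mode 1 << (i - 1) */
	for (int i = 0; i < MODE_BITS; i++)
		if (bits & (1 << i))
			nr_by_mode[i]++;
}

static void put_fmode(int fmode)
{
	int bits = (fmode << 1) | 1;
	for (int i = 0; i < MODE_BITS; i++)
		if (bits & (1 << i))
			nr_by_mode[i]--;
}

int main(void)
{
	get_fmode(MODE_RD | MODE_WR);		/* an O_RDWR open   */
	get_fmode(MODE_RD);			/* an O_RDONLY open */
	nr_by_mode[ffs(MODE_LAZY)]++;		/* lazyio: bump only the LAZY slot */
	printf("{%d,%d,%d,%d}\n", nr_by_mode[0], nr_by_mode[1],
	       nr_by_mode[2], nr_by_mode[3]);	/* prints {2,2,1,1} */
	put_fmode(MODE_RD | MODE_WR);
	put_fmode(MODE_RD);
	return 0;
}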
index 6e0fedf..c64a0b7 100644 (file)
@@ -59,7 +59,7 @@ int ceph_init_dentry(struct dentry *dentry)
 
        di->dentry = dentry;
        di->lease_session = NULL;
-       dentry->d_time = jiffies;
+       di->time = jiffies;
        /* avoid reordering d_fsdata setup so that the check above is safe */
        smp_mb();
        dentry->d_fsdata = di;
@@ -1124,7 +1124,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 void ceph_invalidate_dentry_lease(struct dentry *dentry)
 {
        spin_lock(&dentry->d_lock);
-       dentry->d_time = jiffies;
+       ceph_dentry(dentry)->time = jiffies;
        ceph_dentry(dentry)->lease_shared_gen = 0;
        spin_unlock(&dentry->d_lock);
 }
@@ -1133,7 +1133,8 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
  * Check if dentry lease is valid.  If not, delete the lease.  Try to
  * renew if the lease is more than half up.
  */
-static int dentry_lease_is_valid(struct dentry *dentry)
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
+                                struct inode *dir)
 {
        struct ceph_dentry_info *di;
        struct ceph_mds_session *s;
@@ -1141,12 +1142,11 @@ static int dentry_lease_is_valid(struct dentry *dentry)
        u32 gen;
        unsigned long ttl;
        struct ceph_mds_session *session = NULL;
-       struct inode *dir = NULL;
        u32 seq = 0;
 
        spin_lock(&dentry->d_lock);
        di = ceph_dentry(dentry);
-       if (di->lease_session) {
+       if (di && di->lease_session) {
                s = di->lease_session;
                spin_lock(&s->s_gen_ttl_lock);
                gen = s->s_cap_gen;
@@ -1154,17 +1154,24 @@ static int dentry_lease_is_valid(struct dentry *dentry)
                spin_unlock(&s->s_gen_ttl_lock);
 
                if (di->lease_gen == gen &&
-                   time_before(jiffies, dentry->d_time) &&
+                   time_before(jiffies, di->time) &&
                    time_before(jiffies, ttl)) {
                        valid = 1;
                        if (di->lease_renew_after &&
                            time_after(jiffies, di->lease_renew_after)) {
-                               /* we should renew */
-                               dir = d_inode(dentry->d_parent);
-                               session = ceph_get_mds_session(s);
-                               seq = di->lease_seq;
-                               di->lease_renew_after = 0;
-                               di->lease_renew_from = jiffies;
+                               /*
+                                * We should renew. If we're in RCU walk mode
+                                * though, we can't do that so just return
+                                * -ECHILD.
+                                */
+                               if (flags & LOOKUP_RCU) {
+                                       valid = -ECHILD;
+                               } else {
+                                       session = ceph_get_mds_session(s);
+                                       seq = di->lease_seq;
+                                       di->lease_renew_after = 0;
+                                       di->lease_renew_from = jiffies;
+                               }
                        }
                }
        }
@@ -1207,15 +1214,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
        struct dentry *parent;
        struct inode *dir;
 
-       if (flags & LOOKUP_RCU)
-               return -ECHILD;
+       if (flags & LOOKUP_RCU) {
+               parent = ACCESS_ONCE(dentry->d_parent);
+               dir = d_inode_rcu(parent);
+               if (!dir)
+                       return -ECHILD;
+       } else {
+               parent = dget_parent(dentry);
+               dir = d_inode(parent);
+       }
 
        dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
             dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
 
-       parent = dget_parent(dentry);
-       dir = d_inode(parent);
-
        /* always trust cached snapped dentries, snapdir dentry */
        if (ceph_snap(dir) != CEPH_NOSNAP) {
                dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1224,12 +1235,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
        } else if (d_really_is_positive(dentry) &&
                   ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
                valid = 1;
-       } else if (dentry_lease_is_valid(dentry) ||
-                  dir_lease_is_valid(dir, dentry)) {
-               if (d_really_is_positive(dentry))
-                       valid = ceph_is_any_caps(d_inode(dentry));
-               else
-                       valid = 1;
+       } else {
+               valid = dentry_lease_is_valid(dentry, flags, dir);
+               if (valid == -ECHILD)
+                       return valid;
+               if (valid || dir_lease_is_valid(dir, dentry)) {
+                       if (d_really_is_positive(dentry))
+                               valid = ceph_is_any_caps(d_inode(dentry));
+                       else
+                               valid = 1;
+               }
        }
 
        if (!valid) {
@@ -1238,6 +1253,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
                struct ceph_mds_request *req;
                int op, mask, err;
 
+               if (flags & LOOKUP_RCU)
+                       return -ECHILD;
+
                op = ceph_snap(dir) == CEPH_SNAPDIR ?
                        CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
                req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
@@ -1273,7 +1291,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
                ceph_dir_clear_complete(dir);
        }
 
-       dput(parent);
+       if (!(flags & LOOKUP_RCU))
+               dput(parent);
        return valid;
 }
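
The d_revalidate changes just above let the dentry check run in RCU-walk mode: anything that would block (renewing a lease, sending a LOOKUP to the MDS) makes the function return -ECHILD so the VFS retries in ref-walk mode, and dput(parent) is only needed on the non-RCU path. A loose userspace analogue of that three-valued convention, with lease_ok()/needs_renew() as made-up stand-ins and LOOKUP_RCU's value assumed from include/linux/namei.h:

/* Analogue of the control flow above: 1 = dentry still valid,
 * 0 = invalidate it, -ECHILD = cannot decide without blocking,
 * so the VFS should drop out of RCU-walk and call again. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define LOOKUP_RCU 0x0040		/* assumed value */

static bool lease_ok(void)    { return true; }	/* lease not expired yet   */
static bool needs_renew(void) { return true; }	/* past the renew deadline */

static int revalidate(unsigned int flags)
{
	if (lease_ok()) {
		if (needs_renew() && (flags & LOOKUP_RCU))
			return -ECHILD;	/* renewal sends a message; not in RCU walk */
		return 1;
	}
	if (flags & LOOKUP_RCU)
		return -ECHILD;		/* a full LOOKUP to the server blocks too */
	return 0;			/* ref-walk path would ask the MDS here */
}

int main(void)
{
	printf("rcu-walk: %d\n", revalidate(LOOKUP_RCU));	/* -ECHILD (-10 on Linux) */
	printf("ref-walk: %d\n", revalidate(0));		/* 1 */
	return 0;
}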
 
@@ -1286,10 +1305,14 @@ static void ceph_d_release(struct dentry *dentry)
 
        dout("d_release %p\n", dentry);
        ceph_dentry_lru_del(dentry);
+
+       spin_lock(&dentry->d_lock);
+       dentry->d_fsdata = NULL;
+       spin_unlock(&dentry->d_lock);
+
        if (di->lease_session)
                ceph_put_mds_session(di->lease_session);
        kmem_cache_free(ceph_dentry_cachep, di);
-       dentry->d_fsdata = NULL;
 }
 
 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
index 0daaf7c..0f5375d 100644 (file)
@@ -708,7 +708,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
                }
        }
 
-       ceph_put_page_vector(osd_data->pages, num_pages, false);
+       ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write);
        ceph_osdc_put_request(req);
 
        if (rc < 0)
@@ -821,6 +821,54 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
        }
 }
 
+/*
+ * Wait on any unsafe replies for the given inode.  First wait on the
+ * newest request, and make that the upper bound.  Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+void ceph_sync_write_wait(struct inode *inode)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct list_head *head = &ci->i_unsafe_writes;
+       struct ceph_osd_request *req;
+       u64 last_tid;
+
+       if (!S_ISREG(inode->i_mode))
+               return;
+
+       spin_lock(&ci->i_unsafe_lock);
+       if (list_empty(head))
+               goto out;
+
+       /* set upper bound as _last_ entry in chain */
+
+       req = list_last_entry(head, struct ceph_osd_request,
+                             r_unsafe_item);
+       last_tid = req->r_tid;
+
+       do {
+               ceph_osdc_get_request(req);
+               spin_unlock(&ci->i_unsafe_lock);
+
+               dout("sync_write_wait on tid %llu (until %llu)\n",
+                    req->r_tid, last_tid);
+               wait_for_completion(&req->r_safe_completion);
+               ceph_osdc_put_request(req);
+
+               spin_lock(&ci->i_unsafe_lock);
+               /*
+                * from here on look at first entry in chain, since we
+                * only want to wait for anything older than last_tid
+                */
+               if (list_empty(head))
+                       break;
+               req = list_first_entry(head, struct ceph_osd_request,
+                                      r_unsafe_item);
+       } while (req->r_tid < last_tid);
+out:
+       spin_unlock(&ci->i_unsafe_lock);
+}
 
 static ssize_t
 ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
@@ -964,7 +1012,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                                len = ret;
                }
 
-               ceph_put_page_vector(pages, num_pages, false);
+               ceph_put_page_vector(pages, num_pages, !write);
 
                ceph_osdc_put_request(req);
                if (ret < 0)
@@ -985,6 +1033,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
        }
 
        if (aio_req) {
+               LIST_HEAD(osd_reqs);
+
                if (aio_req->num_reqs == 0) {
                        kfree(aio_req);
                        return ret;
@@ -993,8 +1043,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
                                              CEPH_CAP_FILE_RD);
 
-               while (!list_empty(&aio_req->osd_reqs)) {
-                       req = list_first_entry(&aio_req->osd_reqs,
+               list_splice(&aio_req->osd_reqs, &osd_reqs);
+               while (!list_empty(&osd_reqs)) {
+                       req = list_first_entry(&osd_reqs,
                                               struct ceph_osd_request,
                                               r_unsafe_item);
                        list_del_init(&req->r_unsafe_item);
@@ -1448,16 +1499,14 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 {
        struct inode *inode = file->f_mapping->host;
        loff_t i_size;
-       int ret;
+       loff_t ret;
 
        inode_lock(inode);
 
        if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
                ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
-               if (ret < 0) {
-                       offset = ret;
+               if (ret < 0)
                        goto out;
-               }
        }
 
        i_size = i_size_read(inode);
@@ -1473,7 +1522,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
                 * write() or lseek() might have altered it
                 */
                if (offset == 0) {
-                       offset = file->f_pos;
+                       ret = file->f_pos;
                        goto out;
                }
                offset += file->f_pos;
@@ -1493,11 +1542,11 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
                break;
        }
 
-       offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+       ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
        inode_unlock(inode);
-       return offset;
+       return ret;
 }
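
The ceph_llseek hunks above widen the scratch variable from int to loff_t and funnel both error codes and the final position through it; keeping seek results in a 64-bit loff_t matters because an int cannot hold positions past 2 GiB. A tiny illustration of what that truncation would look like:

/* Why seek results live in loff_t (long long) rather than int:
 * a 3 GiB position does not fit in a 32-bit int and wraps negative,
 * which a caller would then mistake for an error code. */
#include <stdio.h>

int main(void)
{
	long long pos = 3LL * 1024 * 1024 * 1024;	/* 3 GiB */
	int narrow = (int)pos;				/* truncates */

	printf("loff_t: %lld\n", pos);		/* 3221225472 */
	printf("int:    %d\n", narrow);		/* -1073741824 where int is 32-bit */
	return 0;
}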
 
 static inline void ceph_zero_partial_page(
@@ -1583,9 +1632,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
 {
        int ret = 0;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
-       s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
-       s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+       s32 stripe_unit = ci->i_layout.stripe_unit;
+       s32 stripe_count = ci->i_layout.stripe_count;
+       s32 object_size = ci->i_layout.object_size;
        u64 object_set_size = object_size * stripe_count;
        u64 nearly, t;
 
index 99bdef6..dd3a6db 100644 (file)
@@ -446,7 +446,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_symlink = NULL;
 
        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
-       ci->i_pool_ns_len = 0;
+       RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
 
        ci->i_fragtree = RB_ROOT;
        mutex_init(&ci->i_fragtree_mutex);
@@ -468,7 +468,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        INIT_LIST_HEAD(&ci->i_dirty_item);
        INIT_LIST_HEAD(&ci->i_flushing_item);
        ci->i_prealloc_cap_flush = NULL;
-       ci->i_cap_flush_tree = RB_ROOT;
+       INIT_LIST_HEAD(&ci->i_cap_flush_list);
        init_waitqueue_head(&ci->i_cap_wq);
        ci->i_hold_caps_min = 0;
        ci->i_hold_caps_max = 0;
@@ -477,7 +477,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_head_snapc = NULL;
        ci->i_snap_caps = 0;
 
-       for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+       for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
                ci->i_nr_by_mode[i] = 0;
 
        mutex_init(&ci->i_truncate_mutex);
@@ -570,6 +570,8 @@ void ceph_destroy_inode(struct inode *inode)
        if (ci->i_xattrs.prealloc_blob)
                ceph_buffer_put(ci->i_xattrs.prealloc_blob);
 
+       ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+
        call_rcu(&inode->i_rcu, ceph_i_callback);
 }
 
@@ -583,6 +585,14 @@ int ceph_drop_inode(struct inode *inode)
        return 1;
 }
 
+void ceph_evict_inode(struct inode *inode)
+{
+       /* wait unsafe sync writes */
+       ceph_sync_write_wait(inode);
+       truncate_inode_pages_final(&inode->i_data);
+       clear_inode(inode);
+}
+
 static inline blkcnt_t calc_inode_blocks(u64 size)
 {
        return (size + (1<<9) - 1) >> 9;
@@ -733,6 +743,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
        int issued = 0, implemented, new_issued;
        struct timespec mtime, atime, ctime;
        struct ceph_buffer *xattr_blob = NULL;
+       struct ceph_string *pool_ns = NULL;
        struct ceph_cap *new_cap = NULL;
        int err = 0;
        bool wake = false;
@@ -760,6 +771,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                               iinfo->xattr_len);
        }
 
+       if (iinfo->pool_ns_len > 0)
+               pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
+                                                    iinfo->pool_ns_len);
+
        spin_lock(&ci->i_ceph_lock);
 
        /*
@@ -814,10 +829,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 
        if (new_version ||
            (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
-               if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+               s64 old_pool = ci->i_layout.pool_id;
+               struct ceph_string *old_ns;
+
+               ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
+               old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+                                       lockdep_is_held(&ci->i_ceph_lock));
+               rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
+
+               if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
                        ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
-               ci->i_layout = info->layout;
-               ci->i_pool_ns_len = iinfo->pool_ns_len;
+
+               pool_ns = old_ns;
 
                queue_trunc = ceph_fill_file_size(inode, issued,
                                        le32_to_cpu(info->truncate_seq),
@@ -985,6 +1008,7 @@ out:
                ceph_put_cap(mdsc, new_cap);
        if (xattr_blob)
                ceph_buffer_put(xattr_blob);
+       ceph_put_string(pool_ns);
        return err;
 }
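
In the fill_inode hunks above the pool namespace becomes an RCU-managed, refcounted ceph_string: the new string is obtained before i_ceph_lock is taken, published with rcu_assign_pointer() under the lock, and whichever pointer ends up unused (the displaced old one, or the new one if it was never assigned) is dropped with ceph_put_string() after unlocking. A loose userspace analogue of that swap-then-put pattern, with a mutex and a plain refcount standing in for the spinlock and RCU (the grace-period side of RCU is deliberately ignored here):

/* Userspace analogue of the pool_ns update: take the new string before
 * the lock, swap pointers under the lock, drop the displaced reference
 * after the lock.  struct refstring is a made-up stand-in. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct refstring {
	int refs;
	char str[];
};

static struct refstring *rs_new(const char *s)
{
	struct refstring *rs = malloc(sizeof(*rs) + strlen(s) + 1);
	rs->refs = 1;
	strcpy(rs->str, s);
	return rs;
}

static void rs_put(struct refstring *rs)
{
	if (rs && --rs->refs == 0)
		free(rs);
}

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct refstring *pool_ns;	/* the published pointer */

static void update_pool_ns(const char *name)
{
	struct refstring *new_ns = name ? rs_new(name) : NULL;
	struct refstring *old_ns;

	pthread_mutex_lock(&lock);
	old_ns = pool_ns;
	pool_ns = new_ns;		/* kernel code: rcu_assign_pointer() */
	pthread_mutex_unlock(&lock);

	rs_put(old_ns);			/* drop the displaced reference */
}

int main(void)
{
	update_pool_ns("ns1");
	update_pool_ns("ns2");
	printf("pool_ns = %s\n", pool_ns ? pool_ns->str : "(null)");
	update_pool_ns(NULL);
	return 0;
}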
 
@@ -1018,7 +1042,7 @@ static void update_dentry_lease(struct dentry *dentry,
                goto out_unlock;
 
        if (di->lease_gen == session->s_cap_gen &&
-           time_before(ttl, dentry->d_time))
+           time_before(ttl, di->time))
                goto out_unlock;  /* we already have a newer lease. */
 
        if (di->lease_session && di->lease_session != session)
@@ -1032,7 +1056,7 @@ static void update_dentry_lease(struct dentry *dentry,
        di->lease_seq = le32_to_cpu(lease->seq);
        di->lease_renew_after = half_ttl;
        di->lease_renew_from = 0;
-       dentry->d_time = ttl;
+       di->time = ttl;
 out_unlock:
        spin_unlock(&dentry->d_lock);
        return;
index be6b165..7d752d5 100644 (file)
@@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 
        err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
        if (!err) {
-               l.stripe_unit = ceph_file_layout_su(ci->i_layout);
-               l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
-               l.object_size = ceph_file_layout_object_size(ci->i_layout);
-               l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+               l.stripe_unit = ci->i_layout.stripe_unit;
+               l.stripe_count = ci->i_layout.stripe_count;
+               l.object_size = ci->i_layout.object_size;
+               l.data_pool = ci->i_layout.pool_id;
                l.preferred_osd = (s32)-1;
                if (copy_to_user(arg, &l, sizeof(l)))
                        return -EFAULT;
@@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
        if (l.stripe_count)
                nl.stripe_count = l.stripe_count;
        else
-               nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+               nl.stripe_count = ci->i_layout.stripe_count;
        if (l.stripe_unit)
                nl.stripe_unit = l.stripe_unit;
        else
-               nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+               nl.stripe_unit = ci->i_layout.stripe_unit;
        if (l.object_size)
                nl.object_size = l.object_size;
        else
-               nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+               nl.object_size = ci->i_layout.object_size;
        if (l.data_pool)
                nl.data_pool = l.data_pool;
        else
-               nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+               nl.data_pool = ci->i_layout.pool_id;
 
        /* this is obsolete, and always -1 */
        nl.preferred_osd = le64_to_cpu(-1);
@@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        struct ceph_osd_client *osdc =
                &ceph_sb_to_client(inode->i_sb)->client->osdc;
        struct ceph_object_locator oloc;
-       struct ceph_object_id oid;
+       CEPH_DEFINE_OID_ONSTACK(oid);
        u64 len = 1, olen;
        u64 tmp;
        struct ceph_pg pgid;
@@ -202,8 +202,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
                return -EIO;
        }
        dl.file_offset -= dl.object_offset;
-       dl.object_size = ceph_file_layout_object_size(ci->i_layout);
-       dl.block_size = ceph_file_layout_su(ci->i_layout);
+       dl.object_size = ci->i_layout.object_size;
+       dl.block_size = ci->i_layout.stripe_unit;
 
        /* block_offset = object_offset % block_size */
        tmp = dl.object_offset;
@@ -212,10 +212,13 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
                 ceph_ino(inode), dl.object_no);
 
-       oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+       oloc.pool = ci->i_layout.pool_id;
+       oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
        ceph_oid_printf(&oid, "%s", dl.object_name);
 
        r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
+
+       ceph_oloc_destroy(&oloc);
        if (r < 0) {
                up_read(&osdc->lock);
                return r;
@@ -247,9 +250,8 @@ static long ceph_ioctl_lazyio(struct file *file)
 
        if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
                spin_lock(&ci->i_ceph_lock);
-               ci->i_nr_by_mode[fi->fmode]--;
                fi->fmode |= CEPH_FILE_MODE_LAZY;
-               ci->i_nr_by_mode[fi->fmode]++;
+               ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
                spin_unlock(&ci->i_ceph_lock);
                dout("ioctl_layzio: file %p marked lazy\n", file);
 
index 4e8678a..fa59a85 100644 (file)
@@ -48,7 +48,7 @@
 struct ceph_reconnect_state {
        int nr_caps;
        struct ceph_pagelist *pagelist;
-       bool flock;
+       unsigned msg_version;
 };
 
 static void __wake_requests(struct ceph_mds_client *mdsc,
@@ -100,12 +100,15 @@ static int parse_reply_info_in(void **p, void *end,
        } else
                info->inline_version = CEPH_INLINE_NONE;
 
+       info->pool_ns_len = 0;
+       info->pool_ns_data = NULL;
        if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
                ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
-               ceph_decode_need(p, end, info->pool_ns_len, bad);
-               *p += info->pool_ns_len;
-       } else {
-               info->pool_ns_len = 0;
+               if (info->pool_ns_len > 0) {
+                       ceph_decode_need(p, end, info->pool_ns_len, bad);
+                       info->pool_ns_data = *p;
+                       *p += info->pool_ns_len;
+               }
        }
 
        return 0;
@@ -469,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
        s->s_cap_iterator = NULL;
        INIT_LIST_HEAD(&s->s_cap_releases);
        INIT_LIST_HEAD(&s->s_cap_flushing);
-       INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
 
        dout("register_session mds%d\n", mds);
        if (mds >= mdsc->max_sessions) {
@@ -1145,19 +1147,17 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                    ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                        invalidate = true;
 
-               while (true) {
-                       struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
-                       if (!n)
-                               break;
-                       cf = rb_entry(n, struct ceph_cap_flush, i_node);
-                       rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
-                       list_add(&cf->list, &to_remove);
+               while (!list_empty(&ci->i_cap_flush_list)) {
+                       cf = list_first_entry(&ci->i_cap_flush_list,
+                                             struct ceph_cap_flush, i_list);
+                       list_del(&cf->i_list);
+                       list_add(&cf->i_list, &to_remove);
                }
 
                spin_lock(&mdsc->cap_dirty_lock);
 
-               list_for_each_entry(cf, &to_remove, list)
-                       rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+               list_for_each_entry(cf, &to_remove, i_list)
+                       list_del(&cf->g_list);
 
                if (!list_empty(&ci->i_dirty_item)) {
                        pr_warn_ratelimited(
@@ -1181,7 +1181,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                spin_unlock(&mdsc->cap_dirty_lock);
 
                if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
-                       list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+                       list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
                        ci->i_prealloc_cap_flush = NULL;
                }
        }
@@ -1189,8 +1189,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
        while (!list_empty(&to_remove)) {
                struct ceph_cap_flush *cf;
                cf = list_first_entry(&to_remove,
-                                     struct ceph_cap_flush, list);
-               list_del(&cf->list);
+                                     struct ceph_cap_flush, i_list);
+               list_del(&cf->i_list);
                ceph_free_cap_flush(cf);
        }
 
@@ -1212,6 +1212,8 @@ static void remove_session_caps(struct ceph_mds_session *session)
        dout("remove_session_caps on %p\n", session);
        iterate_session_caps(session, remove_session_caps_cb, fsc);
 
+       wake_up_all(&fsc->mdsc->cap_flushing_wq);
+
        spin_lock(&session->s_cap_lock);
        if (session->s_nr_caps > 0) {
                struct inode *inode;
@@ -1478,35 +1480,21 @@ static int trim_caps(struct ceph_mds_client *mdsc,
        return 0;
 }
 
-static int check_capsnap_flush(struct ceph_inode_info *ci,
-                              u64 want_snap_seq)
-{
-       int ret = 1;
-       spin_lock(&ci->i_ceph_lock);
-       if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
-               struct ceph_cap_snap *capsnap =
-                       list_first_entry(&ci->i_cap_snaps,
-                                        struct ceph_cap_snap, ci_item);
-               ret = capsnap->follows >= want_snap_seq;
-       }
-       spin_unlock(&ci->i_ceph_lock);
-       return ret;
-}
-
 static int check_caps_flush(struct ceph_mds_client *mdsc,
                            u64 want_flush_tid)
 {
-       struct rb_node *n;
-       struct ceph_cap_flush *cf;
        int ret = 1;
 
        spin_lock(&mdsc->cap_dirty_lock);
-       n = rb_first(&mdsc->cap_flush_tree);
-       cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
-       if (cf && cf->tid <= want_flush_tid) {
-               dout("check_caps_flush still flushing tid %llu <= %llu\n",
-                    cf->tid, want_flush_tid);
-               ret = 0;
+       if (!list_empty(&mdsc->cap_flush_list)) {
+               struct ceph_cap_flush *cf =
+                       list_first_entry(&mdsc->cap_flush_list,
+                                        struct ceph_cap_flush, g_list);
+               if (cf->tid <= want_flush_tid) {
+                       dout("check_caps_flush still flushing tid "
+                            "%llu <= %llu\n", cf->tid, want_flush_tid);
+                       ret = 0;
+               }
        }
        spin_unlock(&mdsc->cap_dirty_lock);
        return ret;
@@ -1518,54 +1506,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
  * returns true if we've flushed through want_flush_tid
  */
 static void wait_caps_flush(struct ceph_mds_client *mdsc,
-                           u64 want_flush_tid, u64 want_snap_seq)
+                           u64 want_flush_tid)
 {
-       int mds;
-
-       dout("check_caps_flush want %llu snap want %llu\n",
-            want_flush_tid, want_snap_seq);
-       mutex_lock(&mdsc->mutex);
-       for (mds = 0; mds < mdsc->max_sessions; ) {
-               struct ceph_mds_session *session = mdsc->sessions[mds];
-               struct inode *inode = NULL;
-
-               if (!session) {
-                       mds++;
-                       continue;
-               }
-               get_session(session);
-               mutex_unlock(&mdsc->mutex);
-
-               mutex_lock(&session->s_mutex);
-               if (!list_empty(&session->s_cap_snaps_flushing)) {
-                       struct ceph_cap_snap *capsnap =
-                               list_first_entry(&session->s_cap_snaps_flushing,
-                                                struct ceph_cap_snap,
-                                                flushing_item);
-                       struct ceph_inode_info *ci = capsnap->ci;
-                       if (!check_capsnap_flush(ci, want_snap_seq)) {
-                               dout("check_cap_flush still flushing snap %p "
-                                    "follows %lld <= %lld to mds%d\n",
-                                    &ci->vfs_inode, capsnap->follows,
-                                    want_snap_seq, mds);
-                               inode = igrab(&ci->vfs_inode);
-                       }
-               }
-               mutex_unlock(&session->s_mutex);
-               ceph_put_mds_session(session);
-
-               if (inode) {
-                       wait_event(mdsc->cap_flushing_wq,
-                                  check_capsnap_flush(ceph_inode(inode),
-                                                      want_snap_seq));
-                       iput(inode);
-               } else {
-                       mds++;
-               }
-
-               mutex_lock(&mdsc->mutex);
-       }
-       mutex_unlock(&mdsc->mutex);
+       dout("check_caps_flush want %llu\n", want_flush_tid);
 
        wait_event(mdsc->cap_flushing_wq,
                   check_caps_flush(mdsc, want_flush_tid));
@@ -2163,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
        mds = __choose_mds(mdsc, req);
        if (mds < 0 ||
            ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+               if (mdsc->mdsmap_err) {
+                       err = mdsc->mdsmap_err;
+                       dout("do_request mdsmap err %d\n", err);
+                       goto finish;
+               }
                dout("do_request no mds or not active, waiting for map\n");
                list_add(&req->r_wait, &mdsc->waiting_for_map);
                goto out;
@@ -2292,14 +2240,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
                                  CEPH_CAP_PIN);
 
-       /* deny access to directories with pool_ns layouts */
-       if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
-           ceph_inode(req->r_inode)->i_pool_ns_len)
-               return -EIO;
-       if (req->r_locked_dir &&
-           ceph_inode(req->r_locked_dir)->i_pool_ns_len)
-               return -EIO;
-
        /* issue */
        mutex_lock(&mdsc->mutex);
        __register_request(mdsc, req, dir);
@@ -2791,13 +2731,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                struct ceph_mds_cap_reconnect v2;
                struct ceph_mds_cap_reconnect_v1 v1;
        } rec;
-       size_t reclen;
        struct ceph_inode_info *ci;
        struct ceph_reconnect_state *recon_state = arg;
        struct ceph_pagelist *pagelist = recon_state->pagelist;
        char *path;
        int pathlen, err;
        u64 pathbase;
+       u64 snap_follows;
        struct dentry *dentry;
 
        ci = cap->ci;
@@ -2820,9 +2760,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                path = NULL;
                pathlen = 0;
        }
-       err = ceph_pagelist_encode_string(pagelist, path, pathlen);
-       if (err)
-               goto out_free;
 
        spin_lock(&ci->i_ceph_lock);
        cap->seq = 0;        /* reset cap seq */
@@ -2830,14 +2767,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        cap->mseq = 0;       /* and migrate_seq */
        cap->cap_gen = cap->session->s_cap_gen;
 
-       if (recon_state->flock) {
+       if (recon_state->msg_version >= 2) {
                rec.v2.cap_id = cpu_to_le64(cap->cap_id);
                rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
                rec.v2.issued = cpu_to_le32(cap->issued);
                rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
                rec.v2.pathbase = cpu_to_le64(pathbase);
                rec.v2.flock_len = 0;
-               reclen = sizeof(rec.v2);
        } else {
                rec.v1.cap_id = cpu_to_le64(cap->cap_id);
                rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -2847,13 +2783,23 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
                rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
                rec.v1.pathbase = cpu_to_le64(pathbase);
-               reclen = sizeof(rec.v1);
+       }
+
+       if (list_empty(&ci->i_cap_snaps)) {
+               snap_follows = 0;
+       } else {
+               struct ceph_cap_snap *capsnap =
+                       list_first_entry(&ci->i_cap_snaps,
+                                        struct ceph_cap_snap, ci_item);
+               snap_follows = capsnap->follows;
        }
        spin_unlock(&ci->i_ceph_lock);
 
-       if (recon_state->flock) {
+       if (recon_state->msg_version >= 2) {
                int num_fcntl_locks, num_flock_locks;
                struct ceph_filelock *flocks;
+               size_t struct_len, total_len = 0;
+               u8 struct_v = 0;
 
 encode_again:
                ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
@@ -2872,20 +2818,51 @@ encode_again:
                                goto encode_again;
                        goto out_free;
                }
+
+               if (recon_state->msg_version >= 3) {
+                       /* version, compat_version and struct_len */
+                       total_len = 2 * sizeof(u8) + sizeof(u32);
+                       struct_v = 2;
+               }
                /*
                 * number of encoded locks is stable, so copy to pagelist
                 */
-               rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
-                                   (num_fcntl_locks+num_flock_locks) *
-                                   sizeof(struct ceph_filelock));
-               err = ceph_pagelist_append(pagelist, &rec, reclen);
-               if (!err)
-                       err = ceph_locks_to_pagelist(flocks, pagelist,
-                                                    num_fcntl_locks,
-                                                    num_flock_locks);
+               struct_len = 2 * sizeof(u32) +
+                           (num_fcntl_locks + num_flock_locks) *
+                           sizeof(struct ceph_filelock);
+               rec.v2.flock_len = cpu_to_le32(struct_len);
+
+               struct_len += sizeof(rec.v2);
+               struct_len += sizeof(u32) + pathlen;
+
+               if (struct_v >= 2)
+                       struct_len += sizeof(u64); /* snap_follows */
+
+               total_len += struct_len;
+               err = ceph_pagelist_reserve(pagelist, total_len);
+
+               if (!err) {
+                       if (recon_state->msg_version >= 3) {
+                               ceph_pagelist_encode_8(pagelist, struct_v);
+                               ceph_pagelist_encode_8(pagelist, 1);
+                               ceph_pagelist_encode_32(pagelist, struct_len);
+                       }
+                       ceph_pagelist_encode_string(pagelist, path, pathlen);
+                       ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+                       ceph_locks_to_pagelist(flocks, pagelist,
+                                              num_fcntl_locks,
+                                              num_flock_locks);
+                       if (struct_v >= 2)
+                               ceph_pagelist_encode_64(pagelist, snap_follows);
+               }
                kfree(flocks);
        } else {
-               err = ceph_pagelist_append(pagelist, &rec, reclen);
+               size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
+               err = ceph_pagelist_reserve(pagelist, size);
+               if (!err) {
+                       ceph_pagelist_encode_string(pagelist, path, pathlen);
+                       ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+               }
        }
 
        recon_state->nr_caps++;
@@ -2976,7 +2953,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 
        recon_state.nr_caps = 0;
        recon_state.pagelist = pagelist;
-       recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
+       if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+               recon_state.msg_version = 3;
+       else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
+               recon_state.msg_version = 2;
+       else
+               recon_state.msg_version = 1;
        err = iterate_session_caps(session, encode_caps_cb, &recon_state);
        if (err < 0)
                goto fail;
@@ -3005,8 +2987,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
                        goto fail;
        }
 
-       if (recon_state.flock)
-               reply->hdr.version = cpu_to_le16(2);
+       reply->hdr.version = cpu_to_le16(recon_state.msg_version);
 
        /* raced with cap release? */
        if (s_nr_caps != recon_state.nr_caps) {
@@ -3231,7 +3212,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
                                msecs_to_jiffies(le32_to_cpu(h->duration_ms));
 
                        di->lease_seq = seq;
-                       dentry->d_time = di->lease_renew_from + duration;
+                       di->time = di->lease_renew_from + duration;
                        di->lease_renew_after = di->lease_renew_from +
                                (duration >> 1);
                        di->lease_renew_from = 0;
@@ -3296,47 +3277,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
        ceph_con_send(&session->s_con, msg);
 }
 
-/*
- * Preemptively release a lease we expect to invalidate anyway.
- * Pass @inode always, @dentry is optional.
- */
-void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
-                            struct dentry *dentry)
-{
-       struct ceph_dentry_info *di;
-       struct ceph_mds_session *session;
-       u32 seq;
-
-       BUG_ON(inode == NULL);
-       BUG_ON(dentry == NULL);
-
-       /* is dentry lease valid? */
-       spin_lock(&dentry->d_lock);
-       di = ceph_dentry(dentry);
-       if (!di || !di->lease_session ||
-           di->lease_session->s_mds < 0 ||
-           di->lease_gen != di->lease_session->s_cap_gen ||
-           !time_before(jiffies, dentry->d_time)) {
-               dout("lease_release inode %p dentry %p -- "
-                    "no lease\n",
-                    inode, dentry);
-               spin_unlock(&dentry->d_lock);
-               return;
-       }
-
-       /* we do have a lease on this dentry; note mds and seq */
-       session = ceph_get_mds_session(di->lease_session);
-       seq = di->lease_seq;
-       __ceph_mdsc_drop_dentry_lease(dentry);
-       spin_unlock(&dentry->d_lock);
-
-       dout("lease_release inode %p dentry %p to mds%d\n",
-            inode, dentry, session->s_mds);
-       ceph_mdsc_lease_send_msg(session, inode, dentry,
-                                CEPH_MDS_LEASE_RELEASE, seq);
-       ceph_put_mds_session(session);
-}
-
 /*
  * drop all leases (and dentry refs) in preparation for umount
  */
@@ -3470,7 +3410,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        INIT_LIST_HEAD(&mdsc->snap_flush_list);
        spin_lock_init(&mdsc->snap_flush_lock);
        mdsc->last_cap_flush_tid = 1;
-       mdsc->cap_flush_tree = RB_ROOT;
+       INIT_LIST_HEAD(&mdsc->cap_flush_list);
        INIT_LIST_HEAD(&mdsc->cap_dirty);
        INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
        mdsc->num_cap_flushing = 0;
@@ -3585,7 +3525,7 @@ restart:
 
 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
-       u64 want_tid, want_flush, want_snap;
+       u64 want_tid, want_flush;
 
        if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return;
@@ -3598,17 +3538,19 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
        ceph_flush_dirty_caps(mdsc);
        spin_lock(&mdsc->cap_dirty_lock);
        want_flush = mdsc->last_cap_flush_tid;
+       if (!list_empty(&mdsc->cap_flush_list)) {
+               struct ceph_cap_flush *cf =
+                       list_last_entry(&mdsc->cap_flush_list,
+                                       struct ceph_cap_flush, g_list);
+               cf->wake = true;
+       }
        spin_unlock(&mdsc->cap_dirty_lock);
 
-       down_read(&mdsc->snap_rwsem);
-       want_snap = mdsc->last_snap_seq;
-       up_read(&mdsc->snap_rwsem);
-
-       dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
-            want_tid, want_flush, want_snap);
+       dout("sync want tid %lld flush_seq %lld\n",
+            want_tid, want_flush);
 
        wait_unsafe_requests(mdsc, want_tid);
-       wait_caps_flush(mdsc, want_flush, want_snap);
+       wait_caps_flush(mdsc, want_flush);
 }
 
 /*
@@ -3729,11 +3671,86 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
        dout("mdsc_destroy %p done\n", mdsc);
 }
 
+void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+       struct ceph_fs_client *fsc = mdsc->fsc;
+       const char *mds_namespace = fsc->mount_options->mds_namespace;
+       void *p = msg->front.iov_base;
+       void *end = p + msg->front.iov_len;
+       u32 epoch;
+       u32 map_len;
+       u32 num_fs;
+       u32 mount_fscid = (u32)-1;
+       u8 struct_v, struct_cv;
+       int err = -EINVAL;
+
+       ceph_decode_need(&p, end, sizeof(u32), bad);
+       epoch = ceph_decode_32(&p);
+
+       dout("handle_fsmap epoch %u\n", epoch);
+
+       ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+       struct_v = ceph_decode_8(&p);
+       struct_cv = ceph_decode_8(&p);
+       map_len = ceph_decode_32(&p);
+
+       ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
+       p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
+
+       num_fs = ceph_decode_32(&p);
+       while (num_fs-- > 0) {
+               void *info_p, *info_end;
+               u32 info_len;
+               u8 info_v, info_cv;
+               u32 fscid, namelen;
+
+               ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+               info_v = ceph_decode_8(&p);
+               info_cv = ceph_decode_8(&p);
+               info_len = ceph_decode_32(&p);
+               ceph_decode_need(&p, end, info_len, bad);
+               info_p = p;
+               info_end = p + info_len;
+               p = info_end;
+
+               ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
+               fscid = ceph_decode_32(&info_p);
+               namelen = ceph_decode_32(&info_p);
+               ceph_decode_need(&info_p, info_end, namelen, bad);
+
+               if (mds_namespace &&
+                   strlen(mds_namespace) == namelen &&
+                   !strncmp(mds_namespace, (char *)info_p, namelen)) {
+                       mount_fscid = fscid;
+                       break;
+               }
+       }
+
+       ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
+       if (mount_fscid != (u32)-1) {
+               fsc->client->monc.fs_cluster_id = mount_fscid;
+               ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+                                  0, true);
+               ceph_monc_renew_subs(&fsc->client->monc);
+       } else {
+               err = -ENOENT;
+               goto err_out;
+       }
+       return;
+bad:
+       pr_err("error decoding fsmap\n");
+err_out:
+       mutex_lock(&mdsc->mutex);
+       mdsc->mdsmap_err = -ENOENT;
+       __wake_requests(mdsc, &mdsc->waiting_for_map);
+       mutex_unlock(&mdsc->mutex);
+       return;
+}
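
ceph_mdsc_handle_fsmap above follows Ceph's usual versioned encoding: each element starts with a version byte, a compat byte and a 32-bit length, and every read is bounds-checked against that length so fields appended by newer encoders are skipped rather than overrun. A standalone sketch of that header-then-bounded-payload pattern, with hand-rolled little-endian reads standing in for the ceph_decode_* helpers:

/* Sketch of the (struct_v, struct_cv, struct_len) decode pattern:
 * read the header, check that struct_len bytes are really present,
 * then confine further reads to that window and skip what is left. */
#include <stdint.h>
#include <stdio.h>

static int get_u8(const uint8_t **p, const uint8_t *end, uint8_t *v)
{
	if (end - *p < 1)
		return -1;
	*v = *(*p)++;
	return 0;
}

static int get_le32(const uint8_t **p, const uint8_t *end, uint32_t *v)
{
	if (end - *p < 4)
		return -1;
	*v = (uint32_t)(*p)[0] | (uint32_t)(*p)[1] << 8 |
	     (uint32_t)(*p)[2] << 16 | (uint32_t)(*p)[3] << 24;
	*p += 4;
	return 0;
}

static int decode_element(const uint8_t **p, const uint8_t *end)
{
	uint8_t struct_v, struct_cv;
	uint32_t struct_len, first_field;
	const uint8_t *elem_end;

	if (get_u8(p, end, &struct_v) || get_u8(p, end, &struct_cv) ||
	    get_le32(p, end, &struct_len))
		return -1;
	if ((size_t)(end - *p) < struct_len)
		return -1;			/* truncated element */
	elem_end = *p + struct_len;

	if (get_le32(p, elem_end, &first_field))
		return -1;			/* reads stay inside the element */
	printf("v%u (compat %u), len %u, first field %u\n",
	       struct_v, struct_cv, struct_len, first_field);

	*p = elem_end;				/* skip any fields we don't know */
	return 0;
}

int main(void)
{
	/* v=2, compat=1, len=8, payload = {7, 42}, all little-endian */
	uint8_t buf[] = { 2, 1, 8,0,0,0, 7,0,0,0, 42,0,0,0 };
	const uint8_t *p = buf, *end = buf + sizeof(buf);

	if (decode_element(&p, end))
		fprintf(stderr, "bad element\n");
	return 0;
}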
 
 /*
  * handle mds map update.
  */
-void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 {
        u32 epoch;
        u32 maplen;
@@ -3840,7 +3857,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 
        switch (type) {
        case CEPH_MSG_MDS_MAP:
-               ceph_mdsc_handle_map(mdsc, msg);
+               ceph_mdsc_handle_mdsmap(mdsc, msg);
+               break;
+       case CEPH_MSG_FS_MAP_USER:
+               ceph_mdsc_handle_fsmap(mdsc, msg);
                break;
        case CEPH_MSG_CLIENT_SESSION:
                handle_session(s, msg);
index e7d38aa..6b36797 100644 (file)
@@ -45,6 +45,7 @@ struct ceph_mds_reply_info_in {
        u32 inline_len;
        char *inline_data;
        u32 pool_ns_len;
+       char *pool_ns_data;
 };
 
 struct ceph_mds_reply_dir_entry {
@@ -151,7 +152,6 @@ struct ceph_mds_session {
 
        /* protected by mutex */
        struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
-       struct list_head  s_cap_snaps_flushing;
        unsigned long     s_renew_requested; /* last time we sent a renew req */
        u64               s_renew_seq;
 
@@ -275,8 +275,10 @@ struct ceph_mds_request {
 
 struct ceph_pool_perm {
        struct rb_node node;
-       u32 pool;
        int perm;
+       s64 pool;
+       size_t pool_ns_len;
+       char pool_ns[];
 };
 
 /*
@@ -290,6 +292,7 @@ struct ceph_mds_client {
        struct completion       safe_umount_waiters;
        wait_queue_head_t       session_close_wq;
        struct list_head        waiting_for_map;
+       int                     mdsmap_err;
 
        struct ceph_mds_session **sessions;    /* NULL for mds if no session */
        atomic_t                num_sessions;
@@ -321,7 +324,7 @@ struct ceph_mds_client {
        spinlock_t       snap_flush_lock;
 
        u64               last_cap_flush_tid;
-       struct rb_root    cap_flush_tree;
+       struct list_head  cap_flush_list;
        struct list_head  cap_dirty;        /* inodes with dirty caps */
        struct list_head  cap_dirty_migrating; /* ...that are migration... */
        int               num_cap_flushing; /* # caps we are flushing */
@@ -382,10 +385,6 @@ extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
-                                   struct inode *inode,
-                                   struct dentry *dn);
-
 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
 extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
                                           struct inode *dir);
@@ -420,8 +419,10 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
                                     struct dentry *dentry, char action,
                                     u32 seq);
 
-extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
-                                struct ceph_msg *msg);
+extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
+                                   struct ceph_msg *msg);
+extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
+                                  struct ceph_msg *msg);
 
 extern struct ceph_mds_session *
 ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
index 9caaa7f..9ff5219 100644 (file)
@@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
        ihold(inode);
 
        atomic_set(&capsnap->nref, 1);
-       capsnap->ci = ci;
        INIT_LIST_HEAD(&capsnap->ci_item);
-       INIT_LIST_HEAD(&capsnap->flushing_item);
 
        capsnap->follows = old_snapc->seq;
        capsnap->issued = __ceph_caps_issued(ci, NULL);
@@ -551,7 +549,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
        ci->i_wrbuffer_ref_head = 0;
        capsnap->context = old_snapc;
        list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
-       old_snapc = NULL;
 
        if (used & CEPH_CAP_FILE_WR) {
                dout("queue_cap_snap %p cap_snap %p snapc %p"
@@ -563,6 +560,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
                __ceph_finish_cap_snap(ci, capsnap);
        }
        capsnap = NULL;
+       old_snapc = NULL;
 
 update_snapc:
        if (ci->i_head_snapc) {
@@ -603,6 +601,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
                     capsnap->dirty_pages);
                return 0;
        }
+
+       ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
        dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
             inode, capsnap, capsnap->context,
             capsnap->context->seq, ceph_cap_string(capsnap->dirty),
@@ -799,9 +799,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
                inode = &ci->vfs_inode;
                ihold(inode);
                spin_unlock(&mdsc->snap_flush_lock);
-               spin_lock(&ci->i_ceph_lock);
-               __ceph_flush_snaps(ci, &session, 0);
-               spin_unlock(&ci->i_ceph_lock);
+               ceph_flush_snaps(ci, &session);
                iput(inode);
                spin_lock(&mdsc->snap_flush_lock);
        }
index 91e0248..e247f6f 100644 (file)
@@ -108,7 +108,6 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
  * mount options
  */
 enum {
-       Opt_mds_namespace,
        Opt_wsize,
        Opt_rsize,
        Opt_rasize,
@@ -121,6 +120,7 @@ enum {
        Opt_last_int,
        /* int args above */
        Opt_snapdirname,
+       Opt_mds_namespace,
        Opt_last_string,
        /* string args above */
        Opt_dirstat,
@@ -144,7 +144,6 @@ enum {
 };
 
 static match_table_t fsopt_tokens = {
-       {Opt_mds_namespace, "mds_namespace=%d"},
        {Opt_wsize, "wsize=%d"},
        {Opt_rsize, "rsize=%d"},
        {Opt_rasize, "rasize=%d"},
@@ -156,6 +155,7 @@ static match_table_t fsopt_tokens = {
        {Opt_congestion_kb, "write_congestion_kb=%d"},
        /* int args above */
        {Opt_snapdirname, "snapdirname=%s"},
+       {Opt_mds_namespace, "mds_namespace=%s"},
        /* string args above */
        {Opt_dirstat, "dirstat"},
        {Opt_nodirstat, "nodirstat"},
@@ -212,11 +212,14 @@ static int parse_fsopt_token(char *c, void *private)
                if (!fsopt->snapdir_name)
                        return -ENOMEM;
                break;
-
-               /* misc */
        case Opt_mds_namespace:
-               fsopt->mds_namespace = intval;
+               fsopt->mds_namespace = kstrndup(argstr[0].from,
+                                               argstr[0].to-argstr[0].from,
+                                               GFP_KERNEL);
+               if (!fsopt->mds_namespace)
+                       return -ENOMEM;
                break;
+               /* misc */
        case Opt_wsize:
                fsopt->wsize = intval;
                break;
@@ -302,6 +305,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
 {
        dout("destroy_mount_options %p\n", args);
        kfree(args->snapdir_name);
+       kfree(args->mds_namespace);
        kfree(args->server_path);
        kfree(args);
 }
@@ -331,6 +335,9 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
                return ret;
 
        ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+       if (ret)
+               return ret;
+       ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
        if (ret)
                return ret;
 
@@ -376,7 +383,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
        fsopt->congestion_kb = default_congestion_kb();
-       fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
 
        /*
         * Distinguish the server list from the path in "dev_name".
@@ -469,8 +475,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
                seq_puts(m, ",noacl");
 #endif
 
-       if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
-               seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
+       if (fsopt->mds_namespace)
+               seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace);
        if (fsopt->wsize)
                seq_printf(m, ",wsize=%d", fsopt->wsize);
        if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -509,9 +515,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 
        switch (type) {
        case CEPH_MSG_MDS_MAP:
-               ceph_mdsc_handle_map(fsc->mdsc, msg);
+               ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
+               return 0;
+       case CEPH_MSG_FS_MAP_USER:
+               ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
                return 0;
-
        default:
                return -1;
        }
@@ -543,8 +551,14 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
                goto fail;
        }
        fsc->client->extra_mon_dispatch = extra_mon_dispatch;
-       fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
-       ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
+
+       if (fsopt->mds_namespace == NULL) {
+               ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+                                  0, true);
+       } else {
+               ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
+                                  0, false);
+       }
 
        fsc->mount_options = fsopt;
 
@@ -672,8 +686,8 @@ static int __init init_caches(void)
        if (ceph_dentry_cachep == NULL)
                goto bad_dentry;
 
-       ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+       ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
+
        if (ceph_file_cachep == NULL)
                goto bad_file;
 
@@ -731,6 +745,7 @@ static const struct super_operations ceph_super_ops = {
        .destroy_inode  = ceph_destroy_inode,
        .write_inode    = ceph_write_inode,
        .drop_inode     = ceph_drop_inode,
+       .evict_inode    = ceph_evict_inode,
        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
        .show_options   = ceph_show_options,
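
With mds_namespace now a string option, the parser above duplicates the matched substring with kstrndup(), frees it in destroy_mount_options(), and compares it in compare_mount_options() via strcmp_null(). The following is a minimal sketch of the duplication step under those assumptions; dup_string_opt() is a hypothetical helper, not a function from the patch.

#include <linux/parser.h>
#include <linux/slab.h>
#include <linux/string.h>

/*
 * Hedged sketch: copy a matched mount-option substring, returning
 * -ENOMEM on allocation failure, mirroring the Opt_mds_namespace case.
 */
static int dup_string_opt(const substring_t *arg, char **dst)
{
        *dst = kstrndup(arg->from, arg->to - arg->from, GFP_KERNEL);
        return *dst ? 0 : -ENOMEM;
}
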
index 0168b49..3e3fa91 100644 (file)
@@ -62,7 +62,6 @@ struct ceph_mount_options {
        int cap_release_safety;
        int max_readdir;       /* max readdir result (entries) */
        int max_readdir_bytes; /* max readdir result (bytes) */
-       int mds_namespace;
 
        /*
         * everything above this point can be memcmp'd; everything below
@@ -70,6 +69,7 @@ struct ceph_mount_options {
         */
 
        char *snapdir_name;   /* default ".snap" */
+       char *mds_namespace;  /* default NULL */
        char *server_path;    /* default  "/" */
 };
 
@@ -147,6 +147,14 @@ struct ceph_cap {
 #define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
 #define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
 
+struct ceph_cap_flush {
+       u64 tid;
+       int caps; /* 0 means capsnap */
+       bool wake; /* wake up flush waiters when finished? */
+       struct list_head g_list; // global
+       struct list_head i_list; // per inode
+};
+
 /*
  * Snapped cap state that is pending flush to mds.  When a snapshot occurs,
  * we first complete any in-process sync writes and writeback any dirty
@@ -154,10 +162,11 @@ struct ceph_cap {
  */
 struct ceph_cap_snap {
        atomic_t nref;
-       struct ceph_inode_info *ci;
-       struct list_head ci_item, flushing_item;
+       struct list_head ci_item;
+
+       struct ceph_cap_flush cap_flush;
 
-       u64 follows, flush_tid;
+       u64 follows;
        int issued, dirty;
        struct ceph_snap_context *context;
 
@@ -186,16 +195,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
        }
 }
 
-struct ceph_cap_flush {
-       u64 tid;
-       int caps;
-       struct rb_node g_node; // global
-       union {
-               struct rb_node i_node; // inode
-               struct list_head list;
-       };
-};
-
 /*
  * The frag tree describes how a directory is fragmented, potentially across
  * multiple metadata servers.  It is also used to indicate points where
@@ -246,7 +245,7 @@ struct ceph_dentry_info {
        unsigned long lease_renew_after, lease_renew_from;
        struct list_head lru;
        struct dentry *dentry;
-       u64 time;
+       unsigned long time;
        u64 offset;
 };
 
@@ -287,7 +286,6 @@ struct ceph_inode_info {
 
        struct ceph_dir_layout i_dir_layout;
        struct ceph_file_layout i_layout;
-       size_t i_pool_ns_len;
        char *i_symlink;
 
        /* for dirs */
@@ -311,7 +309,7 @@ struct ceph_inode_info {
         * overlapping, pipelined cap flushes to the mds.  we can probably
         * reduce the tid to 8 bits if we're concerned about inode size. */
        struct ceph_cap_flush *i_prealloc_cap_flush;
-       struct rb_root i_cap_flush_tree;
+       struct list_head i_cap_flush_list;
        wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
        unsigned long i_hold_caps_min; /* jiffies */
        unsigned long i_hold_caps_max; /* jiffies */
@@ -322,7 +320,7 @@ struct ceph_inode_info {
                                                    dirty|flushing caps */
        unsigned i_snap_caps;           /* cap bits for snapped files */
 
-       int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
+       int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
 
        struct mutex i_truncate_mutex;
        u32 i_truncate_seq;        /* last truncate to smaller size */
@@ -471,6 +469,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_POOL_WR         (1 << 6)  /* can write to pool */
 #define CEPH_I_SEC_INITED      (1 << 7)  /* security initialized */
 #define CEPH_I_CAP_DROPPED     (1 << 8)  /* caps were forcibly dropped */
+#define CEPH_I_KICK_FLUSH      (1 << 9)  /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS     (1 << 10) /* need flush snaps */
 
 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
                                           long long release_count,
@@ -750,6 +750,7 @@ extern const struct inode_operations ceph_file_iops;
 extern struct inode *ceph_alloc_inode(struct super_block *sb);
 extern void ceph_destroy_inode(struct inode *inode);
 extern int ceph_drop_inode(struct inode *inode);
+extern void ceph_evict_inode(struct inode *inode);
 
 extern struct inode *ceph_get_inode(struct super_block *sb,
                                    struct ceph_vino vino);
@@ -890,9 +891,8 @@ extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                                       struct ceph_snap_context *snapc);
-extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
-                              struct ceph_mds_session **psession,
-                              int again);
+extern void ceph_flush_snaps(struct ceph_inode_info *ci,
+                            struct ceph_mds_session **psession);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                            struct ceph_mds_session *session);
 extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -907,10 +907,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                         loff_t endoff, int *got, struct page **pinned_page);
 
 /* for counting open files by mode */
-static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
-{
-       ci->i_nr_by_mode[mode]++;
-}
+extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
 extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
 
 /* addr.c */
@@ -931,6 +928,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 extern int ceph_release(struct inode *inode, struct file *filp);
 extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
                                  char *data, size_t len);
+extern void ceph_sync_write_wait(struct inode *inode);
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
 extern const struct file_operations ceph_snapdir_fops;
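
The header change above replaces the per-inode rb-tree of pending cap flushes (i_cap_flush_tree) with a plain list (i_cap_flush_list) linked through ceph_cap_flush.i_list. A hedged sketch of what a lookup over that list could look like under i_ceph_lock follows; inode_has_pending_flush() is a hypothetical helper used only for illustration.

#include <linux/list.h>
#include <linux/spinlock.h>

/*
 * Hedged sketch, not code from the patch: walk the inode's pending cap
 * flushes (now a list rather than an rb-tree) looking for a given tid.
 */
static bool inode_has_pending_flush(struct ceph_inode_info *ci, u64 tid)
{
        struct ceph_cap_flush *cf;
        bool found = false;

        spin_lock(&ci->i_ceph_lock);
        list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
                if (cf->tid == tid) {
                        found = true;
                        break;
                }
        }
        spin_unlock(&ci->i_ceph_lock);
        return found;
}
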
index 4870b29..adc2318 100644 (file)
@@ -57,81 +57,88 @@ struct ceph_vxattr {
 
 static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
 {
-       size_t s;
-       char *p = (char *)&ci->i_layout;
-
-       for (s = 0; s < sizeof(ci->i_layout); s++, p++)
-               if (*p)
-                       return true;
-       return false;
+       struct ceph_file_layout *fl = &ci->i_layout;
+       return (fl->stripe_unit > 0 || fl->stripe_count > 0 ||
+               fl->object_size > 0 || fl->pool_id >= 0 ||
+               rcu_dereference_raw(fl->pool_ns) != NULL);
 }
 
 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
                                   size_t size)
 {
-       int ret;
        struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
        struct ceph_osd_client *osdc = &fsc->client->osdc;
-       s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+       struct ceph_string *pool_ns;
+       s64 pool = ci->i_layout.pool_id;
        const char *pool_name;
+       const char *ns_field = " pool_namespace=";
        char buf[128];
+       size_t len, total_len = 0;
+       int ret;
+
+       pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
 
        dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
        down_read(&osdc->lock);
        pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
        if (pool_name) {
-               size_t len = strlen(pool_name);
-               ret = snprintf(buf, sizeof(buf),
-               "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
-               (unsigned long long)ceph_file_layout_su(ci->i_layout),
-               (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
-               (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-               if (!size) {
-                       ret += len;
-               } else if (ret + len > size) {
-                       ret = -ERANGE;
-               } else {
-                       memcpy(val, buf, ret);
+               len = snprintf(buf, sizeof(buf),
+               "stripe_unit=%u stripe_count=%u object_size=%u pool=",
+               ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+               ci->i_layout.object_size);
+               total_len = len + strlen(pool_name);
+       } else {
+               len = snprintf(buf, sizeof(buf),
+               "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
+               ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+               ci->i_layout.object_size, (unsigned long long)pool);
+               total_len = len;
+       }
+
+       if (pool_ns)
+               total_len += strlen(ns_field) + pool_ns->len;
+
+       if (!size) {
+               ret = total_len;
+       } else if (total_len > size) {
+               ret = -ERANGE;
+       } else {
+               memcpy(val, buf, len);
+               ret = len;
+               if (pool_name) {
+                       len = strlen(pool_name);
                        memcpy(val + ret, pool_name, len);
                        ret += len;
                }
-       } else {
-               ret = snprintf(buf, sizeof(buf),
-               "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
-               (unsigned long long)ceph_file_layout_su(ci->i_layout),
-               (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
-               (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
-               (unsigned long long)pool);
-               if (size) {
-                       if (ret <= size)
-                               memcpy(val, buf, ret);
-                       else
-                               ret = -ERANGE;
+               if (pool_ns) {
+                       len = strlen(ns_field);
+                       memcpy(val + ret, ns_field, len);
+                       ret += len;
+                       memcpy(val + ret, pool_ns->str, pool_ns->len);
+                       ret += pool_ns->len;
                }
        }
        up_read(&osdc->lock);
+       ceph_put_string(pool_ns);
        return ret;
 }
 
 static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
                                               char *val, size_t size)
 {
-       return snprintf(val, size, "%lld",
-                       (unsigned long long)ceph_file_layout_su(ci->i_layout));
+       return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
 }
 
 static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
                                                char *val, size_t size)
 {
-       return snprintf(val, size, "%lld",
-              (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+       return snprintf(val, size, "%u", ci->i_layout.stripe_count);
 }
 
 static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
                                               char *val, size_t size)
 {
-       return snprintf(val, size, "%lld",
-              (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+       return snprintf(val, size, "%u", ci->i_layout.object_size);
 }
 
 static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
@@ -140,7 +147,7 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
        int ret;
        struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
        struct ceph_osd_client *osdc = &fsc->client->osdc;
-       s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+       s64 pool = ci->i_layout.pool_id;
        const char *pool_name;
 
        down_read(&osdc->lock);
@@ -153,6 +160,18 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
        return ret;
 }
 
+static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+                                                 char *val, size_t size)
+{
+       int ret = 0;
+       struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+       if (ns) {
+               ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+               ceph_put_string(ns);
+       }
+       return ret;
+}
+
 /* directories */
 
 static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -241,6 +260,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
        XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
        XATTR_LAYOUT_FIELD(dir, layout, object_size),
        XATTR_LAYOUT_FIELD(dir, layout, pool),
+       XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
        XATTR_NAME_CEPH(dir, entries),
        XATTR_NAME_CEPH(dir, files),
        XATTR_NAME_CEPH(dir, subdirs),
@@ -268,6 +288,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
        XATTR_LAYOUT_FIELD(file, layout, stripe_count),
        XATTR_LAYOUT_FIELD(file, layout, object_size),
        XATTR_LAYOUT_FIELD(file, layout, pool),
+       XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
        { .name = NULL, 0 }     /* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;     /* total size of all names */
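
The rewritten ceph_vxattrcb_layout() above follows the usual getxattr size convention: a zero-sized buffer only reports the required length, an undersized buffer yields -ERANGE, and otherwise the value is copied. Here is a minimal sketch of that convention in isolation; emit_value() is a hypothetical helper, not code from the patch.

#include <linux/errno.h>
#include <linux/string.h>

/*
 * Hedged sketch of the getxattr size-probe convention: size == 0 asks
 * for the needed length, a short buffer is an error, otherwise copy.
 */
static ssize_t emit_value(char *dst, size_t size, const char *src, size_t len)
{
        if (!size)
                return len;             /* caller is probing the length */
        if (len > size)
                return -ERANGE;         /* buffer too small */
        memcpy(dst, src, len);
        return len;
}
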
index 5dfc4f3..00235bf 100644 (file)
@@ -73,6 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
                }
        }
 
+       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
        ret = 1;
 out_release_op:
        op_release(new_op);
@@ -94,6 +95,9 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
        int ret;
 
+       if (time_before(jiffies, dentry->d_time))
+               return 1;
+
        if (flags & LOOKUP_RCU)
                return -ECHILD;
 
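
The revalidate path above stamps dentry->d_time with an expiry computed as dcache_timeout_msecs*HZ/1000 jiffies in the future. As an illustration only, the same conversion can be expressed with msecs_to_jiffies(); expiry_from_now() is a hypothetical helper and not what the patch uses.

#include <linux/jiffies.h>

/*
 * Hedged sketch: compute an expiry stamp a given number of milliseconds
 * from now, equivalent to the open-coded msecs*HZ/1000 in the hunk above.
 */
static inline unsigned long expiry_from_now(unsigned int timeout_msecs)
{
        return jiffies + msecs_to_jiffies(timeout_msecs);
}
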
index 2e63e6d..28a0557 100644 (file)
@@ -262,7 +262,7 @@ int orangefs_getattr(struct vfsmount *mnt,
                     "orangefs_getattr: called on %s\n",
                     dentry->d_name.name);
 
-       ret = orangefs_inode_getattr(inode, 0, 1);
+       ret = orangefs_inode_getattr(inode, 0, 0);
        if (ret == 0) {
                generic_fillattr(inode, kstat);
 
@@ -384,7 +384,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
        if (!inode || !(inode->i_state & I_NEW))
                return inode;
 
-       error = orangefs_inode_getattr(inode, 1, 0);
+       error = orangefs_inode_getattr(inode, 1, 1);
        if (error) {
                iget_failed(inode);
                return ERR_PTR(error);
@@ -429,7 +429,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
        orangefs_set_inode(inode, ref);
        inode->i_ino = hash;    /* needed for stat etc */
 
-       error = orangefs_inode_getattr(inode, 1, 0);
+       error = orangefs_inode_getattr(inode, 1, 1);
        if (error)
                goto out_iput;
 
index 7e8dfa9..62c5259 100644 (file)
@@ -72,6 +72,8 @@ static int orangefs_create(struct inode *dir,
 
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
+       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+       ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
        gossip_debug(GOSSIP_NAME_DEBUG,
                     "%s: dentry instantiated for %s\n",
@@ -181,6 +183,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
                goto out;
        }
 
+       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+
        inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
        if (IS_ERR(inode)) {
                gossip_debug(GOSSIP_NAME_DEBUG,
@@ -189,6 +193,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
                goto out;
        }
 
+       ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+
        gossip_debug(GOSSIP_NAME_DEBUG,
                     "%s:%s:%d "
                     "Found good inode [%lu] with count [%d]\n",
@@ -316,6 +322,8 @@ static int orangefs_symlink(struct inode *dir,
 
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
+       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+       ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
        gossip_debug(GOSSIP_NAME_DEBUG,
                     "Inode (Symlink) %pU -> %s\n",
@@ -378,6 +386,8 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
+       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+       ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
        gossip_debug(GOSSIP_NAME_DEBUG,
                     "Inode (Directory) %pU -> %s\n",
@@ -408,6 +418,8 @@ static int orangefs_rename(struct inode *old_dir,
                     "orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
                     old_dentry, new_dentry, d_count(new_dentry));
 
+       ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1;
+
        new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
        if (!new_op)
                return -EINVAL;
index 4b6e132..633c07a 100644 (file)
@@ -246,6 +246,8 @@ struct orangefs_inode_s {
         * with this object
         */
        unsigned long pinode_flags;
+
+       unsigned long getattr_time;
 };
 
 #define P_ATIME_FLAG 0
@@ -527,7 +529,7 @@ int orangefs_inode_setxattr(struct inode *inode,
                         size_t size,
                         int flags);
 
-int orangefs_inode_getattr(struct inode *inode, int new, int size);
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
 
 int orangefs_inode_check_changed(struct inode *inode);
 
@@ -546,6 +548,8 @@ extern struct mutex request_mutex;
 extern int debug;
 extern int op_timeout_secs;
 extern int slot_timeout_secs;
+extern int dcache_timeout_msecs;
+extern int getattr_timeout_msecs;
 extern struct list_head orangefs_superblocks;
 extern spinlock_t orangefs_superblocks_lock;
 extern struct list_head orangefs_request_list;
index 6f072a8..e9fd575 100644 (file)
@@ -47,6 +47,8 @@ struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
 unsigned int kernel_mask_set_mod_init; /* implicitly false */
 int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
 int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+int dcache_timeout_msecs = 50;
+int getattr_timeout_msecs = 50;
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("ORANGEFS Development Team");
index 5c03113..375708c 100644 (file)
  *                     Slots are requested and waited for,
  *                     the wait times out after slot_timeout_secs.
  *
+ * What:               /sys/fs/orangefs/dcache_timeout_msecs
+ * Date:               Jul 2016
+ * Contact:            Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *                     Time, in milliseconds, that a lookup result remains valid.
+ *
+ * What:               /sys/fs/orangefs/getattr_timeout_msecs
+ * Date:               Jul 2016
+ * Contact:            Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *                     Time, in milliseconds, that a getattr result remains valid.
  *
  * What:               /sys/fs/orangefs/acache/...
  * Date:               Jun 2015
- * Contact:            Mike Marshall <hubcap@omnibond.com>
+ * Contact:            Martin Brandenburg <martin@omnibond.com>
  * Description:
  *                     Attribute cache configurable settings.
  *
@@ -117,6 +128,8 @@ struct orangefs_obj {
        int perf_history_size;
        int perf_time_interval_secs;
        int slot_timeout_secs;
+       int dcache_timeout_msecs;
+       int getattr_timeout_msecs;
 };
 
 struct acache_orangefs_obj {
@@ -658,6 +671,20 @@ static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
                                       "%d\n",
                                       slot_timeout_secs);
                        goto out;
+               } else if (!strcmp(orangefs_attr->attr.name,
+                                  "dcache_timeout_msecs")) {
+                       rc = scnprintf(buf,
+                                      PAGE_SIZE,
+                                      "%d\n",
+                                      dcache_timeout_msecs);
+                       goto out;
+               } else if (!strcmp(orangefs_attr->attr.name,
+                                  "getattr_timeout_msecs")) {
+                       rc = scnprintf(buf,
+                                      PAGE_SIZE,
+                                      "%d\n",
+                                      getattr_timeout_msecs);
+                       goto out;
                } else {
                        goto out;
                }
@@ -734,6 +761,12 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj,
        } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
                rc = kstrtoint(buf, 0, &slot_timeout_secs);
                goto out;
+       } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
+               rc = kstrtoint(buf, 0, &dcache_timeout_msecs);
+               goto out;
+       } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) {
+               rc = kstrtoint(buf, 0, &getattr_timeout_msecs);
+               goto out;
        } else {
                goto out;
        }
@@ -1361,6 +1394,12 @@ static struct orangefs_attribute op_timeout_secs_attribute =
 static struct orangefs_attribute slot_timeout_secs_attribute =
        __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
 
+static struct orangefs_attribute dcache_timeout_msecs_attribute =
+       __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store);
+
+static struct orangefs_attribute getattr_timeout_msecs_attribute =
+       __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
+
 static struct orangefs_attribute perf_counter_reset_attribute =
        __ATTR(perf_counter_reset,
               0664,
@@ -1382,6 +1421,8 @@ static struct orangefs_attribute perf_time_interval_secs_attribute =
 static struct attribute *orangefs_default_attrs[] = {
        &op_timeout_secs_attribute.attr,
        &slot_timeout_secs_attribute.attr,
+       &dcache_timeout_msecs_attribute.attr,
+       &getattr_timeout_msecs_attribute.attr,
        &perf_counter_reset_attribute.attr,
        &perf_history_size_attribute.attr,
        &perf_time_interval_secs_attribute.attr,
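
Each new tunable above is exposed through an __ATTR() entry wired to int_orangefs_show/int_store, with kstrtoint() parsing the written value. A hedged sketch of that store path in isolation follows; example_msecs and example_store are hypothetical names, not symbols from the patch.

#include <linux/kernel.h>       /* kstrtoint() */
#include <linux/types.h>

/*
 * Hedged sketch: parse a sysfs write into an int and report the number
 * of bytes consumed on success, as sysfs store handlers conventionally do.
 */
static int example_msecs;

static ssize_t example_store(const char *buf, size_t count)
{
        int rc = kstrtoint(buf, 0, &example_msecs);

        return rc ? rc : count;
}
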
index c5fbc62..d13c729 100644 (file)
@@ -251,7 +251,7 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
        return 0;
 }
 
-int orangefs_inode_getattr(struct inode *inode, int new, int size)
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
 {
        struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
        struct orangefs_kernel_op_s *new_op;
@@ -261,12 +261,16 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
        gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
            get_khandle_from_ino(inode));
 
+       if (!new && !bypass) {
+               if (time_before(jiffies, orangefs_inode->getattr_time))
+                       return 0;
+       }
+
        new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
        if (!new_op)
                return -ENOMEM;
        new_op->upcall.req.getattr.refn = orangefs_inode->refn;
-       new_op->upcall.req.getattr.mask = size ?
-           ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
+       new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
 
        ret = service_operation(new_op, __func__,
            get_interruptible_flag(inode));
@@ -287,20 +291,18 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
        case S_IFREG:
                inode->i_flags = orangefs_inode_flags(&new_op->
                    downcall.resp.getattr.attributes);
-               if (size) {
-                       inode_size = (loff_t)new_op->
-                           downcall.resp.getattr.attributes.size;
-                       rounded_up_size =
-                           (inode_size + (4096 - (inode_size % 4096)));
-                       inode->i_size = inode_size;
-                       orangefs_inode->blksize =
-                           new_op->downcall.resp.getattr.attributes.blksize;
-                       spin_lock(&inode->i_lock);
-                       inode->i_bytes = inode_size;
-                       inode->i_blocks =
-                           (unsigned long)(rounded_up_size / 512);
-                       spin_unlock(&inode->i_lock);
-               }
+               inode_size = (loff_t)new_op->
+                   downcall.resp.getattr.attributes.size;
+               rounded_up_size =
+                   (inode_size + (4096 - (inode_size % 4096)));
+               inode->i_size = inode_size;
+               orangefs_inode->blksize =
+                   new_op->downcall.resp.getattr.attributes.blksize;
+               spin_lock(&inode->i_lock);
+               inode->i_bytes = inode_size;
+               inode->i_blocks =
+                   (unsigned long)(rounded_up_size / 512);
+               spin_unlock(&inode->i_lock);
                break;
        case S_IFDIR:
                inode->i_size = PAGE_SIZE;
@@ -345,6 +347,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
        inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
            orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
 
+       orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000;
        ret = 0;
 out:
        op_release(new_op);
@@ -418,6 +421,7 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
                ClearMtimeFlag(orangefs_inode);
                ClearCtimeFlag(orangefs_inode);
                ClearModeFlag(orangefs_inode);
+               orangefs_inode->getattr_time = jiffies - 1;
        }
 
        return ret;
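
The getattr path above skips the server round trip while jiffies is still before getattr_time, and setattr resets the stamp to jiffies - 1 to force the next refresh. Below is a small sketch of the wraparound-safe comparison, assuming only the jiffies API; attrs_still_fresh() is a hypothetical helper.

#include <linux/jiffies.h>
#include <linux/types.h>

/*
 * Hedged sketch: time_before() compares jiffies values safely across
 * counter wraparound, which is what makes the getattr_time check cheap.
 */
static inline bool attrs_still_fresh(unsigned long valid_until)
{
        return time_before(jiffies, valid_until);
}
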
index 1efc6f8..3d7418c 100644 (file)
@@ -207,14 +207,6 @@ typedef __s64 ORANGEFS_offset;
         ORANGEFS_ATTR_SYS_DIRENT_COUNT         |       \
         ORANGEFS_ATTR_SYS_BLKSIZE)
 
-#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE            \
-       (ORANGEFS_ATTR_SYS_COMMON_ALL           |       \
-        ORANGEFS_ATTR_SYS_LNK_TARGET           |       \
-        ORANGEFS_ATTR_SYS_DFILE_COUNT          |       \
-        ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT  |       \
-        ORANGEFS_ATTR_SYS_DIRENT_COUNT         |       \
-        ORANGEFS_ATTR_SYS_BLKSIZE)
-
 #define ORANGEFS_XATTR_REPLACE 0x2
 #define ORANGEFS_XATTR_CREATE  0x1
 #define ORANGEFS_MAX_SERVER_ADDR_LEN 256
index 54643d1..2456397 100644 (file)
 
 #define ___OF_TABLE(cfg, name) _OF_TABLE_##cfg(name)
 #define __OF_TABLE(cfg, name)  ___OF_TABLE(cfg, name)
-#define OF_TABLE(cfg, name)    __OF_TABLE(config_enabled(cfg), name)
+#define OF_TABLE(cfg, name)    __OF_TABLE(IS_ENABLED(cfg), name)
 #define _OF_TABLE_0(name)
 #define _OF_TABLE_1(name)                                              \
        . = ALIGN(8);                                                   \
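
The linker-script helper above switches from the internal config_enabled() macro to IS_ENABLED(), which evaluates to 1 when the Kconfig symbol is built-in or modular and 0 otherwise. A minimal illustration of the macro in ordinary use follows (CONFIG_OF is chosen only as an example symbol).

#include <linux/kconfig.h>

/*
 * Hedged illustration, unrelated to the linker script itself: IS_ENABLED()
 * can be used in both preprocessor and C conditionals.
 */
#if IS_ENABLED(CONFIG_OF)
static const bool have_of = true;
#else
static const bool have_of = false;
#endif
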
index 72dee12..63b8bd5 100644 (file)
@@ -657,6 +657,8 @@ struct edp_vsc_psr {
 #define EDP_VSC_PSR_UPDATE_RFB         (1<<1)
 #define EDP_VSC_PSR_CRC_VALUES_VALID   (1<<2)
 
+int drm_dp_psr_setup_time(const u8 psr_cap[EDP_PSR_RECEIVER_CAP_SIZE]);
+
 static inline int
 drm_dp_max_link_rate(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
 {
index da0a524..540da51 100644 (file)
@@ -1,6 +1,5 @@
 /*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
+ * Copyright (C) 2015, 2016 ARM Ltd.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-
-#ifndef __ASM_ARM_KVM_VGIC_H
-#define __ASM_ARM_KVM_VGIC_H
-
-#ifdef CONFIG_KVM_NEW_VGIC
-#include <kvm/vgic/vgic.h>
-#else
+#ifndef __KVM_ARM_VGIC_H
+#define __KVM_ARM_VGIC_H
 
 #include <linux/kernel.h>
 #include <linux/kvm.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
+#include <linux/list.h>
 
-#define VGIC_NR_IRQS_LEGACY    256
+#define VGIC_V3_MAX_CPUS       255
+#define VGIC_V2_MAX_CPUS       8
+#define VGIC_NR_IRQS_LEGACY     256
 #define VGIC_NR_SGIS           16
 #define VGIC_NR_PPIS           16
 #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
+#define VGIC_MAX_PRIVATE       (VGIC_NR_PRIVATE_IRQS - 1)
+#define VGIC_MAX_SPI           1019
+#define VGIC_MAX_RESERVED      1023
+#define VGIC_MIN_LPI           8192
 
-#define VGIC_V2_MAX_LRS                (1 << 6)
-#define VGIC_V3_MAX_LRS                16
-#define VGIC_MAX_IRQS          1024
-#define VGIC_V2_MAX_CPUS       8
-#define VGIC_V3_MAX_CPUS       255
+enum vgic_type {
+       VGIC_V2,                /* Good ol' GICv2 */
+       VGIC_V3,                /* New fancy GICv3 */
+};
 
-#if (VGIC_NR_IRQS_LEGACY & 31)
-#error "VGIC_NR_IRQS must be a multiple of 32"
-#endif
+/* same for all guests, as it depends only on the _host's_ GIC model */
+struct vgic_global {
+       /* type of the host GIC */
+       enum vgic_type          type;
 
-#if (VGIC_NR_IRQS_LEGACY > VGIC_MAX_IRQS)
-#error "VGIC_NR_IRQS must be <= 1024"
-#endif
+       /* Physical address of vgic virtual cpu interface */
+       phys_addr_t             vcpu_base;
 
-/*
- * The GIC distributor registers describing interrupts have two parts:
- * - 32 per-CPU interrupts (SGI + PPI)
- * - a bunch of shared interrupts (SPI)
- */
-struct vgic_bitmap {
-       /*
-        * - One UL per VCPU for private interrupts (assumes UL is at
-        *   least 32 bits)
-        * - As many UL as necessary for shared interrupts.
-        *
-        * The private interrupts are accessed via the "private"
-        * field, one UL per vcpu (the state for vcpu n is in
-        * private[n]). The shared interrupts are accessed via the
-        * "shared" pointer (IRQn state is at bit n-32 in the bitmap).
-        */
-       unsigned long *private;
-       unsigned long *shared;
-};
+       /* virtual control interface mapping */
+       void __iomem            *vctrl_base;
 
-struct vgic_bytemap {
-       /*
-        * - 8 u32 per VCPU for private interrupts
-        * - As many u32 as necessary for shared interrupts.
-        *
-        * The private interrupts are accessed via the "private"
-        * field, (the state for vcpu n is in private[n*8] to
-        * private[n*8 + 7]). The shared interrupts are accessed via
-        * the "shared" pointer (IRQn state is at byte (n-32)%4 of the
-        * shared[(n-32)/4] word).
-        */
-       u32 *private;
-       u32 *shared;
-};
+       /* Number of implemented list registers */
+       int                     nr_lr;
 
-struct kvm_vcpu;
+       /* Maintenance IRQ number */
+       unsigned int            maint_irq;
 
-enum vgic_type {
-       VGIC_V2,                /* Good ol' GICv2 */
-       VGIC_V3,                /* New fancy GICv3 */
+       /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
+       int                     max_gic_vcpus;
+
+       /* Only needed for the legacy KVM_CREATE_IRQCHIP */
+       bool                    can_emulate_gicv2;
 };
 
-#define LR_STATE_PENDING       (1 << 0)
-#define LR_STATE_ACTIVE                (1 << 1)
-#define LR_STATE_MASK          (3 << 0)
-#define LR_EOI_INT             (1 << 2)
-#define LR_HW                  (1 << 3)
+extern struct vgic_global kvm_vgic_global_state;
 
-struct vgic_lr {
-       unsigned irq:10;
-       union {
-               unsigned hwirq:10;
-               unsigned source:3;
-       };
-       unsigned state:4;
-};
+#define VGIC_V2_MAX_LRS                (1 << 6)
+#define VGIC_V3_MAX_LRS                16
+#define VGIC_V3_LR_INDEX(lr)   (VGIC_V3_MAX_LRS - 1 - lr)
 
-struct vgic_vmcr {
-       u32     ctlr;
-       u32     abpr;
-       u32     bpr;
-       u32     pmr;
+enum vgic_irq_config {
+       VGIC_CONFIG_EDGE = 0,
+       VGIC_CONFIG_LEVEL
 };
 
-struct vgic_ops {
-       struct vgic_lr  (*get_lr)(const struct kvm_vcpu *, int);
-       void    (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
-       u64     (*get_elrsr)(const struct kvm_vcpu *vcpu);
-       u64     (*get_eisr)(const struct kvm_vcpu *vcpu);
-       void    (*clear_eisr)(struct kvm_vcpu *vcpu);
-       u32     (*get_interrupt_status)(const struct kvm_vcpu *vcpu);
-       void    (*enable_underflow)(struct kvm_vcpu *vcpu);
-       void    (*disable_underflow)(struct kvm_vcpu *vcpu);
-       void    (*get_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-       void    (*set_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-       void    (*enable)(struct kvm_vcpu *vcpu);
+struct vgic_irq {
+       spinlock_t irq_lock;            /* Protects the content of the struct */
+       struct list_head lpi_list;      /* Used to link all LPIs together */
+       struct list_head ap_list;
+
+       struct kvm_vcpu *vcpu;          /* SGIs and PPIs: The VCPU
+                                        * SPIs and LPIs: The VCPU whose ap_list
+                                        * this is queued on.
+                                        */
+
+       struct kvm_vcpu *target_vcpu;   /* The VCPU that this interrupt should
+                                        * be sent to, as a result of the
+                                        * targets reg (v2) or the
+                                        * affinity reg (v3).
+                                        */
+
+       u32 intid;                      /* Guest visible INTID */
+       bool pending;
+       bool line_level;                /* Level only */
+       bool soft_pending;              /* Level only */
+       bool active;                    /* not used for LPIs */
+       bool enabled;
+       bool hw;                        /* Tied to HW IRQ */
+       struct kref refcount;           /* Used for LPIs */
+       u32 hwintid;                    /* HW INTID number */
+       union {
+               u8 targets;                     /* GICv2 target VCPUs mask */
+               u32 mpidr;                      /* GICv3 target VCPU */
+       };
+       u8 source;                      /* GICv2 SGIs only */
+       u8 priority;
+       enum vgic_irq_config config;    /* Level or edge */
 };
 
-struct vgic_params {
-       /* vgic type */
-       enum vgic_type  type;
-       /* Physical address of vgic virtual cpu interface */
-       phys_addr_t     vcpu_base;
-       /* Number of list registers */
-       u32             nr_lr;
-       /* Interrupt number */
-       unsigned int    maint_irq;
-       /* Virtual control interface base address */
-       void __iomem    *vctrl_base;
-       int             max_gic_vcpus;
-       /* Only needed for the legacy KVM_CREATE_IRQCHIP */
-       bool            can_emulate_gicv2;
-};
+struct vgic_register_region;
+struct vgic_its;
 
-struct vgic_vm_ops {
-       bool    (*queue_sgi)(struct kvm_vcpu *, int irq);
-       void    (*add_sgi_source)(struct kvm_vcpu *, int irq, int source);
-       int     (*init_model)(struct kvm *);
-       int     (*map_resources)(struct kvm *, const struct vgic_params *);
+enum iodev_type {
+       IODEV_CPUIF,
+       IODEV_DIST,
+       IODEV_REDIST,
+       IODEV_ITS
 };
 
 struct vgic_io_device {
-       gpa_t addr;
-       int len;
-       const struct vgic_io_range *reg_ranges;
-       struct kvm_vcpu *redist_vcpu;
+       gpa_t base_addr;
+       union {
+               struct kvm_vcpu *redist_vcpu;
+               struct vgic_its *its;
+       };
+       const struct vgic_register_region *regions;
+       enum iodev_type iodev_type;
+       int nr_regions;
        struct kvm_io_device dev;
 };
 
-struct irq_phys_map {
-       u32                     virt_irq;
-       u32                     phys_irq;
-};
-
-struct irq_phys_map_entry {
-       struct list_head        entry;
-       struct rcu_head         rcu;
-       struct irq_phys_map     map;
+struct vgic_its {
+       /* The base address of the ITS control register frame */
+       gpa_t                   vgic_its_base;
+
+       bool                    enabled;
+       bool                    initialized;
+       struct vgic_io_device   iodev;
+       struct kvm_device       *dev;
+
+       /* These registers correspond to GITS_BASER{0,1} */
+       u64                     baser_device_table;
+       u64                     baser_coll_table;
+
+       /* Protects the command queue */
+       struct mutex            cmd_lock;
+       u64                     cbaser;
+       u32                     creadr;
+       u32                     cwriter;
+
+       /* Protects the device and collection lists */
+       struct mutex            its_lock;
+       struct list_head        device_list;
+       struct list_head        collection_list;
 };
 
 struct vgic_dist {
-       spinlock_t              lock;
        bool                    in_kernel;
        bool                    ready;
+       bool                    initialized;
 
        /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
        u32                     vgic_model;
 
-       int                     nr_cpus;
-       int                     nr_irqs;
+       /* Do injected MSIs require an additional device ID? */
+       bool                    msis_require_devid;
+
+       int                     nr_spis;
 
+       /* TODO: Consider moving to global state */
        /* Virtual control interface mapping */
        void __iomem            *vctrl_base;
 
-       /* Distributor and vcpu interface mapping in the guest */
-       phys_addr_t             vgic_dist_base;
-       /* GICv2 and GICv3 use different mapped register blocks */
+       /* base addresses in guest physical address space: */
+       gpa_t                   vgic_dist_base;         /* distributor */
        union {
-               phys_addr_t             vgic_cpu_base;
-               phys_addr_t             vgic_redist_base;
+               /* either a GICv2 CPU interface */
+               gpa_t                   vgic_cpu_base;
+               /* or a number of GICv3 redistributor regions */
+               gpa_t                   vgic_redist_base;
        };
 
-       /* Distributor enabled */
-       u32                     enabled;
-
-       /* Interrupt enabled (one bit per IRQ) */
-       struct vgic_bitmap      irq_enabled;
-
-       /* Level-triggered interrupt external input is asserted */
-       struct vgic_bitmap      irq_level;
-
-       /*
-        * Interrupt state is pending on the distributor
-        */
-       struct vgic_bitmap      irq_pending;
-
-       /*
-        * Tracks writes to GICD_ISPENDRn and GICD_ICPENDRn for level-triggered
-        * interrupts.  Essentially holds the state of the flip-flop in
-        * Figure 4-10 on page 4-101 in ARM IHI 0048B.b.
-        * Once set, it is only cleared for level-triggered interrupts on
-        * guest ACKs (when we queue it) or writes to GICD_ICPENDRn.
-        */
-       struct vgic_bitmap      irq_soft_pend;
-
-       /* Level-triggered interrupt queued on VCPU interface */
-       struct vgic_bitmap      irq_queued;
-
-       /* Interrupt was active when unqueue from VCPU interface */
-       struct vgic_bitmap      irq_active;
-
-       /* Interrupt priority. Not used yet. */
-       struct vgic_bytemap     irq_priority;
+       /* distributor enabled */
+       bool                    enabled;
 
-       /* Level/edge triggered */
-       struct vgic_bitmap      irq_cfg;
+       struct vgic_irq         *spis;
 
-       /*
-        * Source CPU per SGI and target CPU:
-        *
-        * Each byte represent a SGI observable on a VCPU, each bit of
-        * this byte indicating if the corresponding VCPU has
-        * generated this interrupt. This is a GICv2 feature only.
-        *
-        * For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are
-        * the SGIs observable on VCPUn.
-        */
-       u8                      *irq_sgi_sources;
+       struct vgic_io_device   dist_iodev;
 
-       /*
-        * Target CPU for each SPI:
-        *
-        * Array of available SPI, each byte indicating the target
-        * VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32].
-        */
-       u8                      *irq_spi_cpu;
+       bool                    has_its;
 
        /*
-        * Reverse lookup of irq_spi_cpu for faster compute pending:
-        *
-        * Array of bitmaps, one per VCPU, describing if IRQn is
-        * routed to a particular VCPU.
+        * Contains the attributes and gpa of the LPI configuration table.
+        * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share
+        * one address across all redistributors.
+        * GICv3 spec: 6.1.2 "LPI Configuration tables"
         */
-       struct vgic_bitmap      *irq_spi_target;
-
-       /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
-       u32                     *irq_spi_mpidr;
+       u64                     propbaser;
 
-       /* Bitmap indicating which CPU has something pending */
-       unsigned long           *irq_pending_on_cpu;
-
-       /* Bitmap indicating which CPU has active IRQs */
-       unsigned long           *irq_active_on_cpu;
-
-       struct vgic_vm_ops      vm_ops;
-       struct vgic_io_device   dist_iodev;
-       struct vgic_io_device   *redist_iodevs;
-
-       /* Virtual irq to hwirq mapping */
-       spinlock_t              irq_phys_map_lock;
-       struct list_head        irq_phys_map_list;
+       /* Protects the lpi_list and the count value below. */
+       spinlock_t              lpi_list_lock;
+       struct list_head        lpi_list_head;
+       int                     lpi_list_count;
 };
 
 struct vgic_v2_cpu_if {
@@ -298,78 +230,88 @@ struct vgic_v3_cpu_if {
 };
 
 struct vgic_cpu {
-       /* Pending/active/both interrupts on this VCPU */
-       DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
-       DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
-       DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
-
-       /* Pending/active/both shared interrupts, dynamically sized */
-       unsigned long   *pending_shared;
-       unsigned long   *active_shared;
-       unsigned long   *pend_act_shared;
-
        /* CPU vif control registers for world switch */
        union {
                struct vgic_v2_cpu_if   vgic_v2;
                struct vgic_v3_cpu_if   vgic_v3;
        };
 
-       /* Protected by the distributor's irq_phys_map_lock */
-       struct list_head        irq_phys_map_list;
+       unsigned int used_lrs;
+       struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
 
-       u64             live_lrs;
-};
+       spinlock_t ap_list_lock;        /* Protects the ap_list */
+
+       /*
+        * List of IRQs that this VCPU should consider because they are either
+        * Active or Pending (hence the name; AP list), or because they recently
+        * were one of the two and need to be migrated off this list to another
+        * VCPU.
+        */
+       struct list_head ap_list_head;
 
-#define LR_EMPTY       0xff
+       u64 live_lrs;
 
-#define INT_STATUS_EOI         (1 << 0)
-#define INT_STATUS_UNDERFLOW   (1 << 1)
+       /*
+        * Members below are used with GICv3 emulation only and represent
+        * parts of the redistributor.
+        */
+       struct vgic_io_device   rd_iodev;
+       struct vgic_io_device   sgi_iodev;
 
-struct kvm;
-struct kvm_vcpu;
+       /* Contains the attributes and gpa of the LPI pending tables. */
+       u64 pendbaser;
+
+       bool lpis_enabled;
+};
 
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-int kvm_vgic_hyp_init(void);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_get_max_vcpus(void);
 void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
 void kvm_vgic_destroy(struct kvm *kvm);
 void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
+int kvm_vgic_map_resources(struct kvm *kvm);
+int kvm_vgic_hyp_init(void);
+
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
                        bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-                              unsigned int virt_irq, bool level);
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                              bool level);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
+
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k)    (!!((k)->arch.vgic.nr_cpus))
+#define vgic_initialized(k)    ((k)->arch.vgic.initialized)
 #define vgic_ready(k)          ((k)->arch.vgic.ready)
 #define vgic_valid_spi(k, i)   (((i) >= VGIC_NR_PRIVATE_IRQS) && \
-                                ((i) < (k)->arch.vgic.nr_irqs))
+                       ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+
+bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
-                 const struct vgic_ops **ops,
-                 const struct vgic_params **params);
 #ifdef CONFIG_KVM_ARM_VGIC_V3
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-                 const struct vgic_ops **ops,
-                 const struct vgic_params **params);
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 #else
-static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-                               const struct vgic_ops **ops,
-                               const struct vgic_params **params)
+static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
 {
-       return -ENODEV;
 }
 #endif
 
-#endif /* old VGIC include */
-#endif
+/**
+ * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
+ *
+ * The host's GIC naturally limits the maximum number of VCPUs a guest
+ * can use.
+ */
+static inline int kvm_vgic_get_max_vcpus(void)
+{
+       return kvm_vgic_global_state.max_gic_vcpus;
+}
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+
+#endif /* __KVM_ARM_VGIC_H */
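
The rewritten header above defines the INTID ranges the new VGIC works with: SGIs and PPIs below VGIC_NR_PRIVATE_IRQS, SPIs up to VGIC_MAX_SPI, and LPIs from VGIC_MIN_LPI upward. Here is a hedged sketch that classifies an INTID using only those macros; classify_intid() is a hypothetical helper, not part of the patch.

#include <linux/types.h>
/* VGIC_NR_SGIS, VGIC_NR_PRIVATE_IRQS, VGIC_MAX_SPI, VGIC_MIN_LPI come
 * from the header added above. */

enum intid_class { INTID_SGI, INTID_PPI, INTID_SPI, INTID_RESERVED, INTID_LPI };

/* Hedged sketch: map a guest-visible INTID onto the architectural ranges. */
static inline enum intid_class classify_intid(u32 intid)
{
        if (intid < VGIC_NR_SGIS)
                return INTID_SGI;
        if (intid < VGIC_NR_PRIVATE_IRQS)
                return INTID_PPI;
        if (intid <= VGIC_MAX_SPI)
                return INTID_SPI;
        if (intid < VGIC_MIN_LPI)
                return INTID_RESERVED;
        return INTID_LPI;
}
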
diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
deleted file mode 100644 (file)
index 3fbd175..0000000
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __ASM_ARM_KVM_VGIC_VGIC_H
-#define __ASM_ARM_KVM_VGIC_VGIC_H
-
-#include <linux/kernel.h>
-#include <linux/kvm.h>
-#include <linux/irqreturn.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <kvm/iodev.h>
-
-#define VGIC_V3_MAX_CPUS       255
-#define VGIC_V2_MAX_CPUS       8
-#define VGIC_NR_IRQS_LEGACY     256
-#define VGIC_NR_SGIS           16
-#define VGIC_NR_PPIS           16
-#define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_MAX_PRIVATE       (VGIC_NR_PRIVATE_IRQS - 1)
-#define VGIC_MAX_SPI           1019
-#define VGIC_MAX_RESERVED      1023
-#define VGIC_MIN_LPI           8192
-
-enum vgic_type {
-       VGIC_V2,                /* Good ol' GICv2 */
-       VGIC_V3,                /* New fancy GICv3 */
-};
-
-/* same for all guests, as depending only on the _host's_ GIC model */
-struct vgic_global {
-       /* type of the host GIC */
-       enum vgic_type          type;
-
-       /* Physical address of vgic virtual cpu interface */
-       phys_addr_t             vcpu_base;
-
-       /* virtual control interface mapping */
-       void __iomem            *vctrl_base;
-
-       /* Number of implemented list registers */
-       int                     nr_lr;
-
-       /* Maintenance IRQ number */
-       unsigned int            maint_irq;
-
-       /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
-       int                     max_gic_vcpus;
-
-       /* Only needed for the legacy KVM_CREATE_IRQCHIP */
-       bool                    can_emulate_gicv2;
-};
-
-extern struct vgic_global kvm_vgic_global_state;
-
-#define VGIC_V2_MAX_LRS                (1 << 6)
-#define VGIC_V3_MAX_LRS                16
-#define VGIC_V3_LR_INDEX(lr)   (VGIC_V3_MAX_LRS - 1 - lr)
-
-enum vgic_irq_config {
-       VGIC_CONFIG_EDGE = 0,
-       VGIC_CONFIG_LEVEL
-};
-
-struct vgic_irq {
-       spinlock_t irq_lock;            /* Protects the content of the struct */
-       struct list_head ap_list;
-
-       struct kvm_vcpu *vcpu;          /* SGIs and PPIs: The VCPU
-                                        * SPIs and LPIs: The VCPU whose ap_list
-                                        * this is queued on.
-                                        */
-
-       struct kvm_vcpu *target_vcpu;   /* The VCPU that this interrupt should
-                                        * be sent to, as a result of the
-                                        * targets reg (v2) or the
-                                        * affinity reg (v3).
-                                        */
-
-       u32 intid;                      /* Guest visible INTID */
-       bool pending;
-       bool line_level;                /* Level only */
-       bool soft_pending;              /* Level only */
-       bool active;                    /* not used for LPIs */
-       bool enabled;
-       bool hw;                        /* Tied to HW IRQ */
-       u32 hwintid;                    /* HW INTID number */
-       union {
-               u8 targets;                     /* GICv2 target VCPUs mask */
-               u32 mpidr;                      /* GICv3 target VCPU */
-       };
-       u8 source;                      /* GICv2 SGIs only */
-       u8 priority;
-       enum vgic_irq_config config;    /* Level or edge */
-};
-
-struct vgic_register_region;
-
-struct vgic_io_device {
-       gpa_t base_addr;
-       struct kvm_vcpu *redist_vcpu;
-       const struct vgic_register_region *regions;
-       int nr_regions;
-       struct kvm_io_device dev;
-};
-
-struct vgic_dist {
-       bool                    in_kernel;
-       bool                    ready;
-       bool                    initialized;
-
-       /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
-       u32                     vgic_model;
-
-       int                     nr_spis;
-
-       /* TODO: Consider moving to global state */
-       /* Virtual control interface mapping */
-       void __iomem            *vctrl_base;
-
-       /* base addresses in guest physical address space: */
-       gpa_t                   vgic_dist_base;         /* distributor */
-       union {
-               /* either a GICv2 CPU interface */
-               gpa_t                   vgic_cpu_base;
-               /* or a number of GICv3 redistributor regions */
-               gpa_t                   vgic_redist_base;
-       };
-
-       /* distributor enabled */
-       bool                    enabled;
-
-       struct vgic_irq         *spis;
-
-       struct vgic_io_device   dist_iodev;
-       struct vgic_io_device   *redist_iodevs;
-};
-
-struct vgic_v2_cpu_if {
-       u32             vgic_hcr;
-       u32             vgic_vmcr;
-       u32             vgic_misr;      /* Saved only */
-       u64             vgic_eisr;      /* Saved only */
-       u64             vgic_elrsr;     /* Saved only */
-       u32             vgic_apr;
-       u32             vgic_lr[VGIC_V2_MAX_LRS];
-};
-
-struct vgic_v3_cpu_if {
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-       u32             vgic_hcr;
-       u32             vgic_vmcr;
-       u32             vgic_sre;       /* Restored only, change ignored */
-       u32             vgic_misr;      /* Saved only */
-       u32             vgic_eisr;      /* Saved only */
-       u32             vgic_elrsr;     /* Saved only */
-       u32             vgic_ap0r[4];
-       u32             vgic_ap1r[4];
-       u64             vgic_lr[VGIC_V3_MAX_LRS];
-#endif
-};
-
-struct vgic_cpu {
-       /* CPU vif control registers for world switch */
-       union {
-               struct vgic_v2_cpu_if   vgic_v2;
-               struct vgic_v3_cpu_if   vgic_v3;
-       };
-
-       unsigned int used_lrs;
-       struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
-
-       spinlock_t ap_list_lock;        /* Protects the ap_list */
-
-       /*
-        * List of IRQs that this VCPU should consider because they are either
-        * Active or Pending (hence the name; AP list), or because they recently
-        * were one of the two and need to be migrated off this list to another
-        * VCPU.
-        */
-       struct list_head ap_list_head;
-
-       u64 live_lrs;
-};
-
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-void kvm_vgic_early_init(struct kvm *kvm);
-int kvm_vgic_create(struct kvm *kvm, u32 type);
-void kvm_vgic_destroy(struct kvm *kvm);
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_hyp_init(void);
-
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-                       bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-                              bool level);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-
-#define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k)    ((k)->arch.vgic.initialized)
-#define vgic_ready(k)          ((k)->arch.vgic.ready)
-#define vgic_valid_spi(k, i)   (((i) >= VGIC_NR_PRIVATE_IRQS) && \
-                       ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
-
-bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-#else
-static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-}
-#endif
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-static inline int kvm_vgic_get_max_vcpus(void)
-{
-       return kvm_vgic_global_state.max_gic_vcpus;
-}
-
-#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */
index dfce616..7868d60 100644 (file)
@@ -34,9 +34,9 @@
 #define CEPH_MAX_MON   31
 
 /*
- * ceph_file_layout - describe data layout for a file/inode
+ * legacy ceph_file_layout
  */
-struct ceph_file_layout {
+struct ceph_file_layout_legacy {
        /* file -> object mapping */
        __le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
                                      of page size. */
@@ -53,33 +53,27 @@ struct ceph_file_layout {
        __le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
 } __attribute__ ((packed));
 
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
-       ((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
-       ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_pool(l) \
-       ((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
-       return le32_to_cpu(l->fl_stripe_unit) *
-               le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
-       return le32_to_cpu(l->fl_object_size) *
-               le32_to_cpu(l->fl_stripe_count);
-}
+struct ceph_string;
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+       /* file -> object mapping */
+       u32 stripe_unit;   /* stripe unit, in bytes */
+       u32 stripe_count;  /* over this many objects */
+       u32 object_size;   /* until objects are this big */
+       s64 pool_id;        /* rados pool id */
+       struct ceph_string __rcu *pool_ns; /* rados pool namespace */
+};
+
+extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+                               struct ceph_file_layout_legacy *legacy);
+extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+                               struct ceph_file_layout_legacy *legacy);
 
 #define CEPH_MIN_STRIPE_UNIT 65536
 
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
-
 struct ceph_dir_layout {
        __u8   dl_dir_hash;   /* see ceph_hash.h for ids */
        __u8   dl_unused1;
@@ -127,6 +121,7 @@ struct ceph_dir_layout {
 
 /* client <-> mds */
 #define CEPH_MSG_MDS_MAP                21
+#define CEPH_MSG_FS_MAP_USER            103
 
 #define CEPH_MSG_CLIENT_SESSION         22
 #define CEPH_MSG_CLIENT_RECONNECT       23
@@ -399,7 +394,7 @@ union ceph_mds_request_args {
                __le32 flags;
        } __attribute__ ((packed)) setxattr;
        struct {
-               struct ceph_file_layout layout;
+               struct ceph_file_layout_legacy layout;
        } __attribute__ ((packed)) setlayout;
        struct {
                __u8 rule; /* currently fcntl or flock */
@@ -478,7 +473,7 @@ struct ceph_mds_reply_inode {
        __le64 version;                /* inode version */
        __le64 xattr_version;          /* version for xattr blob */
        struct ceph_mds_reply_cap cap; /* caps issued for this inode */
-       struct ceph_file_layout layout;
+       struct ceph_file_layout_legacy layout;
        struct ceph_timespec ctime, mtime, atime;
        __le32 time_warp_seq;
        __le64 size, max_size, truncate_size;
@@ -531,7 +526,7 @@ struct ceph_filelock {
 #define CEPH_FILE_MODE_WR         2
 #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
 #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
-#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
+#define CEPH_FILE_MODE_BITS       4
 
 int ceph_flags_to_mode(int flags);
 
@@ -673,7 +668,7 @@ struct ceph_mds_caps {
        __le64 size, max_size, truncate_size;
        __le32 truncate_seq;
        struct ceph_timespec mtime, atime, ctime;
-       struct ceph_file_layout layout;
+       struct ceph_file_layout_legacy layout;
        __le32 time_warp_seq;
 } __attribute__ ((packed));
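
A minimal usage sketch for the conversion helpers declared above (illustrative only; "info" stands for any hypothetical reply structure carrying a struct ceph_file_layout_legacy, "req_layout" for an outgoing legacy layout):

    struct ceph_file_layout fl;

    ceph_file_layout_from_legacy(&fl, &info->layout);   /* le32 -> cpu, pool 0 -> -1 */
    if (!ceph_file_layout_is_valid(&fl))
            return -EINVAL;

    /* ... and when encoding a request back to the wire format ... */
    ceph_file_layout_to_legacy(&fl, &req_layout);        /* cpu -> le32 */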
 
index 19e9932..f990f2c 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/err.h>
 #include <linux/bug.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <asm/unaligned.h>
 
@@ -217,6 +218,60 @@ static inline void ceph_encode_string(void **p, void *end,
        *p += len;
 }
 
+/*
+ * version and length starting block encoders/decoders
+ */
+
+/* current code version (u8) + compat code version (u8) + len of struct (u32) */
+#define CEPH_ENCODING_START_BLK_LEN 6
+
+/**
+ * ceph_start_encoding - start encoding block
+ * @struct_v: current (code) version of the encoding
+ * @struct_compat: oldest code version that can decode it
+ * @struct_len: length of struct encoding
+ */
+static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat,
+                                      u32 struct_len)
+{
+       ceph_encode_8(p, struct_v);
+       ceph_encode_8(p, struct_compat);
+       ceph_encode_32(p, struct_len);
+}
+
+/**
+ * ceph_start_decoding - start decoding block
+ * @v: current version of the encoding that the code supports
+ * @name: name of the struct (free-form)
+ * @struct_v: out param for the encoding version
+ * @struct_len: out param for the length of struct encoding
+ *
+ * Validates the length of struct encoding, so unsafe ceph_decode_*
+ * variants can be used for decoding.
+ */
+static inline int ceph_start_decoding(void **p, void *end, u8 v,
+                                     const char *name, u8 *struct_v,
+                                     u32 *struct_len)
+{
+       u8 struct_compat;
+
+       ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad);
+       *struct_v = ceph_decode_8(p);
+       struct_compat = ceph_decode_8(p);
+       if (v < struct_compat) {
+               pr_warn("got struct_v %d struct_compat %d > %d of %s\n",
+                       *struct_v, struct_compat, v, name);
+               return -EINVAL;
+       }
+
+       *struct_len = ceph_decode_32(p);
+       ceph_decode_need(p, end, *struct_len, bad);
+       return 0;
+
+bad:
+       return -ERANGE;
+}
+
 #define ceph_encode_need(p, end, n, bad)                       \
        do {                                                    \
                if (!likely(ceph_has_room(p, end, n)))          \
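
A short illustration of the versioned-block helpers added above (not part of the patch): the 6-byte header is struct_v (u8) + struct_compat (u8) + struct_len (u32), so a matching encoder/decoder pair looks roughly like this, with p, end, payload_len, struct_v and struct_len assumed to be set up by the caller:

    /* encoder: a version 2 blob that version 1 decoders can still parse */
    ceph_start_encoding(&p, 2, 1, payload_len);
    /* ... encode exactly payload_len bytes of payload ... */

    /* decoder: -EINVAL if the sender requires a newer decoder than v2,
     * -ERANGE if the buffer is too short for the header or the payload */
    ret = ceph_start_decoding(&p, end, 2, "example_blob", &struct_v, &struct_len);
    if (ret)
            return ret;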
index 690985d..83fc1ff 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/string_table.h>
 
 /*
  * mount options
@@ -214,8 +215,9 @@ static void erase_##name(struct rb_root *root, type *t)                     \
 }
 
 #define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)             \
+extern type __lookup_##name##_key;                                     \
 static type *lookup_##name(struct rb_root *root,                       \
-                          typeof(((type *)0)->keyfld) key)             \
+                          typeof(__lookup_##name##_key.keyfld) key)    \
 {                                                                      \
        struct rb_node *n = root->rb_node;                              \
                                                                        \
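
The lookup-macro tweak above derives the key type from a declared-but-never-defined extern dummy instead of applying typeof() to a dereference of a literal null pointer, a construct some static checkers complain about (the patch itself states no rationale). For a purely hypothetical table the expansion becomes:

    DEFINE_RB_LOOKUP_FUNC(example, struct example_node, id, rb)

    /* ... expands, in relevant part, to ... */
    extern struct example_node __lookup_example_key;
    static struct example_node *lookup_example(struct rb_root *root,
                               typeof(__lookup_example_key.id) key)
    { /* rb-tree walk comparing key against each node's id */ }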
index e2a92df..24d704d 100644 (file)
@@ -95,7 +95,7 @@ struct ceph_mon_client {
                struct ceph_mon_subscribe_item item;
                bool want;
                u32 have; /* epoch */
-       } subs[3];
+       } subs[4];
        int fs_cluster_id; /* "mdsmap.<id>" sub */
 
 #ifdef CONFIG_DEBUG_FS
@@ -111,9 +111,10 @@ extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
 extern void ceph_monc_stop(struct ceph_mon_client *monc);
 
 enum {
-       CEPH_SUB_MDSMAP = 0,
-       CEPH_SUB_MONMAP,
+       CEPH_SUB_MONMAP = 0,
        CEPH_SUB_OSDMAP,
+       CEPH_SUB_FSMAP,
+       CEPH_SUB_MDSMAP,
 };
 
 extern const char *ceph_sub_str[];
index 4b0d389..ddd0d48 100644 (file)
@@ -2,7 +2,6 @@
 #define _FS_CEPH_MSGPOOL
 
 #include <linux/mempool.h>
-#include <linux/ceph/messenger.h>
 
 /*
  * we use memory pools for preallocating messages we may receive, to
index 1b3b6e1..8589323 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/ceph/types.h>
 #include <linux/ceph/osdmap.h>
 #include <linux/ceph/messenger.h>
+#include <linux/ceph/msgpool.h>
 #include <linux/ceph/auth.h>
 #include <linux/ceph/pagelist.h>
 
index 9ccf4db..9a90417 100644 (file)
@@ -63,11 +63,13 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
 
 struct ceph_object_locator {
        s64 pool;
+       struct ceph_string *pool_ns;
 };
 
 static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
 {
        oloc->pool = -1;
+       oloc->pool_ns = NULL;
 }
 
 static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
@@ -75,11 +77,9 @@ static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
        return oloc->pool == -1;
 }
 
-static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
-                                 const struct ceph_object_locator *src)
-{
-       dest->pool = src->pool;
-}
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+                   const struct ceph_object_locator *src);
+void ceph_oloc_destroy(struct ceph_object_locator *oloc);
 
 /*
  * Maximum supported by kernel client object name length
@@ -115,6 +115,11 @@ static inline void ceph_oid_init(struct ceph_object_id *oid)
        oid->name_len = 0;
 }
 
+#define CEPH_OID_INIT_ONSTACK(oid)                                     \
+    ({ ceph_oid_init(&oid); oid; })
+#define CEPH_DEFINE_OID_ONSTACK(oid)                                   \
+       struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid)
+
 static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
 {
        return oid->name == oid->inline_name && !oid->name_len;
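
With pool_ns now a refcounted ceph_string, copying or tearing down an object locator is no longer a trivial assignment, hence the out-of-line declarations above. A sketch of what the helpers have to do (not the literal libceph implementation):

    void ceph_oloc_copy(struct ceph_object_locator *dest,
                        const struct ceph_object_locator *src)
    {
            dest->pool = src->pool;
            dest->pool_ns = src->pool_ns ?
                            ceph_get_string(src->pool_ns) : NULL;
    }

    void ceph_oloc_destroy(struct ceph_object_locator *oloc)
    {
            ceph_put_string(oloc->pool_ns);         /* NULL-safe */
    }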
diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
new file mode 100644 (file)
index 0000000..1b02c96
--- /dev/null
@@ -0,0 +1,62 @@
+#ifndef _FS_CEPH_STRING_TABLE_H
+#define _FS_CEPH_STRING_TABLE_H
+
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+
+struct ceph_string {
+       struct kref kref;
+       union {
+               struct rb_node node;
+               struct rcu_head rcu;
+       };
+       size_t len;
+       char str[];
+};
+
+extern void ceph_release_string(struct kref *ref);
+extern struct ceph_string *ceph_find_or_create_string(const char *str,
+                                                     size_t len);
+extern bool ceph_strings_empty(void);
+
+static inline struct ceph_string *ceph_get_string(struct ceph_string *str)
+{
+       kref_get(&str->kref);
+       return str;
+}
+
+static inline void ceph_put_string(struct ceph_string *str)
+{
+       if (!str)
+               return;
+       kref_put(&str->kref, ceph_release_string);
+}
+
+static inline int ceph_compare_string(struct ceph_string *cs,
+                                     const char *str, size_t len)
+{
+       size_t cs_len = cs ? cs->len : 0;
+       if (cs_len != len)
+               return cs_len - len;
+       if (len == 0)
+               return 0;
+       return strncmp(cs->str, str, len);
+}
+
+#define ceph_try_get_string(x)                                 \
+({                                                             \
+       struct ceph_string *___str;                             \
+       rcu_read_lock();                                        \
+       for (;;) {                                              \
+               ___str = rcu_dereference(x);                    \
+               if (!___str ||                                  \
+                   kref_get_unless_zero(&___str->kref))        \
+                       break;                                  \
+       }                                                       \
+       rcu_read_unlock();                                      \
+       (___str);                                               \
+})
+
+#endif
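
Usage sketch for the new interned-string table (every name below is invented for the example): a writer interns a namespace string and publishes it through an RCU pointer; readers take a temporary reference with ceph_try_get_string():

    static struct ceph_string __rcu *example_ns;

    static int example_set_ns(const char *name)
    {
            struct ceph_string *s;

            s = ceph_find_or_create_string(name, strlen(name));
            if (!s)
                    return -ENOMEM;
            rcu_assign_pointer(example_ns, s);  /* dropping any old value is omitted */
            return 0;
    }

    static void example_show_ns(void)
    {
            struct ceph_string *s = ceph_try_get_string(example_ns);

            if (s) {
                    pr_info("ns: %.*s\n", (int)s->len, s->str);
                    ceph_put_string(s);
            }
    }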
index d9aef2a..c78fc27 100644 (file)
@@ -99,7 +99,8 @@ static inline void context_tracking_init(void) { }
 
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static inline void guest_enter(void)
+/* must be called with irqs disabled */
+static inline void guest_enter_irqoff(void)
 {
        if (vtime_accounting_cpu_enabled())
                vtime_guest_enter(current);
@@ -108,9 +109,19 @@ static inline void guest_enter(void)
 
        if (context_tracking_is_enabled())
                __context_tracking_enter(CONTEXT_GUEST);
+
+       /* KVM does not hold any references to RCU-protected data when it
+        * switches CPU into a guest mode. In fact switching to a guest mode
+        * is very similar to exiting to userspace from an RCU point of view.
+        * In addition, the CPU may stay in guest mode for quite a long time
+        * (up to one time slice). Let's treat guest mode as a quiescent
+        * state, just like we do with user-mode execution.
+        */
+       if (!context_tracking_cpu_is_enabled())
+               rcu_virt_note_context_switch(smp_processor_id());
 }
 
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
 {
        if (context_tracking_is_enabled())
                __context_tracking_exit(CONTEXT_GUEST);
@@ -122,7 +133,7 @@ static inline void guest_exit(void)
 }
 
 #else
-static inline void guest_enter(void)
+static inline void guest_enter_irqoff(void)
 {
        /*
         * This is running in ioctl context so it's safe
@@ -131,9 +142,10 @@ static inline void guest_enter(void)
         */
        vtime_account_system(current);
        current->flags |= PF_VCPU;
+       rcu_virt_note_context_switch(smp_processor_id());
 }
 
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
 {
        /* Flush the guest cputime we spent on the guest */
        vtime_account_system(current);
@@ -141,4 +153,22 @@ static inline void guest_exit(void)
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
 
+static inline void guest_enter(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       guest_enter_irqoff();
+       local_irq_restore(flags);
+}
+
+static inline void guest_exit(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       guest_exit_irqoff();
+       local_irq_restore(flags);
+}
+
 #endif
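
Net effect of this hunk: the old guest_enter()/guest_exit() bodies become the *_irqoff() variants, and the unsuffixed names turn into wrappers that save and restore interrupts. Arch code that already runs with interrupts disabled around the world switch calls the _irqoff variants directly, roughly:

    local_irq_disable();
    guest_enter_irqoff();

    /* world switch into the guest, then handle the exit reason */

    guest_exit_irqoff();
    local_irq_enable();

(sketch only; real vcpu_run loops interleave this with their own bookkeeping).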
index 2f9ccbe..c565f87 100644 (file)
@@ -82,7 +82,7 @@ extern struct module __this_module;
 #include <generated/autoksyms.h>
 
 #define __EXPORT_SYMBOL(sym, sec)                              \
-       __cond_export_sym(sym, sec, config_enabled(__KSYM_##sym))
+       __cond_export_sym(sym, sec, __is_defined(__KSYM_##sym))
 #define __cond_export_sym(sym, sec, conf)                      \
        ___cond_export_sym(sym, sec, conf)
 #define ___cond_export_sym(sym, sec, enabled)                  \
index 107eed4..56b0b7e 100644 (file)
 #define GICR_WAKER_ProcessorSleep      (1U << 1)
 #define GICR_WAKER_ChildrenAsleep      (1U << 2)
 
-#define GICR_PROPBASER_NonShareable    (0U << 10)
-#define GICR_PROPBASER_InnerShareable  (1U << 10)
-#define GICR_PROPBASER_OuterShareable  (2U << 10)
-#define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PROPBASER_nCnB            (0U << 7)
-#define GICR_PROPBASER_nC              (1U << 7)
-#define GICR_PROPBASER_RaWt            (2U << 7)
-#define GICR_PROPBASER_RaWb            (3U << 7)
-#define GICR_PROPBASER_WaWt            (4U << 7)
-#define GICR_PROPBASER_WaWb            (5U << 7)
-#define GICR_PROPBASER_RaWaWt          (6U << 7)
-#define GICR_PROPBASER_RaWaWb          (7U << 7)
-#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
-#define GICR_PROPBASER_IDBITS_MASK     (0x1f)
-
-#define GICR_PENDBASER_NonShareable    (0U << 10)
-#define GICR_PENDBASER_InnerShareable  (1U << 10)
-#define GICR_PENDBASER_OuterShareable  (2U << 10)
-#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PENDBASER_nCnB            (0U << 7)
-#define GICR_PENDBASER_nC              (1U << 7)
-#define GICR_PENDBASER_RaWt            (2U << 7)
-#define GICR_PENDBASER_RaWb            (3U << 7)
-#define GICR_PENDBASER_WaWt            (4U << 7)
-#define GICR_PENDBASER_WaWb            (5U << 7)
-#define GICR_PENDBASER_RaWaWt          (6U << 7)
-#define GICR_PENDBASER_RaWaWb          (7U << 7)
-#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
+#define GIC_BASER_CACHE_nCnB           0ULL
+#define GIC_BASER_CACHE_SameAsInner    0ULL
+#define GIC_BASER_CACHE_nC             1ULL
+#define GIC_BASER_CACHE_RaWt           2ULL
+#define GIC_BASER_CACHE_RaWb           3ULL
+#define GIC_BASER_CACHE_WaWt           4ULL
+#define GIC_BASER_CACHE_WaWb           5ULL
+#define GIC_BASER_CACHE_RaWaWt         6ULL
+#define GIC_BASER_CACHE_RaWaWb         7ULL
+#define GIC_BASER_CACHE_MASK           7ULL
+#define GIC_BASER_NonShareable         0ULL
+#define GIC_BASER_InnerShareable       1ULL
+#define GIC_BASER_OuterShareable       2ULL
+#define GIC_BASER_SHAREABILITY_MASK    3ULL
+
+#define GIC_BASER_CACHEABILITY(reg, inner_outer, type)                 \
+       (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT)
+
+#define GIC_BASER_SHAREABILITY(reg, type)                              \
+       (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT)
+
+#define GICR_PROPBASER_SHAREABILITY_SHIFT              (10)
+#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT                (7)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT                (56)
+#define GICR_PROPBASER_SHAREABILITY_MASK                               \
+       GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK)
+#define GICR_PROPBASER_INNER_CACHEABILITY_MASK                         \
+       GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK                         \
+       GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK)
+#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PROPBASER_InnerShareable                                  \
+       GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)
+
+#define GICR_PROPBASER_nCnB    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB)
+#define GICR_PROPBASER_nC      GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
+#define GICR_PROPBASER_RaWt    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+#define GICR_PROPBASER_RaWb    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)
+#define GICR_PROPBASER_WaWt    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt)
+#define GICR_PROPBASER_WaWb    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+#define GICR_PROPBASER_RaWaWt  GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt)
+#define GICR_PROPBASER_RaWaWb  GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb)
+
+#define GICR_PROPBASER_IDBITS_MASK                     (0x1f)
+
+#define GICR_PENDBASER_SHAREABILITY_SHIFT              (10)
+#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT                (7)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT                (56)
+#define GICR_PENDBASER_SHAREABILITY_MASK                               \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK)
+#define GICR_PENDBASER_INNER_CACHEABILITY_MASK                         \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK                         \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK)
+#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PENDBASER_InnerShareable                                  \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)
+
+#define GICR_PENDBASER_nCnB    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB)
+#define GICR_PENDBASER_nC      GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
+#define GICR_PENDBASER_RaWt    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+#define GICR_PENDBASER_RaWb    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)
+#define GICR_PENDBASER_WaWt    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt)
+#define GICR_PENDBASER_WaWb    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+#define GICR_PENDBASER_RaWaWt  GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt)
+#define GICR_PENDBASER_RaWaWb  GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb)
+
+#define GICR_PENDBASER_PTZ                             BIT_ULL(62)
 
 /*
  * Re-Distributor registers, offsets from SGI_base
 #define GITS_CWRITER                   0x0088
 #define GITS_CREADR                    0x0090
 #define GITS_BASER                     0x0100
+#define GITS_IDREGS_BASE               0xffd0
+#define GITS_PIDR0                     0xffe0
+#define GITS_PIDR1                     0xffe4
 #define GITS_PIDR2                     GICR_PIDR2
+#define GITS_PIDR4                     0xffd0
+#define GITS_CIDR0                     0xfff0
+#define GITS_CIDR1                     0xfff4
+#define GITS_CIDR2                     0xfff8
+#define GITS_CIDR3                     0xfffc
 
 #define GITS_TRANSLATER                        0x10040
 
 #define GITS_CTLR_ENABLE               (1U << 0)
 #define GITS_CTLR_QUIESCENT            (1U << 31)
 
+#define GITS_TYPER_PLPIS               (1UL << 0)
+#define GITS_TYPER_IDBITS_SHIFT                8
 #define GITS_TYPER_DEVBITS_SHIFT       13
 #define GITS_TYPER_DEVBITS(r)          ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
 #define GITS_TYPER_PTA                 (1UL << 19)
-
-#define GITS_CBASER_VALID              (1UL << 63)
-#define GITS_CBASER_nCnB               (0UL << 59)
-#define GITS_CBASER_nC                 (1UL << 59)
-#define GITS_CBASER_RaWt               (2UL << 59)
-#define GITS_CBASER_RaWb               (3UL << 59)
-#define GITS_CBASER_WaWt               (4UL << 59)
-#define GITS_CBASER_WaWb               (5UL << 59)
-#define GITS_CBASER_RaWaWt             (6UL << 59)
-#define GITS_CBASER_RaWaWb             (7UL << 59)
-#define GITS_CBASER_CACHEABILITY_MASK  (7UL << 59)
-#define GITS_CBASER_NonShareable       (0UL << 10)
-#define GITS_CBASER_InnerShareable     (1UL << 10)
-#define GITS_CBASER_OuterShareable     (2UL << 10)
-#define GITS_CBASER_SHAREABILITY_MASK  (3UL << 10)
+#define GITS_TYPER_HWCOLLCNT_SHIFT     24
+
+#define GITS_CBASER_VALID                      (1UL << 63)
+#define GITS_CBASER_SHAREABILITY_SHIFT         (10)
+#define GITS_CBASER_INNER_CACHEABILITY_SHIFT   (59)
+#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT   (53)
+#define GITS_CBASER_SHAREABILITY_MASK                                  \
+       GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK)
+#define GITS_CBASER_INNER_CACHEABILITY_MASK                            \
+       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK)
+#define GITS_CBASER_OUTER_CACHEABILITY_MASK                            \
+       GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK)
+#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK
+
+#define GITS_CBASER_InnerShareable                                     \
+       GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable)
+
+#define GITS_CBASER_nCnB       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB)
+#define GITS_CBASER_nC         GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
+#define GITS_CBASER_RaWt       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+#define GITS_CBASER_RaWb       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb)
+#define GITS_CBASER_WaWt       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt)
+#define GITS_CBASER_WaWb       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
+#define GITS_CBASER_RaWaWt     GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt)
+#define GITS_CBASER_RaWaWb     GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb)
 
 #define GITS_BASER_NR_REGS             8
 
-#define GITS_BASER_VALID               (1UL << 63)
-#define GITS_BASER_INDIRECT            (1UL << 62)
-#define GITS_BASER_nCnB                        (0UL << 59)
-#define GITS_BASER_nC                  (1UL << 59)
-#define GITS_BASER_RaWt                        (2UL << 59)
-#define GITS_BASER_RaWb                        (3UL << 59)
-#define GITS_BASER_WaWt                        (4UL << 59)
-#define GITS_BASER_WaWb                        (5UL << 59)
-#define GITS_BASER_RaWaWt              (6UL << 59)
-#define GITS_BASER_RaWaWb              (7UL << 59)
-#define GITS_BASER_CACHEABILITY_MASK   (7UL << 59)
-#define GITS_BASER_TYPE_SHIFT          (56)
+#define GITS_BASER_VALID                       (1UL << 63)
+#define GITS_BASER_INDIRECT                    (1ULL << 62)
+
+#define GITS_BASER_INNER_CACHEABILITY_SHIFT    (59)
+#define GITS_BASER_OUTER_CACHEABILITY_SHIFT    (53)
+#define GITS_BASER_INNER_CACHEABILITY_MASK                             \
+       GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK)
+#define GITS_BASER_CACHEABILITY_MASK           GITS_BASER_INNER_CACHEABILITY_MASK
+#define GITS_BASER_OUTER_CACHEABILITY_MASK                             \
+       GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK)
+#define GITS_BASER_SHAREABILITY_MASK                                   \
+       GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK)
+
+#define GITS_BASER_nCnB                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB)
+#define GITS_BASER_nC          GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
+#define GITS_BASER_RaWt                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+#define GITS_BASER_RaWb                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)
+#define GITS_BASER_WaWt                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt)
+#define GITS_BASER_WaWb                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+#define GITS_BASER_RaWaWt      GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt)
+#define GITS_BASER_RaWaWb      GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb)
+
+#define GITS_BASER_TYPE_SHIFT                  (56)
 #define GITS_BASER_TYPE(r)             (((r) >> GITS_BASER_TYPE_SHIFT) & 7)
-#define GITS_BASER_ENTRY_SIZE_SHIFT    (48)
+#define GITS_BASER_ENTRY_SIZE_SHIFT            (48)
 #define GITS_BASER_ENTRY_SIZE(r)       ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1)
-#define GITS_BASER_NonShareable                (0UL << 10)
-#define GITS_BASER_InnerShareable      (1UL << 10)
-#define GITS_BASER_OuterShareable      (2UL << 10)
 #define GITS_BASER_SHAREABILITY_SHIFT  (10)
-#define GITS_BASER_SHAREABILITY_MASK   (3UL << GITS_BASER_SHAREABILITY_SHIFT)
+#define GITS_BASER_InnerShareable                                      \
+       GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)
 #define GITS_BASER_PAGE_SIZE_SHIFT     (8)
 #define GITS_BASER_PAGE_SIZE_4K                (0UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGE_SIZE_16K       (1UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGE_SIZE_MASK      (3UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGES_MAX           256
 #define GITS_BASER_PAGES_SHIFT         (0)
+#define GITS_BASER_NR_PAGES(r)         (((r) & 0xff) + 1)
 
 #define GITS_BASER_TYPE_NONE           0
 #define GITS_BASER_TYPE_DEVICE         1
  */
 #define GITS_CMD_MAPD                  0x08
 #define GITS_CMD_MAPC                  0x09
-#define GITS_CMD_MAPVI                 0x0a
+#define GITS_CMD_MAPTI                 0x0a
+/* older GIC documentation used MAPVI for this command */
+#define GITS_CMD_MAPVI                 GITS_CMD_MAPTI
+#define GITS_CMD_MAPI                  0x0b
 #define GITS_CMD_MOVI                  0x01
 #define GITS_CMD_DISCARD               0x0f
 #define GITS_CMD_INV                   0x0c
 #define GITS_CMD_CLEAR                 0x04
 #define GITS_CMD_SYNC                  0x05
 
+/*
+ * ITS error numbers
+ */
+#define E_ITS_MOVI_UNMAPPED_INTERRUPT          0x010107
+#define E_ITS_MOVI_UNMAPPED_COLLECTION         0x010109
+#define E_ITS_CLEAR_UNMAPPED_INTERRUPT         0x010507
+#define E_ITS_MAPD_DEVICE_OOR                  0x010801
+#define E_ITS_MAPC_PROCNUM_OOR                 0x010902
+#define E_ITS_MAPC_COLLECTION_OOR              0x010903
+#define E_ITS_MAPTI_UNMAPPED_DEVICE            0x010a04
+#define E_ITS_MAPTI_PHYSICALID_OOR             0x010a06
+#define E_ITS_INV_UNMAPPED_INTERRUPT           0x010c07
+#define E_ITS_INVALL_UNMAPPED_COLLECTION       0x010d09
+#define E_ITS_MOVALL_PROCNUM_OOR               0x010e01
+#define E_ITS_DISCARD_UNMAPPED_INTERRUPT       0x010f07
+
 /*
  * CPU interface registers
  */
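
Worked expansion of the new parameterized form, for illustration:

    /* GICR_PROPBASER_RaWt
     *   = GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
     *   = GIC_BASER_CACHE_RaWt << GICR_PROPBASER_INNER_CACHEABILITY_SHIFT
     *   = 2ULL << 7
     * i.e. the same value as the old (2U << 7), but the GIC_BASER_* building
     * blocks are now shared by GICR_PROPBASER, GICR_PENDBASER, GITS_CBASER
     * and GITS_BASER, and the outer-cacheability fields gain masks as well.
     */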
index b33c779..15ec117 100644 (file)
@@ -3,6 +3,21 @@
 
 #include <generated/autoconf.h>
 
+#define __ARG_PLACEHOLDER_1 0,
+#define __take_second_arg(__ignored, val, ...) val
+
+/*
+ * The use of "&&" / "||" is limited in certain expressions.
+ * The following enable calculating "and" / "or" with macro expansion only.
+ */
+#define __and(x, y)                    ___and(x, y)
+#define ___and(x, y)                   ____and(__ARG_PLACEHOLDER_##x, y)
+#define ____and(arg1_or_junk, y)       __take_second_arg(arg1_or_junk y, 0)
+
+#define __or(x, y)                     ___or(x, y)
+#define ___or(x, y)                    ____or(__ARG_PLACEHOLDER_##x, y)
+#define ____or(arg1_or_junk, y)                __take_second_arg(arg1_or_junk 1, y)
+
 /*
  * Helper macros to use CONFIG_ options in C/CPP expressions. Note that
  * these only work with boolean and tristate options.
  * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when
  * the last step cherry picks the 2nd arg, we get a zero.
  */
-#define __ARG_PLACEHOLDER_1 0,
-#define config_enabled(cfg) _config_enabled(cfg)
-#define _config_enabled(value) __config_enabled(__ARG_PLACEHOLDER_##value)
-#define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0)
-#define ___config_enabled(__ignored, val, ...) val
+#define config_enabled(cfg)            ___is_defined(cfg)
+#define __is_defined(x)                        ___is_defined(x)
+#define ___is_defined(val)             ____is_defined(__ARG_PLACEHOLDER_##val)
+#define ____is_defined(arg1_or_junk)   __take_second_arg(arg1_or_junk 1, 0)
 
 /*
  * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0
  * This is similar to IS_ENABLED(), but returns false when invoked from
  * built-in code when CONFIG_FOO is set to 'm'.
  */
-#define IS_REACHABLE(option) (config_enabled(option) || \
-                (config_enabled(option##_MODULE) && config_enabled(MODULE)))
+#define IS_REACHABLE(option) __or(IS_BUILTIN(option), \
+                               __and(IS_MODULE(option), __is_defined(MODULE)))
 
 /*
  * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm',
  * 0 otherwise.
  */
-#define IS_ENABLED(option) \
-       (IS_BUILTIN(option) || IS_MODULE(option))
+#define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option))
 
 #endif /* __LINUX_KCONFIG_H */
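
A worked expansion of the new helpers, for a hypothetical CONFIG_FOO=m:

    /* IS_ENABLED(CONFIG_FOO)
     *   -> __or(IS_BUILTIN(CONFIG_FOO), IS_MODULE(CONFIG_FOO))
     *   -> __or(0, 1)
     *   -> ____or(__ARG_PLACEHOLDER_0, 1)
     * __ARG_PLACEHOLDER_0 is not a defined macro, so __take_second_arg()
     * sees ("__ARG_PLACEHOLDER_0 1", 1) and picks the second argument: 1.
     * Had the first operand been 1, __ARG_PLACEHOLDER_1 would expand to
     * "0," and shift an extra argument in, again yielding 1.
     */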
index 1c9c973..aafd702 100644 (file)
@@ -164,6 +164,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev);
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                                        gpa_t addr);
 
 #ifdef CONFIG_KVM_ASYNC_PF
 struct kvm_async_pf {
@@ -371,7 +373,15 @@ struct kvm {
        struct srcu_struct srcu;
        struct srcu_struct irq_srcu;
        struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+
+       /*
+        * created_vcpus is protected by kvm->lock, and is incremented
+        * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
+        * incremented after storing the kvm_vcpu pointer in vcpus,
+        * and is accessed atomically.
+        */
        atomic_t online_vcpus;
+       int created_vcpus;
        int last_boosted_vcpu;
        struct list_head vm_list;
        struct mutex lock;
@@ -867,45 +877,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 }
 #endif
 
-/* must be called with irqs disabled */
-static inline void __kvm_guest_enter(void)
-{
-       guest_enter();
-       /* KVM does not hold any references to rcu protected data when it
-        * switches CPU into a guest mode. In fact switching to a guest mode
-        * is very similar to exiting to userspace from rcu point of view. In
-        * addition CPU may stay in a guest mode for quite a long time (up to
-        * one time slice). Lets treat guest mode as quiescent state, just like
-        * we do with user-mode execution.
-        */
-       if (!context_tracking_cpu_is_enabled())
-               rcu_virt_note_context_switch(smp_processor_id());
-}
-
-/* must be called with irqs disabled */
-static inline void __kvm_guest_exit(void)
-{
-       guest_exit();
-}
-
-static inline void kvm_guest_enter(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __kvm_guest_enter();
-       local_irq_restore(flags);
-}
-
-static inline void kvm_guest_exit(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __kvm_guest_exit();
-       local_irq_restore(flags);
-}
-
 /*
  * search_memslots() and __gfn_to_memslot() are here because they are
  * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
@@ -1042,7 +1013,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        const struct kvm_irq_routing_entry *entries,
                        unsigned nr,
                        unsigned flags);
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue);
 void kvm_free_irq_routing(struct kvm *kvm);
 
@@ -1097,12 +1069,6 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
-#else
-static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
-#endif
-
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 {
        /*
index fbe8e16..8dd6e01 100644 (file)
@@ -783,6 +783,7 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv)
  * NAND Flash Manufacturer ID Codes
  */
 #define NAND_MFR_TOSHIBA       0x98
+#define NAND_MFR_ESMT          0xc8
 #define NAND_MFR_SAMSUNG       0xec
 #define NAND_MFR_FUJITSU       0x04
 #define NAND_MFR_NATIONAL      0x8f
index 7f041bd..c425c7b 100644 (file)
@@ -173,10 +173,10 @@ struct spi_nor {
        int (*read_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len);
        int (*write_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len);
 
-       int (*read)(struct spi_nor *nor, loff_t from,
-                       size_t len, size_t *retlen, u_char *read_buf);
-       void (*write)(struct spi_nor *nor, loff_t to,
-                       size_t len, size_t *retlen, const u_char *write_buf);
+       ssize_t (*read)(struct spi_nor *nor, loff_t from,
+                       size_t len, u_char *read_buf);
+       ssize_t (*write)(struct spi_nor *nor, loff_t to,
+                       size_t len, const u_char *write_buf);
        int (*erase)(struct spi_nor *nor, loff_t offs);
 
        int (*flash_lock)(struct spi_nor *nor, loff_t ofs, uint64_t len);
index 8b5e0a9..610e132 100644 (file)
@@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
        return ret;
 }
 
+static inline int page_ref_inc_return(struct page *page)
+{
+       int ret = atomic_inc_return(&page->_refcount);
+
+       if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
+               __page_ref_mod_and_return(page, 1, ret);
+       return ret;
+}
+
 static inline int page_ref_dec_and_test(struct page *page)
 {
        int ret = atomic_dec_and_test(&page->_refcount);
index 89ab057..7d63a66 100644 (file)
@@ -24,6 +24,8 @@ static inline acpi_status pci_acpi_remove_pm_notifier(struct acpi_device *dev)
 }
 extern phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle);
 
+extern phys_addr_t pci_mcfg_lookup(u16 domain, struct resource *bus_res);
+
 static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
 {
        struct pci_bus *pbus = pdev->bus;
diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h
new file mode 100644 (file)
index 0000000..7adad20
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+#ifndef DRIVERS_PCI_ECAM_H
+#define DRIVERS_PCI_ECAM_H
+
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+
+/*
+ * struct to hold pci ops and bus shift of the config window
+ * for a PCI controller.
+ */
+struct pci_config_window;
+struct pci_ecam_ops {
+       unsigned int                    bus_shift;
+       struct pci_ops                  pci_ops;
+       int                             (*init)(struct pci_config_window *);
+};
+
+/*
+ * struct to hold the mappings of a config space window. This
+ * is expected to be used as sysdata for PCI controllers that
+ * use ECAM.
+ */
+struct pci_config_window {
+       struct resource                 res;
+       struct resource                 busr;
+       void                            *priv;
+       struct pci_ecam_ops             *ops;
+       union {
+               void __iomem            *win;   /* 64-bit single mapping */
+               void __iomem            **winp; /* 32-bit per-bus mapping */
+       };
+       struct device                   *parent;/* ECAM res was from this dev */
+};
+
+/* create and free pci_config_window */
+struct pci_config_window *pci_ecam_create(struct device *dev,
+               struct resource *cfgres, struct resource *busr,
+               struct pci_ecam_ops *ops);
+void pci_ecam_free(struct pci_config_window *cfg);
+
+/* map_bus when ->sysdata is an instance of pci_config_window */
+void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn,
+                              int where);
+/* default ECAM ops */
+extern struct pci_ecam_ops pci_generic_ecam_ops;
+
+#ifdef CONFIG_PCI_HOST_GENERIC
+/* for DT-based PCI controllers that support ECAM */
+int pci_host_common_probe(struct platform_device *pdev,
+                         struct pci_ecam_ops *ops);
+#endif
+#endif
index c40ac91..2599a98 100644 (file)
@@ -101,6 +101,10 @@ enum {
        DEVICE_COUNT_RESOURCE = PCI_NUM_RESOURCES,
 };
 
+/*
+ * pci_power_t values must match the bits in the Capabilities PME_Support
+ * and Control/Status PowerState fields in the Power Management capability.
+ */
 typedef int __bitwise pci_power_t;
 
 #define PCI_D0         ((pci_power_t __force) 0)
@@ -116,7 +120,7 @@ extern const char *pci_power_names[];
 
 static inline const char *pci_power_name(pci_power_t state)
 {
-       return pci_power_names[1 + (int) state];
+       return pci_power_names[1 + (__force int) state];
 }
 
 #define PCI_PM_D2_DELAY                200
@@ -294,6 +298,7 @@ struct pci_dev {
        unsigned int    d2_support:1;   /* Low power state D2 is supported */
        unsigned int    no_d1d2:1;      /* D1 and D2 are forbidden */
        unsigned int    no_d3cold:1;    /* D3cold is forbidden */
+       unsigned int    bridge_d3:1;    /* Allow D3 for bridge */
        unsigned int    d3cold_allowed:1;       /* D3cold is allowed by user */
        unsigned int    mmio_always_on:1;       /* disallow turning off io/mem
                                                   decoding during bar sizing */
@@ -320,6 +325,7 @@ struct pci_dev {
         * directly, use the values stored here. They might be different!
         */
        unsigned int    irq;
+       struct cpumask  *irq_affinity;
        struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
 
        bool match_driver;              /* Skip attaching driver */
@@ -1084,6 +1090,8 @@ int pci_back_from_sleep(struct pci_dev *dev);
 bool pci_dev_run_wake(struct pci_dev *dev);
 bool pci_check_pme_status(struct pci_dev *dev);
 void pci_pme_wakeup_bus(struct pci_bus *bus);
+void pci_d3cold_enable(struct pci_dev *dev);
+void pci_d3cold_disable(struct pci_dev *dev);
 
 static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state,
                                  bool enable)
@@ -1115,6 +1123,7 @@ int pci_set_vpd_size(struct pci_dev *dev, size_t len);
 /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
 resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
 void pci_bus_assign_resources(const struct pci_bus *bus);
+void pci_bus_claim_resources(struct pci_bus *bus);
 void pci_bus_size_bridges(struct pci_bus *bus);
 int pci_claim_resource(struct pci_dev *, int);
 int pci_claim_bridge_resource(struct pci_dev *bridge, int i);
@@ -1144,9 +1153,12 @@ void pci_add_resource(struct list_head *resources, struct resource *res);
 void pci_add_resource_offset(struct list_head *resources, struct resource *res,
                             resource_size_t offset);
 void pci_free_resource_list(struct list_head *resources);
-void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, unsigned int flags);
+void pci_bus_add_resource(struct pci_bus *bus, struct resource *res,
+                         unsigned int flags);
 struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n);
 void pci_bus_remove_resources(struct pci_bus *bus);
+int devm_request_pci_bus_resources(struct device *dev,
+                                  struct list_head *resources);
 
 #define pci_bus_for_each_resource(bus, res, i)                         \
        for (i = 0;                                                     \
@@ -1168,6 +1180,7 @@ int pci_register_io_range(phys_addr_t addr, resource_size_t size);
 unsigned long pci_address_to_pio(phys_addr_t addr);
 phys_addr_t pci_pio_to_address(unsigned long pio);
 int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr);
+void pci_unmap_iospace(struct resource *res);
 
 static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar)
 {
@@ -1238,6 +1251,11 @@ resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno);
 int pci_set_vga_state(struct pci_dev *pdev, bool decode,
                      unsigned int command_bits, u32 flags);
 
+#define PCI_IRQ_NOLEGACY       (1 << 0) /* don't use legacy interrupts */
+#define PCI_IRQ_NOMSI          (1 << 1) /* don't use MSI interrupts */
+#define PCI_IRQ_NOMSIX         (1 << 2) /* don't use MSI-X interrupts */
+#define PCI_IRQ_NOAFFINITY     (1 << 3) /* don't auto-assign affinity */
+
 /* kmem_cache style wrapper around pci_alloc_consistent() */
 
 #include <linux/pci-dma.h>
@@ -1285,6 +1303,11 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev,
                return rc;
        return 0;
 }
+int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+               unsigned int max_vecs, unsigned int flags);
+void pci_free_irq_vectors(struct pci_dev *dev);
+int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
+
 #else
 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
 static inline void pci_msi_shutdown(struct pci_dev *dev) { }
@@ -1308,6 +1331,24 @@ static inline int pci_enable_msix_range(struct pci_dev *dev,
 static inline int pci_enable_msix_exact(struct pci_dev *dev,
                      struct msix_entry *entries, int nvec)
 { return -ENOSYS; }
+static inline int pci_alloc_irq_vectors(struct pci_dev *dev,
+               unsigned int min_vecs, unsigned int max_vecs,
+               unsigned int flags)
+{
+       if (min_vecs > 1)
+               return -EINVAL;
+       return 1;
+}
+static inline void pci_free_irq_vectors(struct pci_dev *dev)
+{
+}
+
+static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
+{
+       if (WARN_ON_ONCE(nr > 0))
+               return -EINVAL;
+       return dev->irq;
+}
 #endif
 
 #ifdef CONFIG_PCIEPORTBUS
@@ -1390,12 +1431,13 @@ static inline int pci_domain_nr(struct pci_bus *bus)
 {
        return bus->domain_nr;
 }
-void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent);
+#ifdef CONFIG_ACPI
+int acpi_pci_bus_find_domain_nr(struct pci_bus *bus);
 #else
-static inline void pci_bus_assign_domain_nr(struct pci_bus *bus,
-                                       struct device *parent)
-{
-}
+static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus)
+{ return 0; }
+#endif
+int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent);
 #endif
 
 /* some architectures require additional setup to direct VGA traffic */
@@ -1403,6 +1445,34 @@ typedef int (*arch_set_vga_state_t)(struct pci_dev *pdev, bool decode,
                      unsigned int command_bits, u32 flags);
 void pci_register_set_vga_state(arch_set_vga_state_t func);
 
+static inline int
+pci_request_io_regions(struct pci_dev *pdev, const char *name)
+{
+       return pci_request_selected_regions(pdev,
+                           pci_select_bars(pdev, IORESOURCE_IO), name);
+}
+
+static inline void
+pci_release_io_regions(struct pci_dev *pdev)
+{
+       return pci_release_selected_regions(pdev,
+                           pci_select_bars(pdev, IORESOURCE_IO));
+}
+
+static inline int
+pci_request_mem_regions(struct pci_dev *pdev, const char *name)
+{
+       return pci_request_selected_regions(pdev,
+                           pci_select_bars(pdev, IORESOURCE_MEM), name);
+}
+
+static inline void
+pci_release_mem_regions(struct pci_dev *pdev)
+{
+       return pci_release_selected_regions(pdev,
+                           pci_select_bars(pdev, IORESOURCE_MEM));
+}
+
 #else /* CONFIG_PCI is not enabled */
 
 static inline void pci_set_flags(int flags) { }
@@ -1555,7 +1625,11 @@ static inline const char *pci_name(const struct pci_dev *pdev)
 /* Some archs don't want to expose struct resource to userland as-is
  * in sysfs and /proc
  */
-#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER
+#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER
+void pci_resource_to_user(const struct pci_dev *dev, int bar,
+                         const struct resource *rsrc,
+                         resource_size_t *start, resource_size_t *end);
+#else
 static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
                const struct resource *rsrc, resource_size_t *start,
                resource_size_t *end)
@@ -1707,6 +1781,7 @@ extern u8 pci_cache_line_size;
 
 extern unsigned long pci_hotplug_io_size;
 extern unsigned long pci_hotplug_mem_size;
+extern unsigned long pci_hotplug_bus_size;
 
 /* Architecture-specific versions may override these (weak) */
 void pcibios_disable_device(struct pci_dev *dev);
@@ -1723,7 +1798,7 @@ void pcibios_free_irq(struct pci_dev *dev);
 extern struct dev_pm_ops pcibios_pm_ops;
 #endif
 
-#ifdef CONFIG_PCI_MMCONFIG
+#if defined(CONFIG_PCI_MMCONFIG) || defined(CONFIG_ACPI_MCFG)
 void __init pci_mmcfg_early_init(void);
 void __init pci_mmcfg_late_init(void);
 #else
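
Among the pci.h additions above is the pci_alloc_irq_vectors() / pci_irq_vector() / pci_free_irq_vectors() trio. A hedged sketch of the intended driver usage (struct mydev and mydev_irq are invented for the example):

    static int mydev_setup_irqs(struct pci_dev *pdev, struct mydev *md)
    {
            int nvec, i, ret;

            /* between 1 and 4 vectors; flags == 0 allows MSI-X, MSI or legacy */
            nvec = pci_alloc_irq_vectors(pdev, 1, 4, 0);
            if (nvec < 0)
                    return nvec;

            for (i = 0; i < nvec; i++) {
                    ret = request_irq(pci_irq_vector(pdev, i), mydev_irq,
                                      0, "mydev", md);
                    if (ret)
                            goto err;
            }
            return 0;

    err:
            while (--i >= 0)
                    free_irq(pci_irq_vector(pdev, i), md);
            pci_free_irq_vectors(pdev);
            return ret;
    }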
index f28292d..8ade3eb 100644 (file)
@@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq,
                __entry->data           = data;
        ),
 
-       TP_printk("dst %u vec %u (%s|%s|%s%s)",
-                 (u8)(__entry->address >> 12), (u8)__entry->data,
+       TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+                 (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+                 (u8)__entry->data,
                  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->address & (1<<2)) ? "logical" : "physical",
                  (__entry->data & (1<<15)) ? "level" : "edge",
index 05ebf47..e98bb4c 100644 (file)
@@ -866,6 +866,10 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_ARM_PMU_V3 126
 #define KVM_CAP_VCPU_ATTRIBUTES 127
 #define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_X2APIC_API 129
+#define KVM_CAP_S390_USER_INSTR0 130
+#define KVM_CAP_MSI_DEVID 131
+#define KVM_CAP_PPC_HTM 132
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1024,12 +1028,14 @@ struct kvm_one_reg {
        __u64 addr;
 };
 
+#define KVM_MSI_VALID_DEVID    (1U << 0)
 struct kvm_msi {
        __u32 address_lo;
        __u32 address_hi;
        __u32 data;
        __u32 flags;
-       __u8  pad[16];
+       __u32 devid;
+       __u8  pad[12];
 };
 
 struct kvm_arm_device_addr {
@@ -1074,6 +1080,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_FLIC              KVM_DEV_TYPE_FLIC
        KVM_DEV_TYPE_ARM_VGIC_V3,
 #define KVM_DEV_TYPE_ARM_VGIC_V3       KVM_DEV_TYPE_ARM_VGIC_V3
+       KVM_DEV_TYPE_ARM_VGIC_ITS,
+#define KVM_DEV_TYPE_ARM_VGIC_ITS      KVM_DEV_TYPE_ARM_VGIC_ITS
        KVM_DEV_TYPE_MAX,
 };
 
@@ -1313,4 +1321,7 @@ struct kvm_assigned_msix_entry {
        __u16 padding[3];
 };
 
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
+
 #endif /* __LINUX_KVM_H */
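
For the kvm_msi extension above, a userspace sketch of injecting an MSI with a device ID, as a GICv3 ITS needs (vm_fd, doorbell, event_id and device_id are placeholders, and the VMM is assumed to have checked KVM_CAP_MSI_DEVID first):

    struct kvm_msi msi = {
            .address_lo = (__u32)doorbell,
            .address_hi = (__u32)(doorbell >> 32),
            .data       = event_id,
            .flags      = KVM_MSI_VALID_DEVID,
            .devid      = device_id,
    };

    if (ioctl(vm_fd, KVM_SIGNAL_MSI, &msi) < 0)
            perror("KVM_SIGNAL_MSI");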
index cc02f28..2307d7c 100644 (file)
@@ -709,6 +709,8 @@ config KCOV
        bool "Code coverage for fuzzing"
        depends on ARCH_HAS_KCOV
        select DEBUG_FS
+       select GCC_PLUGINS if !COMPILE_TEST
+       select GCC_PLUGIN_SANCOV if !COMPILE_TEST
        help
          KCOV exposes kernel code coverage information in a form suitable
          for coverage-guided fuzzing (randomized testing).
index 547741f..96b2b2f 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -723,6 +723,7 @@ retry:
        }
        return 0;
 }
+EXPORT_SYMBOL_GPL(fixup_user_fault);
 
 static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
                                                struct mm_struct *mm,
index 958d985..84cbed6 100644 (file)
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
        crypto.o armor.o \
        auth_x.o \
        ceph_fs.o ceph_strings.o ceph_hash.o \
-       pagevec.o snapshot.o
+       pagevec.o snapshot.o string_table.o
 
index 55d2bfe..bddfcf6 100644 (file)
@@ -747,6 +747,8 @@ out:
 static void __exit exit_ceph_lib(void)
 {
        dout("exit_ceph_lib\n");
+       WARN_ON(!ceph_strings_empty());
+
        ceph_osdc_cleanup();
        ceph_msgr_exit();
        ceph_crypto_shutdown();
index 41466cc..7d54e94 100644 (file)
@@ -9,9 +9,9 @@
  */
 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
 {
-       __u32 su = le32_to_cpu(layout->fl_stripe_unit);
-       __u32 sc = le32_to_cpu(layout->fl_stripe_count);
-       __u32 os = le32_to_cpu(layout->fl_object_size);
+       __u32 su = layout->stripe_unit;
+       __u32 sc = layout->stripe_count;
+       __u32 os = layout->object_size;
 
        /* stripe unit, object size must be non-zero, 64k increment */
        if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
@@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
        return 1;
 }
 
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+                                 struct ceph_file_layout_legacy *legacy)
+{
+       fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+       fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+       fl->object_size = le32_to_cpu(legacy->fl_object_size);
+       fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
+       if (fl->pool_id == 0)
+               fl->pool_id = -1;
+}
+EXPORT_SYMBOL(ceph_file_layout_from_legacy);
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+                               struct ceph_file_layout_legacy *legacy)
+{
+       legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+       legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+       legacy->fl_object_size = cpu_to_le32(fl->object_size);
+       if (fl->pool_id >= 0)
+               legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
+       else
+               legacy->fl_pg_pool = 0;
+}
+EXPORT_SYMBOL(ceph_file_layout_to_legacy);
 
 int ceph_flags_to_mode(int flags)
 {
index e77b04c..c62b2b0 100644 (file)
@@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
        seq_printf(s, "]/%d\t[", t->up.primary);
        for (i = 0; i < t->acting.size; i++)
                seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
-       seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
-                  t->target_oid.name_len, t->target_oid.name, t->flags);
+       seq_printf(s, "]/%d\t", t->acting.primary);
+       if (t->target_oloc.pool_ns) {
+               seq_printf(s, "%*pE/%*pE\t0x%x",
+                       (int)t->target_oloc.pool_ns->len,
+                       t->target_oloc.pool_ns->str,
+                       t->target_oid.name_len, t->target_oid.name, t->flags);
+       } else {
+               seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len,
+                       t->target_oid.name, t->flags);
+       }
        if (t->paused)
                seq_puts(s, "\tP");
 }
index 37c38a7..c83326c 100644 (file)
@@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
 }
 
 const char *ceph_sub_str[] = {
-       [CEPH_SUB_MDSMAP] = "mdsmap",
        [CEPH_SUB_MONMAP] = "monmap",
        [CEPH_SUB_OSDMAP] = "osdmap",
+       [CEPH_SUB_FSMAP]  = "fsmap.user",
+       [CEPH_SUB_MDSMAP] = "mdsmap",
 };
 
 /*
@@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
        case CEPH_MSG_MON_MAP:
        case CEPH_MSG_MDS_MAP:
        case CEPH_MSG_OSD_MAP:
+       case CEPH_MSG_FS_MAP_USER:
                m = ceph_msg_new(type, front_len, GFP_NOFS, false);
                if (!m)
                        return NULL;    /* ENOMEM--return skip == 0 */
index ddec1c1..aaed59a 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 
+#include <linux/ceph/messenger.h>
 #include <linux/ceph/msgpool.h>
 
 static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
index 8946959..b5ec096 100644 (file)
@@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest,
 static void target_destroy(struct ceph_osd_request_target *t)
 {
        ceph_oid_destroy(&t->base_oid);
+       ceph_oloc_destroy(&t->base_oloc);
        ceph_oid_destroy(&t->target_oid);
+       ceph_oloc_destroy(&t->target_oloc);
 }
 
 /*
@@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 }
 EXPORT_SYMBOL(ceph_osdc_alloc_request);
 
+static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc)
+{
+       return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
+}
+
 int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
 {
        struct ceph_osd_client *osdc = req->r_osdc;
@@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
        int msg_size;
 
        WARN_ON(ceph_oid_empty(&req->r_base_oid));
+       WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
 
        /* create request message */
        msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
        msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
-       msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+       msg_size += CEPH_ENCODING_START_BLK_LEN +
+                       ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
        msg_size += 1 + 8 + 4 + 4; /* pgid */
        msg_size += 4 + req->r_base_oid.name_len; /* oid */
        msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
@@ -932,7 +941,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
        if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
                osd_req_op_init(req, which, opcode, 0);
        } else {
-               u32 object_size = le32_to_cpu(layout->fl_object_size);
+               u32 object_size = layout->object_size;
                u32 object_base = off - objoff;
                if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
                        if (truncate_size <= object_base) {
@@ -948,7 +957,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
        }
 
        req->r_flags = flags;
-       req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+       req->r_base_oloc.pool = layout->pool_id;
+       req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
        ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
 
        req->r_snapid = vino.snap;
@@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
        p += sizeof(req->r_replay_version);
 
        /* oloc */
-       ceph_encode_8(&p, 4);
-       ceph_encode_8(&p, 4);
-       ceph_encode_32(&p, 8 + 4 + 4);
+       ceph_start_encoding(&p, 5, 4,
+                           ceph_oloc_encoding_size(&req->r_t.target_oloc));
        ceph_encode_64(&p, req->r_t.target_oloc.pool);
        ceph_encode_32(&p, -1); /* preferred */
        ceph_encode_32(&p, 0); /* key len */
+       if (req->r_t.target_oloc.pool_ns)
+               ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str,
+                                  req->r_t.target_oloc.pool_ns->len);
+       else
+               ceph_encode_32(&p, 0);
 
        /* pgid */
        ceph_encode_8(&p, 1);
@@ -2594,9 +2608,22 @@ static int ceph_oloc_decode(void **p, void *end,
        }
 
        if (struct_v >= 5) {
+               bool changed = false;
+
                len = ceph_decode_32(p);
                if (len > 0) {
-                       pr_warn("ceph_object_locator::nspace is set\n");
+                       ceph_decode_need(p, end, len, e_inval);
+                       if (!oloc->pool_ns ||
+                           ceph_compare_string(oloc->pool_ns, *p, len))
+                               changed = true;
+                       *p += len;
+               } else {
+                       if (oloc->pool_ns)
+                               changed = true;
+               }
+               if (changed) {
+                       /* redirect changes namespace */
+                       pr_warn("ceph_object_locator::nspace is changed\n");
                        goto e_inval;
                }
        }
@@ -2806,7 +2833,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
                goto out_unlock_session;
        }
 
+       m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
        ret = decode_MOSDOpReply(msg, &m);
+       m.redirect.oloc.pool_ns = NULL;
        if (ret) {
                pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
                       req->r_tid, ret);
@@ -2835,7 +2864,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
                unlink_request(osd, req);
                mutex_unlock(&osd->lock);
 
-               ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+               /*
+                * Not ceph_oloc_copy() - changing pool_ns is not
+                * supported.
+                */
+               req->r_t.target_oloc.pool = m.redirect.oloc.pool;
                req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
                req->r_tid = 0;
                __submit_request(req, false);
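
For the message sizing above, the versioned oloc encoding is CEPH_ENCODING_START_BLK_LEN (assumed here to be 1 + 1 + 4 = 6 bytes for struct_v, struct_compat and struct_len) plus ceph_oloc_encoding_size(), i.e. 8 (pool) + 4 (preferred) + 4 (key length) + 4 (namespace length) + the namespace bytes. A 3-byte namespace such as "ns1" therefore needs 6 + 20 + 3 = 29 bytes, against the fixed 2 + 20 = 22 bytes of the old encoding.
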
index 7e480bf..d243688 100644 (file)
@@ -1510,6 +1510,24 @@ bad:
        return ERR_PTR(err);
 }
 
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+                   const struct ceph_object_locator *src)
+{
+       WARN_ON(!ceph_oloc_empty(dest));
+       WARN_ON(dest->pool_ns); /* empty() only covers ->pool */
+
+       dest->pool = src->pool;
+       if (src->pool_ns)
+               dest->pool_ns = ceph_get_string(src->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_copy);
+
+void ceph_oloc_destroy(struct ceph_object_locator *oloc)
+{
+       ceph_put_string(oloc->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_destroy);
+
 void ceph_oid_copy(struct ceph_object_id *dest,
                   const struct ceph_object_id *src)
 {
@@ -1770,9 +1788,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
                                   u64 *ono,
                                   u64 *oxoff, u64 *oxlen)
 {
-       u32 osize = le32_to_cpu(layout->fl_object_size);
-       u32 su = le32_to_cpu(layout->fl_stripe_unit);
-       u32 sc = le32_to_cpu(layout->fl_stripe_count);
+       u32 osize = layout->object_size;
+       u32 su = layout->stripe_unit;
+       u32 sc = layout->stripe_count;
        u32 bl, stripeno, stripepos, objsetno;
        u32 su_per_object;
        u64 t, su_offset;
@@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
        if (!pi)
                return -ENOENT;
 
-       raw_pgid->pool = oloc->pool;
-       raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
-                                      oid->name_len);
-
-       dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
-            raw_pgid->pool, raw_pgid->seed);
+       if (!oloc->pool_ns) {
+               raw_pgid->pool = oloc->pool;
+               raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+                                            oid->name_len);
+               dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+                    raw_pgid->pool, raw_pgid->seed);
+       } else {
+               char stack_buf[256];
+               char *buf = stack_buf;
+               int nsl = oloc->pool_ns->len;
+               size_t total = nsl + 1 + oid->name_len;
+
+               if (total > sizeof(stack_buf)) {
+                       buf = kmalloc(total, GFP_NOIO);
+                       if (!buf)
+                               return -ENOMEM;
+               }
+               memcpy(buf, oloc->pool_ns->str, nsl);
+               buf[nsl] = '\037';
+               memcpy(buf + nsl + 1, oid->name, oid->name_len);
+               raw_pgid->pool = oloc->pool;
+               raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
+               if (buf != stack_buf)
+                       kfree(buf);
+               dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
+                    oid->name, nsl, oloc->pool_ns->str,
+                    raw_pgid->pool, raw_pgid->seed);
+       }
        return 0;
 }
 EXPORT_SYMBOL(ceph_object_locator_to_pg);
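
As a worked example of the hash input built above: with pool_ns "ns1" (3 bytes) and object name "100.00000000" (12 bytes), the buffer is the 16-byte string "ns1\037100.00000000" (0x1f, the ASCII unit separator, sits between namespace and name) and the whole buffer is fed to ceph_str_hash(). Without a namespace only the object name is hashed, so placement of existing, non-namespaced objects is unchanged.
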
diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
new file mode 100644 (file)
index 0000000..ca53c83
--- /dev/null
@@ -0,0 +1,111 @@
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/ceph/string_table.h>
+
+static DEFINE_SPINLOCK(string_tree_lock);
+static struct rb_root string_tree = RB_ROOT;
+
+struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
+{
+       struct ceph_string *cs, *exist;
+       struct rb_node **p, *parent;
+       int ret;
+
+       exist = NULL;
+       spin_lock(&string_tree_lock);
+       p = &string_tree.rb_node;
+       while (*p) {
+               exist = rb_entry(*p, struct ceph_string, node);
+               ret = ceph_compare_string(exist, str, len);
+               if (ret > 0)
+                       p = &(*p)->rb_left;
+               else if (ret < 0)
+                       p = &(*p)->rb_right;
+               else
+                       break;
+               exist = NULL;
+       }
+       if (exist && !kref_get_unless_zero(&exist->kref)) {
+               rb_erase(&exist->node, &string_tree);
+               RB_CLEAR_NODE(&exist->node);
+               exist = NULL;
+       }
+       spin_unlock(&string_tree_lock);
+       if (exist)
+               return exist;
+
+       cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
+       if (!cs)
+               return NULL;
+
+       kref_init(&cs->kref);
+       cs->len = len;
+       memcpy(cs->str, str, len);
+       cs->str[len] = 0;
+
+retry:
+       exist = NULL;
+       parent = NULL;
+       p = &string_tree.rb_node;
+       spin_lock(&string_tree_lock);
+       while (*p) {
+               parent = *p;
+               exist = rb_entry(*p, struct ceph_string, node);
+               ret = ceph_compare_string(exist, str, len);
+               if (ret > 0)
+                       p = &(*p)->rb_left;
+               else if (ret < 0)
+                       p = &(*p)->rb_right;
+               else
+                       break;
+               exist = NULL;
+       }
+       ret = 0;
+       if (!exist) {
+               rb_link_node(&cs->node, parent, p);
+               rb_insert_color(&cs->node, &string_tree);
+       } else if (!kref_get_unless_zero(&exist->kref)) {
+               rb_erase(&exist->node, &string_tree);
+               RB_CLEAR_NODE(&exist->node);
+               ret = -EAGAIN;
+       }
+       spin_unlock(&string_tree_lock);
+       if (ret == -EAGAIN)
+               goto retry;
+
+       if (exist) {
+               kfree(cs);
+               cs = exist;
+       }
+
+       return cs;
+}
+EXPORT_SYMBOL(ceph_find_or_create_string);
+
+static void ceph_free_string(struct rcu_head *head)
+{
+       struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
+       kfree(cs);
+}
+
+void ceph_release_string(struct kref *ref)
+{
+       struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
+
+       spin_lock(&string_tree_lock);
+       if (!RB_EMPTY_NODE(&cs->node)) {
+               rb_erase(&cs->node, &string_tree);
+               RB_CLEAR_NODE(&cs->node);
+       }
+       spin_unlock(&string_tree_lock);
+
+       call_rcu(&cs->rcu, ceph_free_string);
+}
+EXPORT_SYMBOL(ceph_release_string);
+
+bool ceph_strings_empty(void)
+{
+       return RB_EMPTY_ROOT(&string_tree);
+}
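
A minimal usage sketch for the new table (demo_use_string_table() is a made-up name; the matching ceph_get_string()/ceph_put_string() kref helpers are assumed to come from the accompanying string_table.h header):

	static int demo_use_string_table(void)
	{
		struct ceph_string *ns;

		ns = ceph_find_or_create_string("ns1", 3);  /* reference held */
		if (!ns)
			return -ENOMEM;

		/* share via ceph_get_string(ns); read ns->str and ns->len */

		ceph_put_string(ns);  /* final put -> ceph_release_string() -> RCU kfree */
		return 0;
	}
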
index 0f82314..15b196f 100644 (file)
@@ -202,7 +202,7 @@ hdr-inst := -f $(srctree)/scripts/Makefile.headersinst obj
 # Prefix -I with $(srctree) if it is not an absolute path.
 # skip if -I has no parameter
 addtree = $(if $(patsubst -I%,%,$(1)), \
-$(if $(filter-out -I/%,$(1)),$(patsubst -I%,-I$(srctree)/%,$(1))) $(1))
+$(if $(filter-out -I/% -I./% -I../%,$(1)),$(patsubst -I%,-I$(srctree)/%,$(1)),$(1)))
 
 # Find all -I options and call addtree
 flags = $(foreach o,$($(1)),$(if $(filter -I%,$(o)),$(call addtree,$(o)),$(o)))
index 822ab4a..1d80897 100644 (file)
@@ -47,4 +47,4 @@ subdir-$(CONFIG_DTC)         += dtc
 subdir-$(CONFIG_GDB_SCRIPTS) += gdb
 
 # Let clean descend into subdirs
-subdir-        += basic kconfig package
+subdir-        += basic kconfig package gcc-plugins
index 0d1ca5b..11602e5 100644 (file)
@@ -60,7 +60,7 @@ endif
 endif
 
 # Do not include host rules unless needed
-ifneq ($(hostprogs-y)$(hostprogs-m),)
+ifneq ($(hostprogs-y)$(hostprogs-m)$(hostlibs-y)$(hostlibs-m)$(hostcxxlibs-y)$(hostcxxlibs-m),)
 include scripts/Makefile.host
 endif
 
index 55c96cb..50616ea 100644 (file)
@@ -38,7 +38,9 @@ subdir-ymn    := $(addprefix $(obj)/,$(subdir-ymn))
 __clean-files  := $(extra-y) $(extra-m) $(extra-)       \
                   $(always) $(targets) $(clean-files)   \
                   $(host-progs)                         \
-                  $(hostprogs-y) $(hostprogs-m) $(hostprogs-)
+                  $(hostprogs-y) $(hostprogs-m) $(hostprogs-) \
+                  $(hostlibs-y) $(hostlibs-m) $(hostlibs-) \
+                  $(hostcxxlibs-y) $(hostcxxlibs-m)
 
 __clean-files   := $(filter-out $(no-clean-files), $(__clean-files))
 
diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
new file mode 100644 (file)
index 0000000..5e22b60
--- /dev/null
@@ -0,0 +1,43 @@
+ifdef CONFIG_GCC_PLUGINS
+  __PLUGINCC := $(call cc-ifversion, -ge, 0408, $(HOSTCXX), $(HOSTCC))
+  PLUGINCC := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-plugin.sh "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)")
+
+  SANCOV_PLUGIN := -fplugin=$(objtree)/scripts/gcc-plugins/sancov_plugin.so
+
+  gcc-plugin-$(CONFIG_GCC_PLUGIN_CYC_COMPLEXITY)       += cyc_complexity_plugin.so
+
+  ifdef CONFIG_GCC_PLUGIN_SANCOV
+    ifeq ($(CFLAGS_KCOV),)
+      # It is needed because of the gcc-plugin.sh and gcc version checks.
+      gcc-plugin-$(CONFIG_GCC_PLUGIN_SANCOV)           += sancov_plugin.so
+
+      ifneq ($(PLUGINCC),)
+        CFLAGS_KCOV := $(SANCOV_PLUGIN)
+      else
+        $(warning warning: cannot use CONFIG_KCOV: -fsanitize-coverage=trace-pc is not supported by compiler)
+      endif
+    endif
+  endif
+
+  GCC_PLUGINS_CFLAGS := $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y))
+
+  export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN SANCOV_PLUGIN
+
+  ifeq ($(PLUGINCC),)
+    ifneq ($(GCC_PLUGINS_CFLAGS),)
+      ifeq ($(call cc-ifversion, -ge, 0405, y), y)
+        PLUGINCC := $(shell $(CONFIG_SHELL) -x $(srctree)/scripts/gcc-plugin.sh "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)")
+        $(warning warning: your gcc installation does not support plugins, perhaps the necessary headers are missing?)
+      else
+        $(warning warning: your gcc version does not support plugins, you should upgrade it to gcc 4.5 at least)
+      endif
+    endif
+  else
+    # SANCOV_PLUGIN may appear only in CFLAGS_KCOV, to avoid duplication.
+    GCC_PLUGINS_CFLAGS := $(filter-out $(SANCOV_PLUGIN), $(GCC_PLUGINS_CFLAGS))
+  endif
+
+  KBUILD_CFLAGS += $(GCC_PLUGINS_CFLAGS)
+  GCC_PLUGIN := $(gcc-plugin-y)
+
+endif
index 133edfa..45b5b1a 100644 (file)
 # Will compile qconf as a C++ program, and menu as a C program.
 # They are linked as C++ code to the executable qconf
 
+# hostcc-option
+# Usage: cflags-y += $(call hostcc-option,-march=winchip-c6,-march=i586)
+
+hostcc-option = $(call try-run,\
+       $(HOSTCC) $(HOSTCFLAGS) $(HOST_EXTRACFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
+
 __hostprogs := $(sort $(hostprogs-y) $(hostprogs-m))
+host-cshlib := $(sort $(hostlibs-y) $(hostlibs-m))
+host-cxxshlib := $(sort $(hostcxxlibs-y) $(hostcxxlibs-m))
 
 # C code
 # Executables compiled from a single .c file
@@ -42,6 +50,10 @@ host-cxxmulti        := $(foreach m,$(__hostprogs),$(if $($(m)-cxxobjs),$(m)))
 # C++ Object (.o) files compiled from .cc files
 host-cxxobjs   := $(sort $(foreach m,$(host-cxxmulti),$($(m)-cxxobjs)))
 
+# Object (.o) files used by the shared libraries
+host-cshobjs   := $(sort $(foreach m,$(host-cshlib),$($(m:.so=-objs))))
+host-cxxshobjs := $(sort $(foreach m,$(host-cxxshlib),$($(m:.so=-objs))))
+
 # output directory for programs/.o files
 # hostprogs-y := tools/build may have been specified.
 # Retrieve also directory of .o files from prog-objs or prog-cxxobjs notation
@@ -56,6 +68,10 @@ host-cmulti  := $(addprefix $(obj)/,$(host-cmulti))
 host-cobjs     := $(addprefix $(obj)/,$(host-cobjs))
 host-cxxmulti  := $(addprefix $(obj)/,$(host-cxxmulti))
 host-cxxobjs   := $(addprefix $(obj)/,$(host-cxxobjs))
+host-cshlib    := $(addprefix $(obj)/,$(host-cshlib))
+host-cxxshlib  := $(addprefix $(obj)/,$(host-cxxshlib))
+host-cshobjs   := $(addprefix $(obj)/,$(host-cshobjs))
+host-cxxshobjs := $(addprefix $(obj)/,$(host-cxxshobjs))
 host-objdirs    := $(addprefix $(obj)/,$(host-objdirs))
 
 obj-dirs += $(host-objdirs)
@@ -124,5 +140,42 @@ quiet_cmd_host-cxxobjs     = HOSTCXX $@
 $(host-cxxobjs): $(obj)/%.o: $(src)/%.cc FORCE
        $(call if_changed_dep,host-cxxobjs)
 
+# Compile .c file, create position independent .o file
+# host-cshobjs -> .o
+quiet_cmd_host-cshobjs = HOSTCC  -fPIC $@
+      cmd_host-cshobjs = $(HOSTCC) $(hostc_flags) -fPIC -c -o $@ $<
+$(host-cshobjs): $(obj)/%.o: $(src)/%.c FORCE
+       $(call if_changed_dep,host-cshobjs)
+
+# Compile .c file, create position independent .o file
+# Note that plugin-capable gcc versions can be either C or C++ based;
+# therefore plugin source files have to be compilable in both C and C++ mode.
+# This is why a C++ compiler is invoked on a .c file.
+# host-cxxshobjs -> .o
+quiet_cmd_host-cxxshobjs       = HOSTCXX -fPIC $@
+      cmd_host-cxxshobjs       = $(HOSTCXX) $(hostcxx_flags) -fPIC -c -o $@ $<
+$(host-cxxshobjs): $(obj)/%.o: $(src)/%.c FORCE
+       $(call if_changed_dep,host-cxxshobjs)
+
+# Link a shared library, based on position independent .o files
+# *.o -> .so shared library (host-cshlib)
+quiet_cmd_host-cshlib  = HOSTLLD -shared $@
+      cmd_host-cshlib  = $(HOSTCC) $(HOSTLDFLAGS) -shared -o $@ \
+                         $(addprefix $(obj)/,$($(@F:.so=-objs))) \
+                         $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F))
+$(host-cshlib): FORCE
+       $(call if_changed,host-cshlib)
+$(call multi_depend, $(host-cshlib), .so, -objs)
+
+# Link a shared library, based on position independent .o files
+# *.o -> .so shared library (host-cxxshlib)
+quiet_cmd_host-cxxshlib        = HOSTLLD -shared $@
+      cmd_host-cxxshlib        = $(HOSTCXX) $(HOSTLDFLAGS) -shared -o $@ \
+                         $(addprefix $(obj)/,$($(@F:.so=-objs))) \
+                         $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F))
+$(host-cxxshlib): FORCE
+       $(call if_changed,host-cxxshlib)
+$(call multi_depend, $(host-cxxshlib), .so, -objs)
+
 targets += $(host-csingle)  $(host-cmulti) $(host-cobjs)\
-          $(host-cxxmulti) $(host-cxxobjs)
+          $(host-cxxmulti) $(host-cxxobjs) $(host-cshlib) $(host-cshobjs) $(host-cxxshlib) $(host-cxxshobjs)
index 76494e1..0a07f90 100644 (file)
@@ -155,9 +155,10 @@ else
 # $(call addtree,-I$(obj)) locates .h files in srctree, from generated .c files
 #   and locates generated .h files
 # FIXME: Replace both with specific CFLAGS* statements in the makefiles
-__c_flags      = $(call addtree,-I$(obj)) $(call flags,_c_flags)
-__a_flags      =                          $(call flags,_a_flags)
-__cpp_flags     =                          $(call flags,_cpp_flags)
+__c_flags      = $(if $(obj),-I$(srctree)/$(src) -I$(obj)) \
+                 $(call flags,_c_flags)
+__a_flags      = $(call flags,_a_flags)
+__cpp_flags     = $(call flags,_cpp_flags)
 endif
 
 c_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
index af187e6..c3d7eef 100644 (file)
@@ -29,7 +29,8 @@ int main(int argc, char *argv[])
        } while (ch != EOF);
 
        if (argc > 1)
-               printf("\t;\n\nconst int %s_size = %d;\n", argv[1], total);
+               printf("\t;\n\n#include <linux/types.h>\n\nconst size_t %s_size = %d;\n",
+                      argv[1], total);
 
        return 0;
 }
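
With this change, running the tool as "bin2c blob < some.bin" on 1234 bytes of input ("blob" and "some.bin" being made-up names), the trailer emitted after the hex bytes becomes:

	;

	#include <linux/types.h>

	const size_t blob_size = 1234;

instead of the old "const int blob_size = 1234;", so the generated size constant is a size_t rather than an int.
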
index dd85a45..c92c152 100755 (executable)
@@ -1,14 +1,24 @@
 #!/bin/bash
-
+# Linux kernel coccicheck
+#
+# Read Documentation/coccinelle.txt
 #
 # This script requires at least spatch
 # version 1.0.0-rc11.
-#
 
+DIR="$(dirname $(readlink -f $0))/.."
 SPATCH="`which ${SPATCH:=spatch}`"
 
-trap kill_running SIGTERM SIGINT
-declare -a SPATCH_PID
+if [ ! -x "$SPATCH" ]; then
+    echo 'spatch is part of the Coccinelle project and is available at http://coccinelle.lip6.fr/'
+    exit 1
+fi
+
+SPATCH_VERSION=$($SPATCH --version | head -1 | awk '{print $3}')
+SPATCH_VERSION_NUM=$(echo $SPATCH_VERSION | ${DIR}/scripts/ld-version.sh)
+
+USE_JOBS="no"
+$SPATCH --help | grep "\-\-jobs" > /dev/null && USE_JOBS="yes"
 
 # The verbosity may be set by the environmental parameter V=
 # as for example with 'make V=1 coccicheck'
@@ -25,7 +35,28 @@ else
        NPROC="$J"
 fi
 
-FLAGS="$SPFLAGS --very-quiet"
+FLAGS="--very-quiet"
+
+# You can use SPFLAGS to append extra arguments to coccicheck or override any
+# heuristics done in this file as Coccinelle accepts the last options when
+# options conflict.
+#
+# A good example for use of SPFLAGS is if you want to debug your cocci script,
+# you can for instance use the following:
+#
+# $ export COCCI=scripts/coccinelle/misc/irqf_oneshot.cocci
+# $ make coccicheck MODE=report DEBUG_FILE="all.err" SPFLAGS="--profile --show-trying" M=./drivers/mfd/arizona-irq.c
+#
+# "--show-trying" should show you what rule is being processed as it goes to
+# stdout; you do not need a debug file for that. The profile output will be
+# sent to stdout; if you provide a DEBUG_FILE, the profiling data can be
+# inspected there.
+#
+# --profile will not output if --very-quiet is used, so avoid it.
+echo $SPFLAGS | egrep -e "--profile|--show-trying" 2>&1 > /dev/null
+if [ $? -eq 0 ]; then
+       FLAGS="--quiet"
+fi
 
 # spatch only allows include directories with the syntax "-I include"
 # while gcc also allows "-Iinclude" and "-include include"
@@ -51,9 +82,14 @@ if [ "$KBUILD_EXTMOD" != "" ] ; then
     OPTIONS="--patch $srctree $OPTIONS"
 fi
 
-if [ ! -x "$SPATCH" ]; then
-    echo 'spatch is part of the Coccinelle project and is available at http://coccinelle.lip6.fr/'
-    exit 1
+# You can override by using SPFLAGS
+if [ "$USE_JOBS" = "no" ]; then
+       trap kill_running SIGTERM SIGINT
+       declare -a SPATCH_PID
+elif [ "$NPROC" != "1" ]; then
+       # Using 0 should work as well, refer to _SC_NPROCESSORS_ONLN use on
+       # https://github.com/rdicosmo/parmap/blob/master/setcore_stubs.c
+       OPTIONS="$OPTIONS --jobs $NPROC --chunksize 1"
 fi
 
 if [ "$MODE" = "" ] ; then
@@ -72,7 +108,7 @@ if [ "$MODE" = "chain" ] ; then
        echo 'All available modes will be tried (in that order): patch, report, context, org'
     fi
 elif [ "$MODE" = "report" -o "$MODE" = "org" ] ; then
-    FLAGS="$FLAGS --no-show-diff"
+    FLAGS="--no-show-diff $FLAGS"
 fi
 
 if [ "$ONLINE" = "0" ] ; then
@@ -82,7 +118,26 @@ if [ "$ONLINE" = "0" ] ; then
     echo ''
 fi
 
-run_cmd() {
+run_cmd_parmap() {
+       if [ $VERBOSE -ne 0 ] ; then
+               echo "Running ($NPROC in parallel): $@"
+       fi
+       if [ "$DEBUG_FILE" != "/dev/null" -a "$DEBUG_FILE" != "" ]; then
+               if [ -f $DEBUG_FILE ]; then
+                       echo "Debug file $DEBUG_FILE exists, bailing"
+                       exit
+               fi
+       else
+               DEBUG_FILE="/dev/null"
+       fi
+       $@ 2>$DEBUG_FILE
+       if [[ $? -ne 0 ]]; then
+               echo "coccicheck failed"
+               exit 1
+       fi
+}
+
+run_cmd_old() {
        local i
        if [ $VERBOSE -ne 0 ] ; then
                echo "Running ($NPROC in parallel): $@"
@@ -97,6 +152,14 @@ run_cmd() {
        wait
 }
 
+run_cmd() {
+       if [ "$USE_JOBS" = "yes" ]; then
+               run_cmd_parmap $@
+       else
+               run_cmd_old $@
+       fi
+}
+
 kill_running() {
        for i in $(seq 0 $(( NPROC - 1 )) ); do
                if [ $VERBOSE -eq 2 ] ; then
@@ -106,10 +169,23 @@ kill_running() {
        done
 }
 
+# You can override heuristics with SPFLAGS, these must always go last
+OPTIONS="$OPTIONS $SPFLAGS"
+
 coccinelle () {
     COCCI="$1"
 
     OPT=`grep "Option" $COCCI | cut -d':' -f2`
+    REQ=`grep "Requires" $COCCI | cut -d':' -f2 | sed "s| ||"`
+    REQ_NUM=$(echo $REQ | ${DIR}/scripts/ld-version.sh)
+    if [ "$REQ_NUM" != "0" ] ; then
+           if [ "$SPATCH_VERSION_NUM" -lt "$REQ_NUM" ] ; then
+                   echo "Skipping coccinelle SmPL patch: $COCCI"
+                   echo "You have coccinelle:           $SPATCH_VERSION"
+                   echo "This SmPL patch requires:      $REQ"
+                   return
+           fi
+    fi
 
 #   The option '--parse-cocci' can be used to syntactically check the SmPL files.
 #
index 3d93490..c990d2c 100644 (file)
@@ -29,7 +29,23 @@ expression x;
 @@
 
 (
+ x = devm_kmalloc(...)
+|
+ x = devm_kvasprintf(...)
+|
+ x = devm_kasprintf(...)
+|
  x = devm_kzalloc(...)
+|
+ x = devm_kmalloc_array(...)
+|
+ x = devm_kcalloc(...)
+|
+ x = devm_kstrdup(...)
+|
+ x = devm_kmemdup(...)
+|
+ x = devm_get_free_pages(...)
 |
  x = devm_request_irq(...)
 |
@@ -48,6 +64,16 @@ position p;
 (
 * kfree@p(x)
 |
+* kzfree@p(x)
+|
+* __krealloc@p(x, ...)
+|
+* krealloc@p(x, ...)
+|
+* free_pages@p(x, ...)
+|
+* free_page@p(x)
+|
 * free_irq@p(x)
 |
 * iounmap@p(x)
index 52bd235..14a4cd9 100644 (file)
@@ -19,6 +19,8 @@ expression E;
 - if (E != NULL)
 (
   kfree(E);
+|
+  kzfree(E);
 |
   debugfs_remove(E);
 |
@@ -39,7 +41,7 @@ position p;
 @@
 
 * if (E != NULL)
-*      \(kfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|
+*      \(kfree@p\|kzfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|
 *         usb_free_urb@p\|kmem_cache_destroy@p\|mempool_destroy@p\|
 *         dma_pool_destroy@p\)(E);
 
index 577b780..ac438da 100644 (file)
@@ -20,7 +20,11 @@ expression E;
 position p1;
 @@
 
-kfree@p1(E)
+(
+* kfree@p1(E)
+|
+* kzfree@p1(E)
+)
 
 @print expression@
 constant char [] c;
@@ -60,7 +64,11 @@ position ok;
 @@
 
 while (1) { ...
-  kfree@ok(E)
+(
+* kfree@ok(E)
+|
+* kzfree@ok(E)
+)
   ... when != break;
       when != goto l;
       when forall
@@ -74,7 +82,11 @@ statement S;
 position free.p1!=loop.ok,p2!={print.p,sz.p};
 @@
 
-kfree@p1(E,...)
+(
+* kfree@p1(E,...)
+|
+* kzfree@p1(E,...)
+)
 ...
 (
  iter(...,subE,...) S // no use
index ce8aacc..d46063b 100644 (file)
@@ -16,7 +16,11 @@ identifier f;
 position p;
 @@
 
+(
 * kfree@p(&e->f)
+|
+* kzfree@p(&e->f)
+)
 
 @script:python depends on org@
 p << r.p;
@@ -28,5 +32,5 @@ cocci.print_main("kfree",p)
 p << r.p;
 @@
 
-msg = "ERROR: kfree of structure field"
+msg = "ERROR: invalid free of structure field"
 coccilib.report.print_report(p[0],msg)
index 38ab744..a36c16d 100644 (file)
@@ -5,8 +5,11 @@
 // Copyright: (C) 2015 Julia Lawall, Inria. GPLv2.
 // URL: http://coccinelle.lip6.fr/
 // Options: --no-includes --include-headers
+// Requires: 1.0.4
 // Keywords: for_each_child_of_node, etc.
 
+// This uses a conjunction, which requires coccinelle >= 1.0.4
+
 virtual patch
 virtual context
 virtual org
index 80a831c..007f0de 100644 (file)
@@ -16,6 +16,7 @@ virtual patch
 @depends on patch@
 expression *x;
 expression f;
+expression i;
 type T;
 @@
 
@@ -30,15 +31,26 @@ f(...,(T)(x),...,sizeof(
 + *x
    ),...)
 |
-f(...,sizeof(x),...,(T)(
+f(...,sizeof(
+- x
++ *x
+   ),...,(T)(x),...)
+|
+f(...,(T)(x),...,i*sizeof(
 - x
 + *x
    ),...)
+|
+f(...,i*sizeof(
+- x
++ *x
+   ),...,(T)(x),...)
 )
 
 @r depends on !patch@
 expression *x;
 expression f;
+expression i;
 position p;
 type T;
 @@
@@ -49,6 +61,10 @@ type T;
 *f(...,(T)(x),...,sizeof@p(x),...)
 |
 *f(...,sizeof@p(x),...,(T)(x),...)
+|
+*f(...,(T)(x),...,i*sizeof@p(x),...)
+|
+*f(...,i*sizeof@p(x),...,(T)(x),...)
 )
 
 @script:python depends on org@
diff --git a/scripts/gcc-plugin.sh b/scripts/gcc-plugin.sh
new file mode 100755 (executable)
index 0000000..fb92075
--- /dev/null
@@ -0,0 +1,51 @@
+#!/bin/sh
+srctree=$(dirname "$0")
+gccplugins_dir=$($3 -print-file-name=plugin)
+plugincc=$($1 -E -x c++ - -o /dev/null -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 <<EOF
+#include "gcc-common.h"
+#if BUILDING_GCC_VERSION >= 4008 || defined(ENABLE_BUILD_WITH_CXX)
+#warning $2 CXX
+#else
+#warning $1 CC
+#endif
+EOF
+)
+
+if [ $? -ne 0 ]
+then
+       exit 1
+fi
+
+case "$plugincc" in
+       *"$1 CC"*)
+               echo "$1"
+               exit 0
+               ;;
+
+       *"$2 CXX"*)
+               # the c++ compiler needs another test, see below
+               ;;
+
+       *)
+               exit 1
+               ;;
+esac
+
+# we need a c++ compiler that supports the designated initializer GNU extension
+plugincc=$($2 -c -x c++ -std=gnu++98 - -fsyntax-only -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 <<EOF
+#include "gcc-common.h"
+class test {
+public:
+       int test;
+} test = {
+       .test = 1
+};
+EOF
+)
+
+if [ $? -eq 0 ]
+then
+       echo "$2"
+       exit 0
+fi
+exit 1
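
In practice (per the Makefile.gcc-plugins hunk above) the script is invoked as gcc-plugin.sh "$(HOSTCXX)" "$(HOSTCXX)" "$(CC)" when the target gcc is >= 4.8, and with $(HOSTCC) as the first argument for older compilers. It probes the plugin headers of $3 and prints the host compiler able to build plugins: $1 for a C-based gcc, or $2 for a C++-based gcc whose host C++ compiler accepts designated initializers. On failure it prints nothing, so PLUGINCC ends up empty and the "does not support plugins" warning fires.
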
diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile
new file mode 100644 (file)
index 0000000..88c8ec4
--- /dev/null
@@ -0,0 +1,27 @@
+GCC_PLUGINS_DIR := $(shell $(CC) -print-file-name=plugin)
+
+ifeq ($(PLUGINCC),$(HOSTCC))
+  HOSTLIBS := hostlibs
+  HOST_EXTRACFLAGS += -I$(GCC_PLUGINS_DIR)/include -I$(src) -std=gnu99 -ggdb
+  export HOST_EXTRACFLAGS
+else
+  HOSTLIBS := hostcxxlibs
+  HOST_EXTRACXXFLAGS += -I$(GCC_PLUGINS_DIR)/include -I$(src) -std=gnu++98 -fno-rtti
+  HOST_EXTRACXXFLAGS += -fno-exceptions -fasynchronous-unwind-tables -ggdb
+  HOST_EXTRACXXFLAGS += -Wno-narrowing -Wno-unused-variable
+  export HOST_EXTRACXXFLAGS
+endif
+
+export GCCPLUGINS_DIR HOSTLIBS
+
+ifneq ($(CFLAGS_KCOV), $(SANCOV_PLUGIN))
+  GCC_PLUGIN := $(filter-out $(SANCOV_PLUGIN), $(GCC_PLUGIN))
+endif
+
+$(HOSTLIBS)-y := $(GCC_PLUGIN)
+always := $($(HOSTLIBS)-y)
+
+cyc_complexity_plugin-objs := cyc_complexity_plugin.o
+sancov_plugin-objs := sancov_plugin.o
+
+clean-files += *.so
diff --git a/scripts/gcc-plugins/cyc_complexity_plugin.c b/scripts/gcc-plugins/cyc_complexity_plugin.c
new file mode 100644 (file)
index 0000000..34df974
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2011-2016 by Emese Revfy <re.emese@gmail.com>
+ * Licensed under the GPL v2, or (at your option) v3
+ *
+ * Homepage:
+ * https://github.com/ephox-gcc-plugins/cyclomatic_complexity
+ *
+ * http://en.wikipedia.org/wiki/Cyclomatic_complexity
+ * The complexity M is then defined as:
+ * M = E - N + 2P
+ * where
+ *
+ *  E = the number of edges of the graph
+ *  N = the number of nodes of the graph
+ *  P = the number of connected components (exit nodes).
+ *
+ * Usage (4.5 - 5):
+ * $ make clean; make run
+ */
+
+#include "gcc-common.h"
+
+int plugin_is_GPL_compatible;
+
+static struct plugin_info cyc_complexity_plugin_info = {
+       .version        = "20160225",
+       .help           = "Cyclomatic Complexity\n",
+};
+
+static unsigned int cyc_complexity_execute(void)
+{
+       int complexity;
+       expanded_location xloc;
+
+       /* M = E - N + 2P */
+       complexity = n_edges_for_fn(cfun) - n_basic_blocks_for_fn(cfun) + 2;
+
+       xloc = expand_location(DECL_SOURCE_LOCATION(current_function_decl));
+       fprintf(stderr, "Cyclomatic Complexity %d %s:%s\n", complexity,
+               xloc.file, DECL_NAME_POINTER(current_function_decl));
+
+       return 0;
+}
+
+#define PASS_NAME cyc_complexity
+
+#define NO_GATE
+#define TODO_FLAGS_FINISH TODO_dump_func
+
+#include "gcc-generate-gimple-pass.h"
+
+int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gcc_version *version)
+{
+       const char * const plugin_name = plugin_info->base_name;
+       struct register_pass_info cyc_complexity_pass_info;
+
+       cyc_complexity_pass_info.pass                           = make_cyc_complexity_pass();
+       cyc_complexity_pass_info.reference_pass_name            = "ssa";
+       cyc_complexity_pass_info.ref_pass_instance_number       = 1;
+       cyc_complexity_pass_info.pos_op                         = PASS_POS_INSERT_AFTER;
+
+       if (!plugin_default_version_check(version, &gcc_version)) {
+               error(G_("incompatible gcc/plugin versions"));
+               return 1;
+       }
+
+       register_callback(plugin_name, PLUGIN_INFO, NULL,
+                               &cyc_complexity_plugin_info);
+       register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL,
+                               &cyc_complexity_pass_info);
+
+       return 0;
+}
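
As a worked example of the formula the pass prints: a function whose body is a single if/else has N = 4 nodes (condition, two branches, join), E = 4 edges and P = 1 connected component, so M = 4 - 4 + 2 = 2, i.e. two independent paths. The pass computes E and N from GCC's internal CFG, which includes the artificial entry and exit blocks, so the reported figure can differ slightly from such a hand count.

	/* hypothetical input translation unit, not from the patch */
	int demo_sign(int x)
	{
		if (x < 0)
			return -1;	/* path 1 */
		return 1;		/* path 2 */
	}
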
diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
new file mode 100644 (file)
index 0000000..172850b
--- /dev/null
@@ -0,0 +1,830 @@
+#ifndef GCC_COMMON_H_INCLUDED
+#define GCC_COMMON_H_INCLUDED
+
+#include "bversion.h"
+#if BUILDING_GCC_VERSION >= 6000
+#include "gcc-plugin.h"
+#else
+#include "plugin.h"
+#endif
+#include "plugin-version.h"
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "line-map.h"
+#include "input.h"
+#include "tree.h"
+
+#include "tree-inline.h"
+#include "version.h"
+#include "rtl.h"
+#include "tm_p.h"
+#include "flags.h"
+#include "hard-reg-set.h"
+#include "output.h"
+#include "except.h"
+#include "function.h"
+#include "toplev.h"
+#include "basic-block.h"
+#include "intl.h"
+#include "ggc.h"
+#include "timevar.h"
+
+#include "params.h"
+
+#if BUILDING_GCC_VERSION <= 4009
+#include "pointer-set.h"
+#else
+#include "hash-map.h"
+#endif
+
+#include "emit-rtl.h"
+#include "debug.h"
+#include "target.h"
+#include "langhooks.h"
+#include "cfgloop.h"
+#include "cgraph.h"
+#include "opts.h"
+
+#if BUILDING_GCC_VERSION == 4005
+#include <sys/mman.h>
+#endif
+
+#if BUILDING_GCC_VERSION >= 4007
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#endif
+
+#if BUILDING_GCC_VERSION >= 4006
+#include "c-family/c-common.h"
+#else
+#include "c-common.h"
+#endif
+
+#if BUILDING_GCC_VERSION <= 4008
+#include "tree-flow.h"
+#else
+#include "tree-cfgcleanup.h"
+#include "tree-ssa-operands.h"
+#include "tree-into-ssa.h"
+#endif
+
+#if BUILDING_GCC_VERSION >= 4008
+#include "is-a.h"
+#endif
+
+#include "diagnostic.h"
+#include "tree-dump.h"
+#include "tree-pass.h"
+#include "predict.h"
+#include "ipa-utils.h"
+
+#if BUILDING_GCC_VERSION >= 4009
+#include "attribs.h"
+#include "varasm.h"
+#include "stor-layout.h"
+#include "internal-fn.h"
+#include "gimple-expr.h"
+#include "gimple-fold.h"
+#include "context.h"
+#include "tree-ssa-alias.h"
+#include "tree-ssa.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "print-tree.h"
+#include "tree-eh.h"
+#include "stmt.h"
+#include "gimplify.h"
+#endif
+
+#include "gimple.h"
+
+#if BUILDING_GCC_VERSION >= 4009
+#include "tree-ssa-operands.h"
+#include "tree-phinodes.h"
+#include "tree-cfg.h"
+#include "gimple-iterator.h"
+#include "gimple-ssa.h"
+#include "ssa-iterators.h"
+#endif
+
+#if BUILDING_GCC_VERSION >= 5000
+#include "builtins.h"
+#endif
+
+/* #include "expr.h" where are you... */
+extern rtx emit_move_insn(rtx x, rtx y);
+
+/* missing from basic_block.h... */
+extern void debug_dominance_info(enum cdi_direction dir);
+extern void debug_dominance_tree(enum cdi_direction dir, basic_block root);
+
+#if BUILDING_GCC_VERSION == 4006
+extern void debug_gimple_stmt(gimple);
+extern void debug_gimple_seq(gimple_seq);
+extern void print_gimple_seq(FILE *, gimple_seq, int, int);
+extern void print_gimple_stmt(FILE *, gimple, int, int);
+extern void print_gimple_expr(FILE *, gimple, int, int);
+extern void dump_gimple_stmt(pretty_printer *, gimple, int, int);
+#endif
+
+#define __unused __attribute__((__unused__))
+
+#define DECL_NAME_POINTER(node) IDENTIFIER_POINTER(DECL_NAME(node))
+#define DECL_NAME_LENGTH(node) IDENTIFIER_LENGTH(DECL_NAME(node))
+#define TYPE_NAME_POINTER(node) IDENTIFIER_POINTER(TYPE_NAME(node))
+#define TYPE_NAME_LENGTH(node) IDENTIFIER_LENGTH(TYPE_NAME(node))
+
+/* should come from c-tree.h if only it were installed for gcc 4.5... */
+#define C_TYPE_FIELDS_READONLY(TYPE) TREE_LANG_FLAG_1(TYPE)
+
+#if BUILDING_GCC_VERSION == 4005
+#define FOR_EACH_LOCAL_DECL(FUN, I, D)                 \
+       for (tree vars = (FUN)->local_decls, (I) = 0;   \
+               vars && ((D) = TREE_VALUE(vars));       \
+               vars = TREE_CHAIN(vars), (I)++)
+#define DECL_CHAIN(NODE) (TREE_CHAIN(DECL_MINIMAL_CHECK(NODE)))
+#define FOR_EACH_VEC_ELT(T, V, I, P) \
+       for (I = 0; VEC_iterate(T, (V), (I), (P)); ++(I))
+#define TODO_rebuild_cgraph_edges 0
+#define SCOPE_FILE_SCOPE_P(EXP) (!(EXP))
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+typedef struct varpool_node *varpool_node_ptr;
+
+static inline bool gimple_call_builtin_p(gimple stmt, enum built_in_function code)
+{
+       tree fndecl;
+
+       if (!is_gimple_call(stmt))
+               return false;
+       fndecl = gimple_call_fndecl(stmt);
+       if (!fndecl || DECL_BUILT_IN_CLASS(fndecl) != BUILT_IN_NORMAL)
+               return false;
+       return DECL_FUNCTION_CODE(fndecl) == code;
+}
+
+static inline bool is_simple_builtin(tree decl)
+{
+       if (decl && DECL_BUILT_IN_CLASS(decl) != BUILT_IN_NORMAL)
+               return false;
+
+       switch (DECL_FUNCTION_CODE(decl)) {
+       /* Builtins that expand to constants. */
+       case BUILT_IN_CONSTANT_P:
+       case BUILT_IN_EXPECT:
+       case BUILT_IN_OBJECT_SIZE:
+       case BUILT_IN_UNREACHABLE:
+       /* Simple register moves or loads from stack. */
+       case BUILT_IN_RETURN_ADDRESS:
+       case BUILT_IN_EXTRACT_RETURN_ADDR:
+       case BUILT_IN_FROB_RETURN_ADDR:
+       case BUILT_IN_RETURN:
+       case BUILT_IN_AGGREGATE_INCOMING_ADDRESS:
+       case BUILT_IN_FRAME_ADDRESS:
+       case BUILT_IN_VA_END:
+       case BUILT_IN_STACK_SAVE:
+       case BUILT_IN_STACK_RESTORE:
+       /* Exception state returns or moves registers around. */
+       case BUILT_IN_EH_FILTER:
+       case BUILT_IN_EH_POINTER:
+       case BUILT_IN_EH_COPY_VALUES:
+       return true;
+
+       default:
+       return false;
+       }
+}
+
+static inline void add_local_decl(struct function *fun, tree d)
+{
+       gcc_assert(TREE_CODE(d) == VAR_DECL);
+       fun->local_decls = tree_cons(NULL_TREE, d, fun->local_decls);
+}
+#endif
+
+#if BUILDING_GCC_VERSION <= 4006
+#define ANY_RETURN_P(rtx) (GET_CODE(rtx) == RETURN)
+#define C_DECL_REGISTER(EXP) DECL_LANG_FLAG_4(EXP)
+#define EDGE_PRESERVE 0ULL
+#define HOST_WIDE_INT_PRINT_HEX_PURE "%" HOST_WIDE_INT_PRINT "x"
+#define flag_fat_lto_objects true
+
+#define get_random_seed(noinit) ({                                             \
+       unsigned HOST_WIDE_INT seed;                                            \
+       sscanf(get_random_seed(noinit), "%" HOST_WIDE_INT_PRINT "x", &seed);    \
+       seed * seed; })
+
+#define int_const_binop(code, arg1, arg2)      \
+       int_const_binop((code), (arg1), (arg2), 0)
+
+static inline bool gimple_clobber_p(gimple s __unused)
+{
+       return false;
+}
+
+static inline bool gimple_asm_clobbers_memory_p(const_gimple stmt)
+{
+       unsigned i;
+
+       for (i = 0; i < gimple_asm_nclobbers(stmt); i++) {
+               tree op = gimple_asm_clobber_op(stmt, i);
+
+               if (!strcmp(TREE_STRING_POINTER(TREE_VALUE(op)), "memory"))
+                       return true;
+       }
+
+       return false;
+}
+
+static inline tree builtin_decl_implicit(enum built_in_function fncode)
+{
+       return implicit_built_in_decls[fncode];
+}
+
+static inline int ipa_reverse_postorder(struct cgraph_node **order)
+{
+       return cgraph_postorder(order);
+}
+
+static inline struct cgraph_node *cgraph_create_node(tree decl)
+{
+       return cgraph_node(decl);
+}
+
+static inline struct cgraph_node *cgraph_get_create_node(tree decl)
+{
+       struct cgraph_node *node = cgraph_get_node(decl);
+
+       return node ? node : cgraph_node(decl);
+}
+
+static inline bool cgraph_function_with_gimple_body_p(struct cgraph_node *node)
+{
+       return node->analyzed && !node->thunk.thunk_p && !node->alias;
+}
+
+static inline struct cgraph_node *cgraph_first_function_with_gimple_body(void)
+{
+       struct cgraph_node *node;
+
+       for (node = cgraph_nodes; node; node = node->next)
+               if (cgraph_function_with_gimple_body_p(node))
+                       return node;
+       return NULL;
+}
+
+static inline struct cgraph_node *cgraph_next_function_with_gimple_body(struct cgraph_node *node)
+{
+       for (node = node->next; node; node = node->next)
+               if (cgraph_function_with_gimple_body_p(node))
+                       return node;
+       return NULL;
+}
+
+#define FOR_EACH_FUNCTION_WITH_GIMPLE_BODY(node) \
+       for ((node) = cgraph_first_function_with_gimple_body(); (node); \
+               (node) = cgraph_next_function_with_gimple_body(node))
+
+static inline void varpool_add_new_variable(tree decl)
+{
+       varpool_finalize_decl(decl);
+}
+#endif
+
+#if BUILDING_GCC_VERSION <= 4007
+#define FOR_EACH_FUNCTION(node)        \
+       for (node = cgraph_nodes; node; node = node->next)
+#define FOR_EACH_VARIABLE(node)        \
+       for (node = varpool_nodes; node; node = node->next)
+#define PROP_loops 0
+#define NODE_SYMBOL(node) (node)
+#define NODE_DECL(node) (node)->decl
+#define INSN_LOCATION(INSN) RTL_LOCATION(INSN)
+#define vNULL NULL
+
+static inline int bb_loop_depth(const_basic_block bb)
+{
+       return bb->loop_father ? loop_depth(bb->loop_father) : 0;
+}
+
+static inline bool gimple_store_p(gimple gs)
+{
+       tree lhs = gimple_get_lhs(gs);
+
+       return lhs && !is_gimple_reg(lhs);
+}
+
+static inline void gimple_init_singleton(gimple g __unused)
+{
+}
+#endif
+
+#if BUILDING_GCC_VERSION == 4007 || BUILDING_GCC_VERSION == 4008
+static inline struct cgraph_node *cgraph_alias_target(struct cgraph_node *n)
+{
+       return cgraph_alias_aliased_node(n);
+}
+#endif
+
+#if BUILDING_GCC_VERSION >= 4007 && BUILDING_GCC_VERSION <= 4009
+#define cgraph_create_edge(caller, callee, call_stmt, count, freq, nest) \
+       cgraph_create_edge((caller), (callee), (call_stmt), (count), (freq))
+#define cgraph_create_edge_including_clones(caller, callee, old_call_stmt, call_stmt, count, freq, nest, reason) \
+       cgraph_create_edge_including_clones((caller), (callee), (old_call_stmt), (call_stmt), (count), (freq), (reason))
+#endif
+
+#if BUILDING_GCC_VERSION <= 4008
+#define ENTRY_BLOCK_PTR_FOR_FN(FN)     ENTRY_BLOCK_PTR_FOR_FUNCTION(FN)
+#define EXIT_BLOCK_PTR_FOR_FN(FN)      EXIT_BLOCK_PTR_FOR_FUNCTION(FN)
+#define basic_block_info_for_fn(FN)    ((FN)->cfg->x_basic_block_info)
+#define n_basic_blocks_for_fn(FN)      ((FN)->cfg->x_n_basic_blocks)
+#define n_edges_for_fn(FN)             ((FN)->cfg->x_n_edges)
+#define last_basic_block_for_fn(FN)    ((FN)->cfg->x_last_basic_block)
+#define label_to_block_map_for_fn(FN)  ((FN)->cfg->x_label_to_block_map)
+#define profile_status_for_fn(FN)      ((FN)->cfg->x_profile_status)
+#define BASIC_BLOCK_FOR_FN(FN, N)      BASIC_BLOCK_FOR_FUNCTION((FN), (N))
+#define NODE_IMPLICIT_ALIAS(node)      (node)->same_body_alias
+#define VAR_P(NODE)                    (TREE_CODE(NODE) == VAR_DECL)
+
+static inline bool tree_fits_shwi_p(const_tree t)
+{
+       if (t == NULL_TREE || TREE_CODE(t) != INTEGER_CST)
+               return false;
+
+       if (TREE_INT_CST_HIGH(t) == 0 && (HOST_WIDE_INT)TREE_INT_CST_LOW(t) >= 0)
+               return true;
+
+       if (TREE_INT_CST_HIGH(t) == -1 && (HOST_WIDE_INT)TREE_INT_CST_LOW(t) < 0 && !TYPE_UNSIGNED(TREE_TYPE(t)))
+               return true;
+
+       return false;
+}
+
+static inline bool tree_fits_uhwi_p(const_tree t)
+{
+       if (t == NULL_TREE || TREE_CODE(t) != INTEGER_CST)
+               return false;
+
+       return TREE_INT_CST_HIGH(t) == 0;
+}
+
+static inline HOST_WIDE_INT tree_to_shwi(const_tree t)
+{
+       gcc_assert(tree_fits_shwi_p(t));
+       return TREE_INT_CST_LOW(t);
+}
+
+static inline unsigned HOST_WIDE_INT tree_to_uhwi(const_tree t)
+{
+       gcc_assert(tree_fits_uhwi_p(t));
+       return TREE_INT_CST_LOW(t);
+}
+
+static inline const char *get_tree_code_name(enum tree_code code)
+{
+       gcc_assert(code < MAX_TREE_CODES);
+       return tree_code_name[code];
+}
+
+#define ipa_remove_stmt_references(cnode, stmt)
+
+typedef union gimple_statement_d gasm;
+typedef union gimple_statement_d gassign;
+typedef union gimple_statement_d gcall;
+typedef union gimple_statement_d gcond;
+typedef union gimple_statement_d gdebug;
+typedef union gimple_statement_d gphi;
+typedef union gimple_statement_d greturn;
+
+static inline gasm *as_a_gasm(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gasm *as_a_const_gasm(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline gassign *as_a_gassign(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gassign *as_a_const_gassign(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline gcall *as_a_gcall(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gcall *as_a_const_gcall(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline gcond *as_a_gcond(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gcond *as_a_const_gcond(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline gdebug *as_a_gdebug(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gdebug *as_a_const_gdebug(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline gphi *as_a_gphi(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gphi *as_a_const_gphi(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline greturn *as_a_greturn(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const greturn *as_a_const_greturn(const_gimple stmt)
+{
+       return stmt;
+}
+#endif
+
+#if BUILDING_GCC_VERSION == 4008
+#define NODE_SYMBOL(node) (&(node)->symbol)
+#define NODE_DECL(node) (node)->symbol.decl
+#endif
+
+#if BUILDING_GCC_VERSION >= 4008
+#define add_referenced_var(var)
+#define mark_sym_for_renaming(var)
+#define varpool_mark_needed_node(node)
+#define create_var_ann(var)
+#define TODO_dump_func 0
+#define TODO_dump_cgraph 0
+#endif
+
+#if BUILDING_GCC_VERSION <= 4009
+#define TODO_verify_il 0
+#define AVAIL_INTERPOSABLE AVAIL_OVERWRITABLE
+
+#define section_name_prefix LTO_SECTION_NAME_PREFIX
+#define fatal_error(loc, gmsgid, ...) fatal_error((gmsgid), __VA_ARGS__)
+
+typedef struct rtx_def rtx_insn;
+
+static inline void set_decl_section_name(tree node, const char *value)
+{
+       if (value)
+               DECL_SECTION_NAME(node) = build_string(strlen(value) + 1, value);
+       else
+               DECL_SECTION_NAME(node) = NULL;
+}
+#endif
+
+#if BUILDING_GCC_VERSION == 4009
+typedef struct gimple_statement_asm gasm;
+typedef struct gimple_statement_base gassign;
+typedef struct gimple_statement_call gcall;
+typedef struct gimple_statement_base gcond;
+typedef struct gimple_statement_base gdebug;
+typedef struct gimple_statement_phi gphi;
+typedef struct gimple_statement_base greturn;
+
+static inline gasm *as_a_gasm(gimple stmt)
+{
+       return as_a<gasm>(stmt);
+}
+
+static inline const gasm *as_a_const_gasm(const_gimple stmt)
+{
+       return as_a<const gasm>(stmt);
+}
+
+static inline gassign *as_a_gassign(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gassign *as_a_const_gassign(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline gcall *as_a_gcall(gimple stmt)
+{
+       return as_a<gcall>(stmt);
+}
+
+static inline const gcall *as_a_const_gcall(const_gimple stmt)
+{
+       return as_a<const gcall>(stmt);
+}
+
+static inline gcond *as_a_gcond(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gcond *as_a_const_gcond(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline gdebug *as_a_gdebug(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const gdebug *as_a_const_gdebug(const_gimple stmt)
+{
+       return stmt;
+}
+
+static inline gphi *as_a_gphi(gimple stmt)
+{
+       return as_a<gphi>(stmt);
+}
+
+static inline const gphi *as_a_const_gphi(const_gimple stmt)
+{
+       return as_a<const gphi>(stmt);
+}
+
+static inline greturn *as_a_greturn(gimple stmt)
+{
+       return stmt;
+}
+
+static inline const greturn *as_a_const_greturn(const_gimple stmt)
+{
+       return stmt;
+}
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+#define TODO_ggc_collect 0
+#define NODE_SYMBOL(node) (node)
+#define NODE_DECL(node) (node)->decl
+#define cgraph_node_name(node) (node)->name()
+#define NODE_IMPLICIT_ALIAS(node) (node)->cpp_implicit_alias
+#endif
+
+#if BUILDING_GCC_VERSION >= 5000 && BUILDING_GCC_VERSION < 6000
+/* gimple related */
+template <>
+template <>
+inline bool is_a_helper<const gassign *>::test(const_gimple gs)
+{
+       return gs->code == GIMPLE_ASSIGN;
+}
+#endif
+
+#if BUILDING_GCC_VERSION >= 5000
+#define TODO_verify_ssa TODO_verify_il
+#define TODO_verify_flow TODO_verify_il
+#define TODO_verify_stmts TODO_verify_il
+#define TODO_verify_rtl_sharing TODO_verify_il
+
+#define INSN_DELETED_P(insn) (insn)->deleted()
+
+/* symtab/cgraph related */
+#define debug_cgraph_node(node) (node)->debug()
+#define cgraph_get_node(decl) cgraph_node::get(decl)
+#define cgraph_get_create_node(decl) cgraph_node::get_create(decl)
+#define cgraph_create_node(decl) cgraph_node::create(decl)
+#define cgraph_n_nodes symtab->cgraph_count
+#define cgraph_max_uid symtab->cgraph_max_uid
+#define varpool_get_node(decl) varpool_node::get(decl)
+
+#define cgraph_create_edge(caller, callee, call_stmt, count, freq, nest) \
+       (caller)->create_edge((callee), (call_stmt), (count), (freq))
+#define cgraph_create_edge_including_clones(caller, callee, old_call_stmt, call_stmt, count, freq, nest, reason) \
+       (caller)->create_edge_including_clones((callee), (old_call_stmt), (call_stmt), (count), (freq), (reason))
+
+typedef struct cgraph_node *cgraph_node_ptr;
+typedef struct cgraph_edge *cgraph_edge_p;
+typedef struct varpool_node *varpool_node_ptr;
+
+static inline void change_decl_assembler_name(tree decl, tree name)
+{
+       symtab->change_decl_assembler_name(decl, name);
+}
+
+static inline void varpool_finalize_decl(tree decl)
+{
+       varpool_node::finalize_decl(decl);
+}
+
+static inline void varpool_add_new_variable(tree decl)
+{
+       varpool_node::add(decl);
+}
+
+static inline unsigned int rebuild_cgraph_edges(void)
+{
+       return cgraph_edge::rebuild_edges();
+}
+
+static inline cgraph_node_ptr cgraph_function_node(cgraph_node_ptr node, enum availability *availability)
+{
+       return node->function_symbol(availability);
+}
+
+static inline cgraph_node_ptr cgraph_function_or_thunk_node(cgraph_node_ptr node, enum availability *availability = NULL)
+{
+       return node->ultimate_alias_target(availability);
+}
+
+static inline bool cgraph_only_called_directly_p(cgraph_node_ptr node)
+{
+       return node->only_called_directly_p();
+}
+
+static inline enum availability cgraph_function_body_availability(cgraph_node_ptr node)
+{
+       return node->get_availability();
+}
+
+static inline cgraph_node_ptr cgraph_alias_target(cgraph_node_ptr node)
+{
+       return node->get_alias_target();
+}
+
+static inline struct cgraph_node_hook_list *cgraph_add_function_insertion_hook(cgraph_node_hook hook, void *data)
+{
+       return symtab->add_cgraph_insertion_hook(hook, data);
+}
+
+static inline void cgraph_remove_function_insertion_hook(struct cgraph_node_hook_list *entry)
+{
+       symtab->remove_cgraph_insertion_hook(entry);
+}
+
+static inline struct cgraph_node_hook_list *cgraph_add_node_removal_hook(cgraph_node_hook hook, void *data)
+{
+       return symtab->add_cgraph_removal_hook(hook, data);
+}
+
+static inline void cgraph_remove_node_removal_hook(struct cgraph_node_hook_list *entry)
+{
+       symtab->remove_cgraph_removal_hook(entry);
+}
+
+static inline struct cgraph_2node_hook_list *cgraph_add_node_duplication_hook(cgraph_2node_hook hook, void *data)
+{
+       return symtab->add_cgraph_duplication_hook(hook, data);
+}
+
+static inline void cgraph_remove_node_duplication_hook(struct cgraph_2node_hook_list *entry)
+{
+       symtab->remove_cgraph_duplication_hook(entry);
+}
+
+static inline void cgraph_call_node_duplication_hooks(cgraph_node_ptr node, cgraph_node_ptr node2)
+{
+       symtab->call_cgraph_duplication_hooks(node, node2);
+}
+
+static inline void cgraph_call_edge_duplication_hooks(cgraph_edge *cs1, cgraph_edge *cs2)
+{
+       symtab->call_edge_duplication_hooks(cs1, cs2);
+}
+
+#if BUILDING_GCC_VERSION >= 6000
+typedef gimple *gimple_ptr;
+typedef const gimple *const_gimple_ptr;
+#define gimple gimple_ptr
+#define const_gimple const_gimple_ptr
+#undef CONST_CAST_GIMPLE
+#define CONST_CAST_GIMPLE(X) CONST_CAST(gimple, (X))
+#endif
+
+/* gimple related */
+static inline gimple gimple_build_assign_with_ops(enum tree_code subcode, tree lhs, tree op1, tree op2 MEM_STAT_DECL)
+{
+       return gimple_build_assign(lhs, subcode, op1, op2 PASS_MEM_STAT);
+}
+
+template <>
+template <>
+inline bool is_a_helper<const greturn *>::test(const_gimple gs)
+{
+       return gs->code == GIMPLE_RETURN;
+}
+
+static inline gasm *as_a_gasm(gimple stmt)
+{
+       return as_a<gasm *>(stmt);
+}
+
+static inline const gasm *as_a_const_gasm(const_gimple stmt)
+{
+       return as_a<const gasm *>(stmt);
+}
+
+static inline gassign *as_a_gassign(gimple stmt)
+{
+       return as_a<gassign *>(stmt);
+}
+
+static inline const gassign *as_a_const_gassign(const_gimple stmt)
+{
+       return as_a<const gassign *>(stmt);
+}
+
+static inline gcall *as_a_gcall(gimple stmt)
+{
+       return as_a<gcall *>(stmt);
+}
+
+static inline const gcall *as_a_const_gcall(const_gimple stmt)
+{
+       return as_a<const gcall *>(stmt);
+}
+
+static inline gphi *as_a_gphi(gimple stmt)
+{
+       return as_a<gphi *>(stmt);
+}
+
+static inline const gphi *as_a_const_gphi(const_gimple stmt)
+{
+       return as_a<const gphi *>(stmt);
+}
+
+static inline greturn *as_a_greturn(gimple stmt)
+{
+       return as_a<greturn *>(stmt);
+}
+
+static inline const greturn *as_a_const_greturn(const_gimple stmt)
+{
+       return as_a<const greturn *>(stmt);
+}
+
+/* IPA/LTO related */
+#define ipa_ref_list_referring_iterate(L, I, P)        \
+       (L)->referring.iterate((I), &(P))
+#define ipa_ref_list_reference_iterate(L, I, P)        \
+       (L)->reference.iterate((I), &(P))
+
+static inline cgraph_node_ptr ipa_ref_referring_node(struct ipa_ref *ref)
+{
+       return dyn_cast<cgraph_node_ptr>(ref->referring);
+}
+
+static inline void ipa_remove_stmt_references(symtab_node *referring_node, gimple stmt)
+{
+       referring_node->remove_stmt_references(stmt);
+}
+#endif
+
+#if BUILDING_GCC_VERSION < 6000
+#define get_inner_reference(exp, pbitsize, pbitpos, poffset, pmode, punsignedp, preversep, pvolatilep, keep_aligning)  \
+       get_inner_reference(exp, pbitsize, pbitpos, poffset, pmode, punsignedp, pvolatilep, keep_aligning)
+#define gen_rtx_set(ARG0, ARG1) gen_rtx_SET(VOIDmode, (ARG0), (ARG1))
+#endif
+
+#if BUILDING_GCC_VERSION >= 6000
+#define gen_rtx_set(ARG0, ARG1) gen_rtx_SET((ARG0), (ARG1))
+#endif
+
+#ifdef __cplusplus
+static inline void debug_tree(const_tree t)
+{
+       debug_tree(CONST_CAST_TREE(t));
+}
+
+static inline void debug_gimple_stmt(const_gimple s)
+{
+       debug_gimple_stmt(CONST_CAST_GIMPLE(s));
+}
+#else
+#define debug_tree(t) debug_tree(CONST_CAST_TREE(t))
+#define debug_gimple_stmt(s) debug_gimple_stmt(CONST_CAST_GIMPLE(s))
+#endif
+
+#endif
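
A short sketch of what the compatibility layer buys a plugin (demo_list_gimple_functions() is a hypothetical helper, not from the patch): the same traversal compiles on old and new gcc alike, because the header backfills FOR_EACH_FUNCTION_WITH_GIMPLE_BODY, NODE_DECL() and DECL_NAME_POINTER() where older gcc releases lack them.

	#include "gcc-common.h"

	/* hypothetical: print every function that still has a GIMPLE body */
	static void demo_list_gimple_functions(void)
	{
		struct cgraph_node *node;

		FOR_EACH_FUNCTION_WITH_GIMPLE_BODY(node) {
			fprintf(stderr, "gimple body: %s\n",
				DECL_NAME_POINTER(NODE_DECL(node)));
		}
	}
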
diff --git a/scripts/gcc-plugins/gcc-generate-gimple-pass.h b/scripts/gcc-plugins/gcc-generate-gimple-pass.h
new file mode 100644 (file)
index 0000000..526c3c7
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * Generator for GIMPLE pass related boilerplate code/data
+ *
+ * Supports gcc 4.5-6
+ *
+ * Usage:
+ *
+ * 1. before inclusion define PASS_NAME
+ * 2. before inclusion define NO_* for unimplemented callbacks
+ *    NO_GATE
+ *    NO_EXECUTE
+ * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override
+ *    the default 0 values
+ * 4. for convenience, all the above will be undefined after inclusion!
+ * 5. the only exported name is make_PASS_NAME_pass() to register with gcc
+ */
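+
+/*
+ * A minimal usage sketch (the pass name "example" below is hypothetical,
+ * not something this header defines):
+ *
+ *     static unsigned int example_execute(void)
+ *     {
+ *             return 0;
+ *     }
+ *
+ *     #define PASS_NAME example
+ *     #define NO_GATE
+ *     #include "gcc-generate-gimple-pass.h"
+ *
+ * After inclusion, make_example_pass() can be handed to the pass manager,
+ * as sancov_plugin.c does with its sancov pass further below.
+ */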
+
+#ifndef PASS_NAME
+#error at least PASS_NAME must be defined
+#else
+#define __GCC_PLUGIN_STRINGIFY(n)      #n
+#define _GCC_PLUGIN_STRINGIFY(n)       __GCC_PLUGIN_STRINGIFY(n)
+#define _GCC_PLUGIN_CONCAT2(x, y)      x ## y
+#define _GCC_PLUGIN_CONCAT3(x, y, z)   x ## y ## z
+
+#define __PASS_NAME_PASS_DATA(n)       _GCC_PLUGIN_CONCAT2(n, _pass_data)
+#define _PASS_NAME_PASS_DATA           __PASS_NAME_PASS_DATA(PASS_NAME)
+
+#define __PASS_NAME_PASS(n)            _GCC_PLUGIN_CONCAT2(n, _pass)
+#define _PASS_NAME_PASS                        __PASS_NAME_PASS(PASS_NAME)
+
+#define _PASS_NAME_NAME                        _GCC_PLUGIN_STRINGIFY(PASS_NAME)
+
+#define __MAKE_PASS_NAME_PASS(n)       _GCC_PLUGIN_CONCAT3(make_, n, _pass)
+#define _MAKE_PASS_NAME_PASS           __MAKE_PASS_NAME_PASS(PASS_NAME)
+
+#ifdef NO_GATE
+#define _GATE NULL
+#define _HAS_GATE false
+#else
+#define __GATE(n)                      _GCC_PLUGIN_CONCAT2(n, _gate)
+#define _GATE                          __GATE(PASS_NAME)
+#define _HAS_GATE true
+#endif
+
+#ifdef NO_EXECUTE
+#define _EXECUTE NULL
+#define _HAS_EXECUTE false
+#else
+#define __EXECUTE(n)                   _GCC_PLUGIN_CONCAT2(n, _execute)
+#define _EXECUTE                       __EXECUTE(PASS_NAME)
+#define _HAS_EXECUTE true
+#endif
+
+#ifndef PROPERTIES_REQUIRED
+#define PROPERTIES_REQUIRED 0
+#endif
+
+#ifndef PROPERTIES_PROVIDED
+#define PROPERTIES_PROVIDED 0
+#endif
+
+#ifndef PROPERTIES_DESTROYED
+#define PROPERTIES_DESTROYED 0
+#endif
+
+#ifndef TODO_FLAGS_START
+#define TODO_FLAGS_START 0
+#endif
+
+#ifndef TODO_FLAGS_FINISH
+#define TODO_FLAGS_FINISH 0
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+namespace {
+static const pass_data _PASS_NAME_PASS_DATA = {
+#else
+static struct gimple_opt_pass _PASS_NAME_PASS = {
+       .pass = {
+#endif
+               .type                   = GIMPLE_PASS,
+               .name                   = _PASS_NAME_NAME,
+#if BUILDING_GCC_VERSION >= 4008
+               .optinfo_flags          = OPTGROUP_NONE,
+#endif
+#if BUILDING_GCC_VERSION >= 5000
+#elif BUILDING_GCC_VERSION == 4009
+               .has_gate               = _HAS_GATE,
+               .has_execute            = _HAS_EXECUTE,
+#else
+               .gate                   = _GATE,
+               .execute                = _EXECUTE,
+               .sub                    = NULL,
+               .next                   = NULL,
+               .static_pass_number     = 0,
+#endif
+               .tv_id                  = TV_NONE,
+               .properties_required    = PROPERTIES_REQUIRED,
+               .properties_provided    = PROPERTIES_PROVIDED,
+               .properties_destroyed   = PROPERTIES_DESTROYED,
+               .todo_flags_start       = TODO_FLAGS_START,
+               .todo_flags_finish      = TODO_FLAGS_FINISH,
+#if BUILDING_GCC_VERSION < 4009
+       }
+#endif
+};
+
+#if BUILDING_GCC_VERSION >= 4009
+class _PASS_NAME_PASS : public gimple_opt_pass {
+public:
+       _PASS_NAME_PASS() : gimple_opt_pass(_PASS_NAME_PASS_DATA, g) {}
+
+#ifndef NO_GATE
+#if BUILDING_GCC_VERSION >= 5000
+       virtual bool gate(function *) { return _GATE(); }
+#else
+       virtual bool gate(void) { return _GATE(); }
+#endif
+#endif
+
+       virtual opt_pass * clone () { return new _PASS_NAME_PASS(); }
+
+#ifndef NO_EXECUTE
+#if BUILDING_GCC_VERSION >= 5000
+       virtual unsigned int execute(function *) { return _EXECUTE(); }
+#else
+       virtual unsigned int execute(void) { return _EXECUTE(); }
+#endif
+#endif
+};
+}
+
+opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+       return new _PASS_NAME_PASS();
+}
+#else
+struct opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+       return &_PASS_NAME_PASS.pass;
+}
+#endif
+
+/* clean up user provided defines */
+#undef PASS_NAME
+#undef NO_GATE
+#undef NO_EXECUTE
+
+#undef PROPERTIES_DESTROYED
+#undef PROPERTIES_PROVIDED
+#undef PROPERTIES_REQUIRED
+#undef TODO_FLAGS_FINISH
+#undef TODO_FLAGS_START
+
+/* clean up generated defines */
+#undef _EXECUTE
+#undef __EXECUTE
+#undef _GATE
+#undef __GATE
+#undef _GCC_PLUGIN_CONCAT2
+#undef _GCC_PLUGIN_CONCAT3
+#undef _GCC_PLUGIN_STRINGIFY
+#undef __GCC_PLUGIN_STRINGIFY
+#undef _HAS_EXECUTE
+#undef _HAS_GATE
+#undef _MAKE_PASS_NAME_PASS
+#undef __MAKE_PASS_NAME_PASS
+#undef _PASS_NAME_NAME
+#undef _PASS_NAME_PASS
+#undef __PASS_NAME_PASS
+#undef _PASS_NAME_PASS_DATA
+#undef __PASS_NAME_PASS_DATA
+
+#endif /* PASS_NAME */
diff --git a/scripts/gcc-plugins/gcc-generate-ipa-pass.h b/scripts/gcc-plugins/gcc-generate-ipa-pass.h
new file mode 100644 (file)
index 0000000..9bd926e
--- /dev/null
@@ -0,0 +1,289 @@
+/*
+ * Generator for IPA pass related boilerplate code/data
+ *
+ * Supports gcc 4.5-6
+ *
+ * Usage:
+ *
+ * 1. before inclusion define PASS_NAME
+ * 2. before inclusion define NO_* for unimplemented callbacks
+ *    NO_GENERATE_SUMMARY
+ *    NO_READ_SUMMARY
+ *    NO_WRITE_SUMMARY
+ *    NO_READ_OPTIMIZATION_SUMMARY
+ *    NO_WRITE_OPTIMIZATION_SUMMARY
+ *    NO_STMT_FIXUP
+ *    NO_FUNCTION_TRANSFORM
+ *    NO_VARIABLE_TRANSFORM
+ *    NO_GATE
+ *    NO_EXECUTE
+ * 3. before inclusion define PROPERTIES_* and *TODO_FLAGS_* to override
+ *    the default 0 values
+ * 4. for convenience, all the above will be undefined after inclusion!
+ * 5. the only exported name is make_PASS_NAME_pass() to register with gcc
+ */
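+
+/*
+ * A minimal usage sketch (the pass name "example" is hypothetical); an IPA
+ * pass that only implements execute() stubs out every other callback:
+ *
+ *     static unsigned int example_execute(void)
+ *     {
+ *             return 0;
+ *     }
+ *
+ *     #define PASS_NAME example
+ *     #define NO_GENERATE_SUMMARY
+ *     #define NO_READ_SUMMARY
+ *     #define NO_WRITE_SUMMARY
+ *     #define NO_READ_OPTIMIZATION_SUMMARY
+ *     #define NO_WRITE_OPTIMIZATION_SUMMARY
+ *     #define NO_STMT_FIXUP
+ *     #define NO_FUNCTION_TRANSFORM
+ *     #define NO_VARIABLE_TRANSFORM
+ *     #define NO_GATE
+ *     #include "gcc-generate-ipa-pass.h"
+ */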
+
+#ifndef PASS_NAME
+#error at least PASS_NAME must be defined
+#else
+#define __GCC_PLUGIN_STRINGIFY(n)      #n
+#define _GCC_PLUGIN_STRINGIFY(n)       __GCC_PLUGIN_STRINGIFY(n)
+#define _GCC_PLUGIN_CONCAT2(x, y)      x ## y
+#define _GCC_PLUGIN_CONCAT3(x, y, z)   x ## y ## z
+
+#define __PASS_NAME_PASS_DATA(n)       _GCC_PLUGIN_CONCAT2(n, _pass_data)
+#define _PASS_NAME_PASS_DATA           __PASS_NAME_PASS_DATA(PASS_NAME)
+
+#define __PASS_NAME_PASS(n)            _GCC_PLUGIN_CONCAT2(n, _pass)
+#define _PASS_NAME_PASS                        __PASS_NAME_PASS(PASS_NAME)
+
+#define _PASS_NAME_NAME                        _GCC_PLUGIN_STRINGIFY(PASS_NAME)
+
+#define __MAKE_PASS_NAME_PASS(n)       _GCC_PLUGIN_CONCAT3(make_, n, _pass)
+#define _MAKE_PASS_NAME_PASS           __MAKE_PASS_NAME_PASS(PASS_NAME)
+
+#ifdef NO_GENERATE_SUMMARY
+#define _GENERATE_SUMMARY NULL
+#else
+#define __GENERATE_SUMMARY(n)          _GCC_PLUGIN_CONCAT2(n, _generate_summary)
+#define _GENERATE_SUMMARY              __GENERATE_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_READ_SUMMARY
+#define _READ_SUMMARY NULL
+#else
+#define __READ_SUMMARY(n)              _GCC_PLUGIN_CONCAT2(n, _read_summary)
+#define _READ_SUMMARY                  __READ_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_WRITE_SUMMARY
+#define _WRITE_SUMMARY NULL
+#else
+#define __WRITE_SUMMARY(n)             _GCC_PLUGIN_CONCAT2(n, _write_summary)
+#define _WRITE_SUMMARY                 __WRITE_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_READ_OPTIMIZATION_SUMMARY
+#define _READ_OPTIMIZATION_SUMMARY NULL
+#else
+#define __READ_OPTIMIZATION_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _read_optimization_summary)
+#define _READ_OPTIMIZATION_SUMMARY     __READ_OPTIMIZATION_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_WRITE_OPTIMIZATION_SUMMARY
+#define _WRITE_OPTIMIZATION_SUMMARY NULL
+#else
+#define __WRITE_OPTIMIZATION_SUMMARY(n)        _GCC_PLUGIN_CONCAT2(n, _write_optimization_summary)
+#define _WRITE_OPTIMIZATION_SUMMARY    __WRITE_OPTIMIZATION_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_STMT_FIXUP
+#define _STMT_FIXUP NULL
+#else
+#define __STMT_FIXUP(n)                        _GCC_PLUGIN_CONCAT2(n, _stmt_fixup)
+#define _STMT_FIXUP                    __STMT_FIXUP(PASS_NAME)
+#endif
+
+#ifdef NO_FUNCTION_TRANSFORM
+#define _FUNCTION_TRANSFORM NULL
+#else
+#define __FUNCTION_TRANSFORM(n)                _GCC_PLUGIN_CONCAT2(n, _function_transform)
+#define _FUNCTION_TRANSFORM            __FUNCTION_TRANSFORM(PASS_NAME)
+#endif
+
+#ifdef NO_VARIABLE_TRANSFORM
+#define _VARIABLE_TRANSFORM NULL
+#else
+#define __VARIABLE_TRANSFORM(n)                _GCC_PLUGIN_CONCAT2(n, _variable_transform)
+#define _VARIABLE_TRANSFORM            __VARIABLE_TRANSFORM(PASS_NAME)
+#endif
+
+#ifdef NO_GATE
+#define _GATE NULL
+#define _HAS_GATE false
+#else
+#define __GATE(n)                      _GCC_PLUGIN_CONCAT2(n, _gate)
+#define _GATE                          __GATE(PASS_NAME)
+#define _HAS_GATE true
+#endif
+
+#ifdef NO_EXECUTE
+#define _EXECUTE NULL
+#define _HAS_EXECUTE false
+#else
+#define __EXECUTE(n)                   _GCC_PLUGIN_CONCAT2(n, _execute)
+#define _EXECUTE                       __EXECUTE(PASS_NAME)
+#define _HAS_EXECUTE true
+#endif
+
+#ifndef PROPERTIES_REQUIRED
+#define PROPERTIES_REQUIRED 0
+#endif
+
+#ifndef PROPERTIES_PROVIDED
+#define PROPERTIES_PROVIDED 0
+#endif
+
+#ifndef PROPERTIES_DESTROYED
+#define PROPERTIES_DESTROYED 0
+#endif
+
+#ifndef TODO_FLAGS_START
+#define TODO_FLAGS_START 0
+#endif
+
+#ifndef TODO_FLAGS_FINISH
+#define TODO_FLAGS_FINISH 0
+#endif
+
+#ifndef FUNCTION_TRANSFORM_TODO_FLAGS_START
+#define FUNCTION_TRANSFORM_TODO_FLAGS_START 0
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+namespace {
+static const pass_data _PASS_NAME_PASS_DATA = {
+#else
+static struct ipa_opt_pass_d _PASS_NAME_PASS = {
+       .pass = {
+#endif
+               .type                   = IPA_PASS,
+               .name                   = _PASS_NAME_NAME,
+#if BUILDING_GCC_VERSION >= 4008
+               .optinfo_flags          = OPTGROUP_NONE,
+#endif
+#if BUILDING_GCC_VERSION >= 5000
+#elif BUILDING_GCC_VERSION == 4009
+               .has_gate               = _HAS_GATE,
+               .has_execute            = _HAS_EXECUTE,
+#else
+               .gate                   = _GATE,
+               .execute                = _EXECUTE,
+               .sub                    = NULL,
+               .next                   = NULL,
+               .static_pass_number     = 0,
+#endif
+               .tv_id                  = TV_NONE,
+               .properties_required    = PROPERTIES_REQUIRED,
+               .properties_provided    = PROPERTIES_PROVIDED,
+               .properties_destroyed   = PROPERTIES_DESTROYED,
+               .todo_flags_start       = TODO_FLAGS_START,
+               .todo_flags_finish      = TODO_FLAGS_FINISH,
+#if BUILDING_GCC_VERSION < 4009
+       },
+       .generate_summary               = _GENERATE_SUMMARY,
+       .write_summary                  = _WRITE_SUMMARY,
+       .read_summary                   = _READ_SUMMARY,
+#if BUILDING_GCC_VERSION >= 4006
+       .write_optimization_summary     = _WRITE_OPTIMIZATION_SUMMARY,
+       .read_optimization_summary      = _READ_OPTIMIZATION_SUMMARY,
+#endif
+       .stmt_fixup                     = _STMT_FIXUP,
+       .function_transform_todo_flags_start    = FUNCTION_TRANSFORM_TODO_FLAGS_START,
+       .function_transform             = _FUNCTION_TRANSFORM,
+       .variable_transform             = _VARIABLE_TRANSFORM,
+#endif
+};
+
+#if BUILDING_GCC_VERSION >= 4009
+class _PASS_NAME_PASS : public ipa_opt_pass_d {
+public:
+       _PASS_NAME_PASS() : ipa_opt_pass_d(_PASS_NAME_PASS_DATA,
+                        g,
+                        _GENERATE_SUMMARY,
+                        _WRITE_SUMMARY,
+                        _READ_SUMMARY,
+                        _WRITE_OPTIMIZATION_SUMMARY,
+                        _READ_OPTIMIZATION_SUMMARY,
+                        _STMT_FIXUP,
+                        FUNCTION_TRANSFORM_TODO_FLAGS_START,
+                        _FUNCTION_TRANSFORM,
+                        _VARIABLE_TRANSFORM) {}
+
+#ifndef NO_GATE
+#if BUILDING_GCC_VERSION >= 5000
+       virtual bool gate(function *) { return _GATE(); }
+#else
+       virtual bool gate(void) { return _GATE(); }
+#endif
+#endif
+
+       virtual opt_pass *clone() { return new _PASS_NAME_PASS(); }
+
+#ifndef NO_EXECUTE
+#if BUILDING_GCC_VERSION >= 5000
+       virtual unsigned int execute(function *) { return _EXECUTE(); }
+#else
+       virtual unsigned int execute(void) { return _EXECUTE(); }
+#endif
+#endif
+};
+}
+
+opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+       return new _PASS_NAME_PASS();
+}
+#else
+struct opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+       return &_PASS_NAME_PASS.pass;
+}
+#endif
+
+/* clean up user provided defines */
+#undef PASS_NAME
+#undef NO_GENERATE_SUMMARY
+#undef NO_WRITE_SUMMARY
+#undef NO_READ_SUMMARY
+#undef NO_WRITE_OPTIMIZATION_SUMMARY
+#undef NO_READ_OPTIMIZATION_SUMMARY
+#undef NO_STMT_FIXUP
+#undef NO_FUNCTION_TRANSFORM
+#undef NO_VARIABLE_TRANSFORM
+#undef NO_GATE
+#undef NO_EXECUTE
+
+#undef FUNCTION_TRANSFORM_TODO_FLAGS_START
+#undef PROPERTIES_DESTROYED
+#undef PROPERTIES_PROVIDED
+#undef PROPERTIES_REQUIRED
+#undef TODO_FLAGS_FINISH
+#undef TODO_FLAGS_START
+
+/* clean up generated defines */
+#undef _EXECUTE
+#undef __EXECUTE
+#undef _FUNCTION_TRANSFORM
+#undef __FUNCTION_TRANSFORM
+#undef _GATE
+#undef __GATE
+#undef _GCC_PLUGIN_CONCAT2
+#undef _GCC_PLUGIN_CONCAT3
+#undef _GCC_PLUGIN_STRINGIFY
+#undef __GCC_PLUGIN_STRINGIFY
+#undef _GENERATE_SUMMARY
+#undef __GENERATE_SUMMARY
+#undef _HAS_EXECUTE
+#undef _HAS_GATE
+#undef _MAKE_PASS_NAME_PASS
+#undef __MAKE_PASS_NAME_PASS
+#undef _PASS_NAME_NAME
+#undef _PASS_NAME_PASS
+#undef __PASS_NAME_PASS
+#undef _PASS_NAME_PASS_DATA
+#undef __PASS_NAME_PASS_DATA
+#undef _READ_OPTIMIZATION_SUMMARY
+#undef __READ_OPTIMIZATION_SUMMARY
+#undef _READ_SUMMARY
+#undef __READ_SUMMARY
+#undef _STMT_FIXUP
+#undef __STMT_FIXUP
+#undef _VARIABLE_TRANSFORM
+#undef __VARIABLE_TRANSFORM
+#undef _WRITE_OPTIMIZATION_SUMMARY
+#undef __WRITE_OPTIMIZATION_SUMMARY
+#undef _WRITE_SUMMARY
+#undef __WRITE_SUMMARY
+
+#endif /* PASS_NAME */
diff --git a/scripts/gcc-plugins/gcc-generate-rtl-pass.h b/scripts/gcc-plugins/gcc-generate-rtl-pass.h
new file mode 100644 (file)
index 0000000..1dc67a5
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * Generator for RTL pass related boilerplate code/data
+ *
+ * Supports gcc 4.5-6
+ *
+ * Usage:
+ *
+ * 1. before inclusion define PASS_NAME
+ * 2. before inclusion define NO_* for unimplemented callbacks
+ *    NO_GATE
+ *    NO_EXECUTE
+ * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override
+ *    the default 0 values
+ * 4. for convenience, all the above will be undefined after inclusion!
+ * 5. the only exported name is make_PASS_NAME_pass() to register with gcc
+ */
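+
+/*
+ * Usage mirrors gcc-generate-gimple-pass.h; a minimal sketch with a
+ * hypothetical pass name "example":
+ *
+ *     static unsigned int example_execute(void) { return 0; }
+ *
+ *     #define PASS_NAME example
+ *     #define NO_GATE
+ *     #include "gcc-generate-rtl-pass.h"
+ */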
+
+#ifndef PASS_NAME
+#error at least PASS_NAME must be defined
+#else
+#define __GCC_PLUGIN_STRINGIFY(n)      #n
+#define _GCC_PLUGIN_STRINGIFY(n)       __GCC_PLUGIN_STRINGIFY(n)
+#define _GCC_PLUGIN_CONCAT2(x, y)      x ## y
+#define _GCC_PLUGIN_CONCAT3(x, y, z)   x ## y ## z
+
+#define __PASS_NAME_PASS_DATA(n)       _GCC_PLUGIN_CONCAT2(n, _pass_data)
+#define _PASS_NAME_PASS_DATA           __PASS_NAME_PASS_DATA(PASS_NAME)
+
+#define __PASS_NAME_PASS(n)            _GCC_PLUGIN_CONCAT2(n, _pass)
+#define _PASS_NAME_PASS                        __PASS_NAME_PASS(PASS_NAME)
+
+#define _PASS_NAME_NAME                        _GCC_PLUGIN_STRINGIFY(PASS_NAME)
+
+#define __MAKE_PASS_NAME_PASS(n)       _GCC_PLUGIN_CONCAT3(make_, n, _pass)
+#define _MAKE_PASS_NAME_PASS           __MAKE_PASS_NAME_PASS(PASS_NAME)
+
+#ifdef NO_GATE
+#define _GATE NULL
+#define _HAS_GATE false
+#else
+#define __GATE(n)                      _GCC_PLUGIN_CONCAT2(n, _gate)
+#define _GATE                          __GATE(PASS_NAME)
+#define _HAS_GATE true
+#endif
+
+#ifdef NO_EXECUTE
+#define _EXECUTE NULL
+#define _HAS_EXECUTE false
+#else
+#define __EXECUTE(n)                   _GCC_PLUGIN_CONCAT2(n, _execute)
+#define _EXECUTE                       __EXECUTE(PASS_NAME)
+#define _HAS_EXECUTE true
+#endif
+
+#ifndef PROPERTIES_REQUIRED
+#define PROPERTIES_REQUIRED 0
+#endif
+
+#ifndef PROPERTIES_PROVIDED
+#define PROPERTIES_PROVIDED 0
+#endif
+
+#ifndef PROPERTIES_DESTROYED
+#define PROPERTIES_DESTROYED 0
+#endif
+
+#ifndef TODO_FLAGS_START
+#define TODO_FLAGS_START 0
+#endif
+
+#ifndef TODO_FLAGS_FINISH
+#define TODO_FLAGS_FINISH 0
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+namespace {
+static const pass_data _PASS_NAME_PASS_DATA = {
+#else
+static struct rtl_opt_pass _PASS_NAME_PASS = {
+       .pass = {
+#endif
+               .type                   = RTL_PASS,
+               .name                   = _PASS_NAME_NAME,
+#if BUILDING_GCC_VERSION >= 4008
+               .optinfo_flags          = OPTGROUP_NONE,
+#endif
+#if BUILDING_GCC_VERSION >= 5000
+#elif BUILDING_GCC_VERSION == 4009
+               .has_gate               = _HAS_GATE,
+               .has_execute            = _HAS_EXECUTE,
+#else
+               .gate                   = _GATE,
+               .execute                = _EXECUTE,
+               .sub                    = NULL,
+               .next                   = NULL,
+               .static_pass_number     = 0,
+#endif
+               .tv_id                  = TV_NONE,
+               .properties_required    = PROPERTIES_REQUIRED,
+               .properties_provided    = PROPERTIES_PROVIDED,
+               .properties_destroyed   = PROPERTIES_DESTROYED,
+               .todo_flags_start       = TODO_FLAGS_START,
+               .todo_flags_finish      = TODO_FLAGS_FINISH,
+#if BUILDING_GCC_VERSION < 4009
+       }
+#endif
+};
+
+#if BUILDING_GCC_VERSION >= 4009
+class _PASS_NAME_PASS : public rtl_opt_pass {
+public:
+       _PASS_NAME_PASS() : rtl_opt_pass(_PASS_NAME_PASS_DATA, g) {}
+
+#ifndef NO_GATE
+#if BUILDING_GCC_VERSION >= 5000
+       virtual bool gate(function *) { return _GATE(); }
+#else
+       virtual bool gate(void) { return _GATE(); }
+#endif
+#endif
+
+       virtual opt_pass *clone() { return new _PASS_NAME_PASS(); }
+
+#ifndef NO_EXECUTE
+#if BUILDING_GCC_VERSION >= 5000
+       virtual unsigned int execute(function *) { return _EXECUTE(); }
+#else
+       virtual unsigned int execute(void) { return _EXECUTE(); }
+#endif
+#endif
+};
+}
+
+opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+       return new _PASS_NAME_PASS();
+}
+#else
+struct opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+       return &_PASS_NAME_PASS.pass;
+}
+#endif
+
+/* clean up user provided defines */
+#undef PASS_NAME
+#undef NO_GATE
+#undef NO_EXECUTE
+
+#undef PROPERTIES_DESTROYED
+#undef PROPERTIES_PROVIDED
+#undef PROPERTIES_REQUIRED
+#undef TODO_FLAGS_FINISH
+#undef TODO_FLAGS_START
+
+/* clean up generated defines */
+#undef _EXECUTE
+#undef __EXECUTE
+#undef _GATE
+#undef __GATE
+#undef _GCC_PLUGIN_CONCAT2
+#undef _GCC_PLUGIN_CONCAT3
+#undef _GCC_PLUGIN_STRINGIFY
+#undef __GCC_PLUGIN_STRINGIFY
+#undef _HAS_EXECUTE
+#undef _HAS_GATE
+#undef _MAKE_PASS_NAME_PASS
+#undef __MAKE_PASS_NAME_PASS
+#undef _PASS_NAME_NAME
+#undef _PASS_NAME_PASS
+#undef __PASS_NAME_PASS
+#undef _PASS_NAME_PASS_DATA
+#undef __PASS_NAME_PASS_DATA
+
+#endif /* PASS_NAME */
diff --git a/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h b/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h
new file mode 100644 (file)
index 0000000..a27e2b3
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * Generator for SIMPLE_IPA pass related boilerplate code/data
+ *
+ * Supports gcc 4.5-6
+ *
+ * Usage:
+ *
+ * 1. before inclusion define PASS_NAME
+ * 2. before inclusion define NO_* for unimplemented callbacks
+ *    NO_GATE
+ *    NO_EXECUTE
+ * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override
+ *    the default 0 values
+ * 4. for convenience, all the above will be undefined after inclusion!
+ * 5. the only exported name is make_PASS_NAME_pass() to register with gcc
+ */
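+
+/*
+ * Usage mirrors gcc-generate-gimple-pass.h; a minimal sketch with a
+ * hypothetical pass name "example":
+ *
+ *     static unsigned int example_execute(void) { return 0; }
+ *
+ *     #define PASS_NAME example
+ *     #define NO_GATE
+ *     #include "gcc-generate-simple_ipa-pass.h"
+ */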
+
+#ifndef PASS_NAME
+#error at least PASS_NAME must be defined
+#else
+#define __GCC_PLUGIN_STRINGIFY(n)      #n
+#define _GCC_PLUGIN_STRINGIFY(n)       __GCC_PLUGIN_STRINGIFY(n)
+#define _GCC_PLUGIN_CONCAT2(x, y)      x ## y
+#define _GCC_PLUGIN_CONCAT3(x, y, z)   x ## y ## z
+
+#define __PASS_NAME_PASS_DATA(n)       _GCC_PLUGIN_CONCAT2(n, _pass_data)
+#define _PASS_NAME_PASS_DATA           __PASS_NAME_PASS_DATA(PASS_NAME)
+
+#define __PASS_NAME_PASS(n)            _GCC_PLUGIN_CONCAT2(n, _pass)
+#define _PASS_NAME_PASS                        __PASS_NAME_PASS(PASS_NAME)
+
+#define _PASS_NAME_NAME                        _GCC_PLUGIN_STRINGIFY(PASS_NAME)
+
+#define __MAKE_PASS_NAME_PASS(n)       _GCC_PLUGIN_CONCAT3(make_, n, _pass)
+#define _MAKE_PASS_NAME_PASS           __MAKE_PASS_NAME_PASS(PASS_NAME)
+
+#ifdef NO_GATE
+#define _GATE NULL
+#define _HAS_GATE false
+#else
+#define __GATE(n)                      _GCC_PLUGIN_CONCAT2(n, _gate)
+#define _GATE                          __GATE(PASS_NAME)
+#define _HAS_GATE true
+#endif
+
+#ifdef NO_EXECUTE
+#define _EXECUTE NULL
+#define _HAS_EXECUTE false
+#else
+#define __EXECUTE(n)                   _GCC_PLUGIN_CONCAT2(n, _execute)
+#define _EXECUTE                       __EXECUTE(PASS_NAME)
+#define _HAS_EXECUTE true
+#endif
+
+#ifndef PROPERTIES_REQUIRED
+#define PROPERTIES_REQUIRED 0
+#endif
+
+#ifndef PROPERTIES_PROVIDED
+#define PROPERTIES_PROVIDED 0
+#endif
+
+#ifndef PROPERTIES_DESTROYED
+#define PROPERTIES_DESTROYED 0
+#endif
+
+#ifndef TODO_FLAGS_START
+#define TODO_FLAGS_START 0
+#endif
+
+#ifndef TODO_FLAGS_FINISH
+#define TODO_FLAGS_FINISH 0
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+namespace {
+static const pass_data _PASS_NAME_PASS_DATA = {
+#else
+static struct simple_ipa_opt_pass _PASS_NAME_PASS = {
+       .pass = {
+#endif
+               .type                   = SIMPLE_IPA_PASS,
+               .name                   = _PASS_NAME_NAME,
+#if BUILDING_GCC_VERSION >= 4008
+               .optinfo_flags          = OPTGROUP_NONE,
+#endif
+#if BUILDING_GCC_VERSION >= 5000
+#elif BUILDING_GCC_VERSION == 4009
+               .has_gate               = _HAS_GATE,
+               .has_execute            = _HAS_EXECUTE,
+#else
+               .gate                   = _GATE,
+               .execute                = _EXECUTE,
+               .sub                    = NULL,
+               .next                   = NULL,
+               .static_pass_number     = 0,
+#endif
+               .tv_id                  = TV_NONE,
+               .properties_required    = PROPERTIES_REQUIRED,
+               .properties_provided    = PROPERTIES_PROVIDED,
+               .properties_destroyed   = PROPERTIES_DESTROYED,
+               .todo_flags_start       = TODO_FLAGS_START,
+               .todo_flags_finish      = TODO_FLAGS_FINISH,
+#if BUILDING_GCC_VERSION < 4009
+       }
+#endif
+};
+
+#if BUILDING_GCC_VERSION >= 4009
+class _PASS_NAME_PASS : public simple_ipa_opt_pass {
+public:
+       _PASS_NAME_PASS() : simple_ipa_opt_pass(_PASS_NAME_PASS_DATA, g) {}
+
+#ifndef NO_GATE
+#if BUILDING_GCC_VERSION >= 5000
+       virtual bool gate(function *) { return _GATE(); }
+#else
+       virtual bool gate(void) { return _GATE(); }
+#endif
+#endif
+
+       virtual opt_pass *clone() { return new _PASS_NAME_PASS(); }
+
+#ifndef NO_EXECUTE
+#if BUILDING_GCC_VERSION >= 5000
+       virtual unsigned int execute(function *) { return _EXECUTE(); }
+#else
+       virtual unsigned int execute(void) { return _EXECUTE(); }
+#endif
+#endif
+};
+}
+
+opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+       return new _PASS_NAME_PASS();
+}
+#else
+struct opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+       return &_PASS_NAME_PASS.pass;
+}
+#endif
+
+/* clean up user provided defines */
+#undef PASS_NAME
+#undef NO_GATE
+#undef NO_EXECUTE
+
+#undef PROPERTIES_DESTROYED
+#undef PROPERTIES_PROVIDED
+#undef PROPERTIES_REQUIRED
+#undef TODO_FLAGS_FINISH
+#undef TODO_FLAGS_START
+
+/* clean up generated defines */
+#undef _EXECUTE
+#undef __EXECUTE
+#undef _GATE
+#undef __GATE
+#undef _GCC_PLUGIN_CONCAT2
+#undef _GCC_PLUGIN_CONCAT3
+#undef _GCC_PLUGIN_STRINGIFY
+#undef __GCC_PLUGIN_STRINGIFY
+#undef _HAS_EXECUTE
+#undef _HAS_GATE
+#undef _MAKE_PASS_NAME_PASS
+#undef __MAKE_PASS_NAME_PASS
+#undef _PASS_NAME_NAME
+#undef _PASS_NAME_PASS
+#undef __PASS_NAME_PASS
+#undef _PASS_NAME_PASS_DATA
+#undef __PASS_NAME_PASS_DATA
+
+#endif /* PASS_NAME */
diff --git a/scripts/gcc-plugins/sancov_plugin.c b/scripts/gcc-plugins/sancov_plugin.c
new file mode 100644 (file)
index 0000000..aedd611
--- /dev/null
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2016 by Emese Revfy <re.emese@gmail.com>
+ * Licensed under the GPL v2, or (at your option) v3
+ *
+ * Homepage:
+ * https://github.com/ephox-gcc-plugins/sancov
+ *
+ * This plugin inserts a __sanitizer_cov_trace_pc() call at the start of basic blocks.
+ * It supports all gcc versions with plugin support (from gcc-4.5 on).
+ * It is based on the commit "Add fuzzing coverage support" by Dmitry Vyukov <dvyukov@google.com>.
+ *
+ * You can read more about it here:
+ *  https://gcc.gnu.org/viewcvs/gcc?limit_changes=0&view=revision&revision=231296
+ *  http://lwn.net/Articles/674854/
+ *  https://github.com/google/syzkaller
+ *  https://lwn.net/Articles/677764/
+ *
+ * Usage:
+ * make run
+ */
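+
+/*
+ * Illustrative manual invocation (assuming the built plugin object is named
+ * sancov_plugin.so; the kernel build normally passes these flags itself):
+ *
+ *     gcc -fplugin=./sancov_plugin.so -c test.c
+ *     gcc -fplugin=./sancov_plugin.so \
+ *         -fplugin-arg-sancov_plugin-no-sancov -c test.c    # pass disabled
+ */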
+
+#include "gcc-common.h"
+
+int plugin_is_GPL_compatible;
+
+tree sancov_fndecl;
+
+static struct plugin_info sancov_plugin_info = {
+       .version        = "20160402",
+       .help           = "sancov plugin\n",
+};
+
+static unsigned int sancov_execute(void)
+{
+       basic_block bb;
+
+       /* Remove this line once this plugin and kcov are in the kernel.
+       if (!strcmp(DECL_NAME_POINTER(current_function_decl), DECL_NAME_POINTER(sancov_fndecl)))
+               return 0;
+       */
+
+       FOR_EACH_BB_FN(bb, cfun) {
+               const_gimple stmt;
+               gcall *gcall;
+               gimple_stmt_iterator gsi = gsi_after_labels(bb);
+
+               if (gsi_end_p(gsi))
+                       continue;
+
+               stmt = gsi_stmt(gsi);
+               gcall = as_a_gcall(gimple_build_call(sancov_fndecl, 0));
+               gimple_set_location(gcall, gimple_location(stmt));
+               gsi_insert_before(&gsi, gcall, GSI_SAME_STMT);
+       }
+       return 0;
+}
+
+#define PASS_NAME sancov
+
+#define NO_GATE
+#define TODO_FLAGS_FINISH TODO_dump_func | TODO_verify_stmts | TODO_update_ssa_no_phi | TODO_verify_flow
+
+#include "gcc-generate-gimple-pass.h"
+
+static void sancov_start_unit(void __unused *gcc_data, void __unused *user_data)
+{
+       tree leaf_attr, nothrow_attr;
+       tree BT_FN_VOID = build_function_type_list(void_type_node, NULL_TREE);
+
+       sancov_fndecl = build_fn_decl("__sanitizer_cov_trace_pc", BT_FN_VOID);
+
+       DECL_ASSEMBLER_NAME(sancov_fndecl);
+       TREE_PUBLIC(sancov_fndecl) = 1;
+       DECL_EXTERNAL(sancov_fndecl) = 1;
+       DECL_ARTIFICIAL(sancov_fndecl) = 1;
+       DECL_PRESERVE_P(sancov_fndecl) = 1;
+       DECL_UNINLINABLE(sancov_fndecl) = 1;
+       TREE_USED(sancov_fndecl) = 1;
+
+       nothrow_attr = tree_cons(get_identifier("nothrow"), NULL, NULL);
+       decl_attributes(&sancov_fndecl, nothrow_attr, 0);
+       gcc_assert(TREE_NOTHROW(sancov_fndecl));
+#if BUILDING_GCC_VERSION > 4005
+       leaf_attr = tree_cons(get_identifier("leaf"), NULL, NULL);
+       decl_attributes(&sancov_fndecl, leaf_attr, 0);
+#endif
+}
+
+int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gcc_version *version)
+{
+       int i;
+       struct register_pass_info sancov_plugin_pass_info;
+       const char * const plugin_name = plugin_info->base_name;
+       const int argc = plugin_info->argc;
+       const struct plugin_argument * const argv = plugin_info->argv;
+       bool enable = true;
+
+       static const struct ggc_root_tab gt_ggc_r_gt_sancov[] = {
+               {
+                       .base = &sancov_fndecl,
+                       .nelt = 1,
+                       .stride = sizeof(sancov_fndecl),
+                       .cb = &gt_ggc_mx_tree_node,
+                       .pchw = &gt_pch_nx_tree_node
+               },
+               LAST_GGC_ROOT_TAB
+       };
+
+       /* BBs can be split afterwards?? */
+       sancov_plugin_pass_info.pass                            = make_sancov_pass();
+#if BUILDING_GCC_VERSION >= 4009
+       sancov_plugin_pass_info.reference_pass_name             = "asan";
+#else
+       sancov_plugin_pass_info.reference_pass_name             = "nrv";
+#endif
+       sancov_plugin_pass_info.ref_pass_instance_number        = 0;
+       sancov_plugin_pass_info.pos_op                          = PASS_POS_INSERT_BEFORE;
+
+       if (!plugin_default_version_check(version, &gcc_version)) {
+               error(G_("incompatible gcc/plugin versions"));
+               return 1;
+       }
+
+       for (i = 0; i < argc; ++i) {
+               if (!strcmp(argv[i].key, "no-sancov")) {
+                       enable = false;
+                       continue;
+               }
+               error(G_("unknown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key);
+       }
+
+       register_callback(plugin_name, PLUGIN_INFO, NULL, &sancov_plugin_info);
+
+       if (!enable)
+               return 0;
+
+#if BUILDING_GCC_VERSION < 6000
+       register_callback(plugin_name, PLUGIN_START_UNIT, &sancov_start_unit, NULL);
+       register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, NULL, (void *)&gt_ggc_r_gt_sancov);
+       register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, &sancov_plugin_pass_info);
+#endif
+
+       return 0;
+}
index f0f6d9d..4f727eb 100755 (executable)
@@ -180,7 +180,7 @@ else
 fi;
 
 # final build of init/
-${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init
+${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init GCC_PLUGINS_CFLAGS="${GCC_PLUGINS_CFLAGS}"
 
 kallsymso=""
 kallsyms_vmlinux=""
index 86e56fe..e1c09e2 100755 (executable)
@@ -26,6 +26,8 @@ create_package() {
        # Fix ownership and permissions
        chown -R root:root "$pdir"
        chmod -R go-w "$pdir"
+       # in case we are in a restrictive umask environment like 0077
+       chmod -R a+rX "$pdir"
 
        # Create the package
        dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch}" -p$pname -P"$pdir"
@@ -238,7 +240,8 @@ maintainer="$name <$email>"
 # Try to determine distribution
 if [ -n "$KDEB_CHANGELOG_DIST" ]; then
         distribution=$KDEB_CHANGELOG_DIST
-elif distribution=$(lsb_release -cs 2>/dev/null) && [ -n "$distribution" ]; then
+# In some cases lsb_release returns the codename as n/a, which breaks dpkg-parsechangelog
+elif distribution=$(lsb_release -cs 2>/dev/null) && [ -n "$distribution" ] && [ "$distribution" != "n/a" ]; then
         : # nothing to do in this case
 else
         distribution="unstable"
@@ -322,13 +325,14 @@ fi
 
 # Build kernel header package
 (cd $srctree; find . -name Makefile\* -o -name Kconfig\* -o -name \*.pl) > "$objtree/debian/hdrsrcfiles"
-if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then
-       (cd $srctree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrsrcfiles"
-fi
 (cd $srctree; find arch/*/include include scripts -type f) >> "$objtree/debian/hdrsrcfiles"
 (cd $srctree; find arch/$SRCARCH -name module.lds -o -name Kbuild.platforms -o -name Platform) >> "$objtree/debian/hdrsrcfiles"
 (cd $srctree; find $(find arch/$SRCARCH -name include -o -name scripts -type d) -type f) >> "$objtree/debian/hdrsrcfiles"
+if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then
+       (cd $objtree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrobjfiles"
+fi
 (cd $objtree; find arch/$SRCARCH/include Module.symvers include scripts -type f) >> "$objtree/debian/hdrobjfiles"
+(cd $objtree; find scripts/gcc-plugins -name \*.so -o -name gcc-common.h) >> "$objtree/debian/hdrobjfiles"
 destdir=$kernel_headers_dir/usr/src/linux-headers-$version
 mkdir -p "$destdir"
 (cd $srctree; tar -c -f - -T -) < "$objtree/debian/hdrsrcfiles" | (cd $destdir; tar -xf -)
index 63d91e2..966dd39 100755 (executable)
@@ -143,7 +143,7 @@ fi
 if test -e include/config/auto.conf; then
        . include/config/auto.conf
 else
-       echo "Error: kernelrelease not valid - run 'make prepare' to update it"
+       echo "Error: kernelrelease not valid - run 'make prepare' to update it" >&2
        exit 1
 fi
 
index e5d6108..b0cc1a3 100644 (file)
@@ -16,9 +16,6 @@ config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
 
-config KVM_APIC_ARCHITECTURE
-       bool
-
 config KVM_MMIO
        bool
 
index 3a3a699..7cffd93 100644 (file)
 
 #include <asm/kvm_hyp.h>
 
-#ifdef CONFIG_KVM_NEW_VGIC
-extern struct vgic_global kvm_vgic_global_state;
-#define vgic_v2_params kvm_vgic_global_state
-#else
-extern struct vgic_params vgic_v2_params;
-#endif
-
 static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
                                            void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
        u32 eisr0, eisr1;
        int i;
        bool expect_mi;
@@ -74,7 +67,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
 static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
        u32 elrsr0, elrsr1;
 
        elrsr0 = readl_relaxed(base + GICH_ELRSR0);
@@ -93,7 +86,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
        int i;
 
        for (i = 0; i < nr_lr; i++) {
@@ -147,7 +140,7 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
        struct vgic_dist *vgic = &kvm->arch.vgic;
        void __iomem *base = kern_hyp_va(vgic->vctrl_base);
-       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
        int i;
        u64 live_lrs = 0;
 
diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
deleted file mode 100644 (file)
index 1b0bee0..0000000
+++ /dev/null
@@ -1,856 +0,0 @@
-/*
- * Contains GICv2 specific emulation code, was in vgic.c before.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-#define GICC_ARCH_VERSION_V2           0x2
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
-static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
-{
-       return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;
-}
-
-static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
-                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg;
-       u32 word_offset = offset & 3;
-
-       switch (offset & ~3) {
-       case 0:                 /* GICD_CTLR */
-               reg = vcpu->kvm->arch.vgic.enabled;
-               vgic_reg_access(mmio, &reg, word_offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-               if (mmio->is_write) {
-                       vcpu->kvm->arch.vgic.enabled = reg & 1;
-                       vgic_update_state(vcpu->kvm);
-                       return true;
-               }
-               break;
-
-       case 4:                 /* GICD_TYPER */
-               reg  = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
-               reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
-               vgic_reg_access(mmio, &reg, word_offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               break;
-
-       case 8:                 /* GICD_IIDR */
-               reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-               vgic_reg_access(mmio, &reg, word_offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               break;
-       }
-
-       return false;
-}
-
-static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
-                                      struct kvm_exit_mmio *mmio,
-                                      phys_addr_t offset)
-{
-       return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                     vcpu->vcpu_id, ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
-                                        struct kvm_exit_mmio *mmio,
-                                        phys_addr_t offset)
-{
-       return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                     vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
-                                       struct kvm_exit_mmio *mmio,
-                                       phys_addr_t offset)
-{
-       return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-                                          vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
-                                         struct kvm_exit_mmio *mmio,
-                                         phys_addr_t offset)
-{
-       return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-                                            vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
-                                      struct kvm_exit_mmio *mmio,
-                                      phys_addr_t offset)
-{
-       return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-                                         vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
-                                        struct kvm_exit_mmio *mmio,
-                                        phys_addr_t offset)
-{
-       return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-                                           vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
-                                    struct kvm_exit_mmio *mmio,
-                                    phys_addr_t offset)
-{
-       u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-                                       vcpu->vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       return false;
-}
-
-#define GICD_ITARGETSR_SIZE    32
-#define GICD_CPUTARGETS_BITS   8
-#define GICD_IRQS_PER_ITARGETSR        (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
-static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int i;
-       u32 val = 0;
-
-       irq -= VGIC_NR_PRIVATE_IRQS;
-
-       for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
-               val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
-
-       return val;
-}
-
-static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int i, c;
-       unsigned long *bmap;
-       u32 target;
-
-       irq -= VGIC_NR_PRIVATE_IRQS;
-
-       /*
-        * Pick the LSB in each byte. This ensures we target exactly
-        * one vcpu per IRQ. If the byte is null, assume we target
-        * CPU0.
-        */
-       for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
-               int shift = i * GICD_CPUTARGETS_BITS;
-
-               target = ffs((val >> shift) & 0xffU);
-               target = target ? (target - 1) : 0;
-               dist->irq_spi_cpu[irq + i] = target;
-               kvm_for_each_vcpu(c, vcpu, kvm) {
-                       bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
-                       if (c == target)
-                               set_bit(irq + i, bmap);
-                       else
-                               clear_bit(irq + i, bmap);
-               }
-       }
-}
-
-static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
-                                  struct kvm_exit_mmio *mmio,
-                                  phys_addr_t offset)
-{
-       u32 reg;
-
-       /* We treat the banked interrupts targets as read-only */
-       if (offset < 32) {
-               u32 roreg;
-
-               roreg = 1 << vcpu->vcpu_id;
-               roreg |= roreg << 8;
-               roreg |= roreg << 16;
-
-               vgic_reg_access(mmio, &roreg, offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       if (mmio->is_write) {
-               vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
-               vgic_update_state(vcpu->kvm);
-               return true;
-       }
-
-       return false;
-}
-
-static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
-                               struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 *reg;
-
-       reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-                                 vcpu->vcpu_id, offset >> 1);
-
-       return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
-                               struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
-       if (mmio->is_write) {
-               vgic_dispatch_sgi(vcpu, reg);
-               vgic_update_state(vcpu->kvm);
-               return true;
-       }
-
-       return false;
-}
-
-/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */
-static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
-                                       struct kvm_exit_mmio *mmio,
-                                       phys_addr_t offset)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       int sgi;
-       int min_sgi = (offset & ~0x3);
-       int max_sgi = min_sgi + 3;
-       int vcpu_id = vcpu->vcpu_id;
-       u32 reg = 0;
-
-       /* Copy source SGIs from distributor side */
-       for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
-               u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
-               reg |= ((u32)sources) << (8 * (sgi - min_sgi));
-       }
-
-       mmio_data_write(mmio, ~0, reg);
-       return false;
-}
-
-static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
-                                        struct kvm_exit_mmio *mmio,
-                                        phys_addr_t offset, bool set)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       int sgi;
-       int min_sgi = (offset & ~0x3);
-       int max_sgi = min_sgi + 3;
-       int vcpu_id = vcpu->vcpu_id;
-       u32 reg;
-       bool updated = false;
-
-       reg = mmio_data_read(mmio, ~0);
-
-       /* Clear pending SGIs on the distributor */
-       for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
-               u8 mask = reg >> (8 * (sgi - min_sgi));
-               u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
-               if (set) {
-                       if ((*src & mask) != mask)
-                               updated = true;
-                       *src |= mask;
-               } else {
-                       if (*src & mask)
-                               updated = true;
-                       *src &= ~mask;
-               }
-       }
-
-       if (updated)
-               vgic_update_state(vcpu->kvm);
-
-       return updated;
-}
-
-static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
-                               struct kvm_exit_mmio *mmio,
-                               phys_addr_t offset)
-{
-       if (!mmio->is_write)
-               return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
-       else
-               return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
-}
-
-static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset)
-{
-       if (!mmio->is_write)
-               return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
-       else
-               return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
-}
-
-static const struct vgic_io_range vgic_dist_ranges[] = {
-       {
-               .base           = GIC_DIST_SOFTINT,
-               .len            = 4,
-               .handle_mmio    = handle_mmio_sgi_reg,
-       },
-       {
-               .base           = GIC_DIST_CTRL,
-               .len            = 12,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_misc,
-       },
-       {
-               .base           = GIC_DIST_IGROUP,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GIC_DIST_ENABLE_SET,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_enable_reg,
-       },
-       {
-               .base           = GIC_DIST_ENABLE_CLEAR,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_enable_reg,
-       },
-       {
-               .base           = GIC_DIST_PENDING_SET,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_pending_reg,
-       },
-       {
-               .base           = GIC_DIST_PENDING_CLEAR,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_pending_reg,
-       },
-       {
-               .base           = GIC_DIST_ACTIVE_SET,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_active_reg,
-       },
-       {
-               .base           = GIC_DIST_ACTIVE_CLEAR,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_active_reg,
-       },
-       {
-               .base           = GIC_DIST_PRI,
-               .len            = VGIC_MAX_IRQS,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_priority_reg,
-       },
-       {
-               .base           = GIC_DIST_TARGET,
-               .len            = VGIC_MAX_IRQS,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_target_reg,
-       },
-       {
-               .base           = GIC_DIST_CONFIG,
-               .len            = VGIC_MAX_IRQS / 4,
-               .bits_per_irq   = 2,
-               .handle_mmio    = handle_mmio_cfg_reg,
-       },
-       {
-               .base           = GIC_DIST_SGI_PENDING_CLEAR,
-               .len            = VGIC_NR_SGIS,
-               .handle_mmio    = handle_mmio_sgi_clear,
-       },
-       {
-               .base           = GIC_DIST_SGI_PENDING_SET,
-               .len            = VGIC_NR_SGIS,
-               .handle_mmio    = handle_mmio_sgi_set,
-       },
-       {}
-};
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int nrcpus = atomic_read(&kvm->online_vcpus);
-       u8 target_cpus;
-       int sgi, mode, c, vcpu_id;
-
-       vcpu_id = vcpu->vcpu_id;
-
-       sgi = reg & 0xf;
-       target_cpus = (reg >> 16) & 0xff;
-       mode = (reg >> 24) & 3;
-
-       switch (mode) {
-       case 0:
-               if (!target_cpus)
-                       return;
-               break;
-
-       case 1:
-               target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
-               break;
-
-       case 2:
-               target_cpus = 1 << vcpu_id;
-               break;
-       }
-
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               if (target_cpus & 1) {
-                       /* Flag the SGI as pending */
-                       vgic_dist_irq_set_pending(vcpu, sgi);
-                       *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
-                       kvm_debug("SGI%d from CPU%d to CPU%d\n",
-                                 sgi, vcpu_id, c);
-               }
-
-               target_cpus >>= 1;
-       }
-}
-
-static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long sources;
-       int vcpu_id = vcpu->vcpu_id;
-       int c;
-
-       sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
-
-       for_each_set_bit(c, &sources, dist->nr_cpus) {
-               if (vgic_queue_irq(vcpu, c, irq))
-                       clear_bit(c, &sources);
-       }
-
-       *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
-
-       /*
-        * If the sources bitmap has been cleared it means that we
-        * could queue all the SGIs onto link registers (see the
-        * clear_bit above), and therefore we are done with them in
-        * our emulated gic and can get rid of them.
-        */
-       if (!sources) {
-               vgic_dist_irq_clear_pending(vcpu, irq);
-               vgic_cpu_irq_clear(vcpu, irq);
-               return true;
-       }
-
-       return false;
-}
-
-/**
- * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs
- * @kvm: pointer to the kvm struct
- *
- * Map the virtual CPU interface into the VM before running any VCPUs.  We
- * can't do this at creation time, because user space must first set the
- * virtual CPU interface address in the guest physical address space.
- */
-static int vgic_v2_map_resources(struct kvm *kvm,
-                                const struct vgic_params *params)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int ret = 0;
-
-       if (!irqchip_in_kernel(kvm))
-               return 0;
-
-       mutex_lock(&kvm->lock);
-
-       if (vgic_ready(kvm))
-               goto out;
-
-       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-           IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
-               kvm_err("Need to set vgic cpu and dist addresses first\n");
-               ret = -ENXIO;
-               goto out;
-       }
-
-       vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
-                                KVM_VGIC_V2_DIST_SIZE,
-                                vgic_dist_ranges, -1, &dist->dist_iodev);
-
-       /*
-        * Initialize the vgic if this hasn't already been done on demand by
-        * accessing the vgic state from userspace.
-        */
-       ret = vgic_init(kvm);
-       if (ret) {
-               kvm_err("Unable to allocate maps\n");
-               goto out_unregister;
-       }
-
-       ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
-                                   params->vcpu_base, KVM_VGIC_V2_CPU_SIZE,
-                                   true);
-       if (ret) {
-               kvm_err("Unable to remap VGIC CPU to VCPU\n");
-               goto out_unregister;
-       }
-
-       dist->ready = true;
-       goto out;
-
-out_unregister:
-       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
-
-out:
-       if (ret)
-               kvm_vgic_destroy(kvm);
-       mutex_unlock(&kvm->lock);
-       return ret;
-}
-
-static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       *vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source;
-}
-
-static int vgic_v2_init_model(struct kvm *kvm)
-{
-       int i;
-
-       for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
-               vgic_set_target_reg(kvm, 0, i);
-
-       return 0;
-}
-
-void vgic_v2_init_emulation(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       dist->vm_ops.queue_sgi = vgic_v2_queue_sgi;
-       dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source;
-       dist->vm_ops.init_model = vgic_v2_init_model;
-       dist->vm_ops.map_resources = vgic_v2_map_resources;
-
-       kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
-}
-
-static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
-                                struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       bool updated = false;
-       struct vgic_vmcr vmcr;
-       u32 *vmcr_field;
-       u32 reg;
-
-       vgic_get_vmcr(vcpu, &vmcr);
-
-       switch (offset & ~0x3) {
-       case GIC_CPU_CTRL:
-               vmcr_field = &vmcr.ctlr;
-               break;
-       case GIC_CPU_PRIMASK:
-               vmcr_field = &vmcr.pmr;
-               break;
-       case GIC_CPU_BINPOINT:
-               vmcr_field = &vmcr.bpr;
-               break;
-       case GIC_CPU_ALIAS_BINPOINT:
-               vmcr_field = &vmcr.abpr;
-               break;
-       default:
-               BUG();
-       }
-
-       if (!mmio->is_write) {
-               reg = *vmcr_field;
-               mmio_data_write(mmio, ~0, reg);
-       } else {
-               reg = mmio_data_read(mmio, ~0);
-               if (reg != *vmcr_field) {
-                       *vmcr_field = reg;
-                       vgic_set_vmcr(vcpu, &vmcr);
-                       updated = true;
-               }
-       }
-       return updated;
-}
-
-static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
-                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
-}
-
-static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset)
-{
-       u32 reg;
-
-       if (mmio->is_write)
-               return false;
-
-       /* GICC_IIDR */
-       reg = (PRODUCT_ID_KVM << 20) |
-             (GICC_ARCH_VERSION_V2 << 16) |
-             (IMPLEMENTER_ARM << 0);
-       mmio_data_write(mmio, ~0, reg);
-       return false;
-}
-
-/*
- * CPU Interface Register accesses - these are not accessed by the VM, but by
- * user space for saving and restoring VGIC state.
- */
-static const struct vgic_io_range vgic_cpu_ranges[] = {
-       {
-               .base           = GIC_CPU_CTRL,
-               .len            = 12,
-               .handle_mmio    = handle_cpu_mmio_misc,
-       },
-       {
-               .base           = GIC_CPU_ALIAS_BINPOINT,
-               .len            = 4,
-               .handle_mmio    = handle_mmio_abpr,
-       },
-       {
-               .base           = GIC_CPU_ACTIVEPRIO,
-               .len            = 16,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GIC_CPU_IDENT,
-               .len            = 4,
-               .handle_mmio    = handle_cpu_mmio_ident,
-       },
-};
-
-static int vgic_attr_regs_access(struct kvm_device *dev,
-                                struct kvm_device_attr *attr,
-                                u32 *reg, bool is_write)
-{
-       const struct vgic_io_range *r = NULL, *ranges;
-       phys_addr_t offset;
-       int ret, cpuid, c;
-       struct kvm_vcpu *vcpu, *tmp_vcpu;
-       struct vgic_dist *vgic;
-       struct kvm_exit_mmio mmio;
-       u32 data;
-
-       offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-       cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
-               KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
-       mutex_lock(&dev->kvm->lock);
-
-       ret = vgic_init(dev->kvm);
-       if (ret)
-               goto out;
-
-       if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       vcpu = kvm_get_vcpu(dev->kvm, cpuid);
-       vgic = &dev->kvm->arch.vgic;
-
-       mmio.len = 4;
-       mmio.is_write = is_write;
-       mmio.data = &data;
-       if (is_write)
-               mmio_data_write(&mmio, ~0, *reg);
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-               mmio.phys_addr = vgic->vgic_dist_base + offset;
-               ranges = vgic_dist_ranges;
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               mmio.phys_addr = vgic->vgic_cpu_base + offset;
-               ranges = vgic_cpu_ranges;
-               break;
-       default:
-               BUG();
-       }
-       r = vgic_find_range(ranges, 4, offset);
-
-       if (unlikely(!r || !r->handle_mmio)) {
-               ret = -ENXIO;
-               goto out;
-       }
-
-       spin_lock(&vgic->lock);
-
-       /*
-        * Ensure that no other VCPU is running by checking the vcpu->cpu
-        * field.  If no other VCPUs are running we can safely access the VGIC
-        * state, because even if another VCPU is run after this point, that
-        * VCPU will not touch the vgic state, because it will block on
-        * getting the vgic->lock in kvm_vgic_sync_hwstate().
-        */
-       kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
-               if (unlikely(tmp_vcpu->cpu != -1)) {
-                       ret = -EBUSY;
-                       goto out_vgic_unlock;
-               }
-       }
-
-       /*
-        * Move all pending IRQs from the LRs on all VCPUs so the pending
-        * state can be properly represented in the register state accessible
-        * through this API.
-        */
-       kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
-               vgic_unqueue_irqs(tmp_vcpu);
-
-       offset -= r->base;
-       r->handle_mmio(vcpu, &mmio, offset);
-
-       if (!is_write)
-               *reg = mmio_data_read(&mmio, ~0);
-
-       ret = 0;
-out_vgic_unlock:
-       spin_unlock(&vgic->lock);
-out:
-       mutex_unlock(&dev->kvm->lock);
-       return ret;
-}
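The attr encoding consumed by vgic_attr_regs_access() can be exercised from userspace through the KVM_GET/SET_DEVICE_ATTR ioctls on the vgic device fd. Below is a minimal sketch, assuming the layout described in Documentation/virtual/kvm/devices/arm-vgic.txt (vcpu index in bits [39:32] of attr, register offset in bits [31:0]) and the ARM uapi headers; read_gicc_pmr() and the 0x04 offset (GICC_PMR) are illustrative choices, and error handling is omitted.

/*
 * Hypothetical userspace sketch: read the virtual GICC_PMR of one VCPU
 * through the vgic device fd, using the attr encoding that
 * vgic_attr_regs_access() decodes above.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static uint32_t read_gicc_pmr(int vgic_fd, unsigned int vcpu_idx)
{
	uint32_t reg = 0;
	struct kvm_device_attr attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CPU_REGS,
		/* bits [39:32]: vcpu index, bits [31:0]: register offset */
		.attr  = ((uint64_t)vcpu_idx << 32) | 0x04 /* GICC_PMR */,
		.addr  = (uint64_t)(unsigned long)&reg,
	};

	ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
	return reg;
}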
-
-static int vgic_v2_create(struct kvm_device *dev, u32 type)
-{
-       return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v2_destroy(struct kvm_device *dev)
-{
-       kfree(dev);
-}
-
-static int vgic_v2_set_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_set_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 reg;
-
-               if (get_user(reg, uaddr))
-                       return -EFAULT;
-
-               return vgic_attr_regs_access(dev, attr, &reg, true);
-       }
-
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v2_get_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_get_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 reg = 0;
-
-               ret = vgic_attr_regs_access(dev, attr, &reg, false);
-               if (ret)
-                       return ret;
-               return put_user(reg, uaddr);
-       }
-
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v2_has_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       phys_addr_t offset;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR:
-               switch (attr->attr) {
-               case KVM_VGIC_V2_ADDR_TYPE_DIST:
-               case KVM_VGIC_V2_ADDR_TYPE_CPU:
-                       return 0;
-               }
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-               offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-               return vgic_has_attr_regs(vgic_dist_ranges, offset);
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-               return vgic_has_attr_regs(vgic_cpu_ranges, offset);
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-               return 0;
-       case KVM_DEV_ARM_VGIC_GRP_CTRL:
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       return 0;
-               }
-       }
-       return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v2_ops = {
-       .name = "kvm-arm-vgic-v2",
-       .create = vgic_v2_create,
-       .destroy = vgic_v2_destroy,
-       .set_attr = vgic_v2_set_attr,
-       .get_attr = vgic_v2_get_attr,
-       .has_attr = vgic_v2_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
deleted file mode 100644 (file)
index 334cd7a..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-       struct vgic_lr lr_desc;
-       u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr];
-
-       lr_desc.irq     = val & GICH_LR_VIRTUALID;
-       if (lr_desc.irq <= 15)
-               lr_desc.source  = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
-       else
-               lr_desc.source = 0;
-       lr_desc.state   = 0;
-
-       if (val & GICH_LR_PENDING_BIT)
-               lr_desc.state |= LR_STATE_PENDING;
-       if (val & GICH_LR_ACTIVE_BIT)
-               lr_desc.state |= LR_STATE_ACTIVE;
-       if (val & GICH_LR_EOI)
-               lr_desc.state |= LR_EOI_INT;
-       if (val & GICH_LR_HW) {
-               lr_desc.state |= LR_HW;
-               lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
-       }
-
-       return lr_desc;
-}
-
-static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
-                          struct vgic_lr lr_desc)
-{
-       u32 lr_val;
-
-       lr_val = lr_desc.irq;
-
-       if (lr_desc.state & LR_STATE_PENDING)
-               lr_val |= GICH_LR_PENDING_BIT;
-       if (lr_desc.state & LR_STATE_ACTIVE)
-               lr_val |= GICH_LR_ACTIVE_BIT;
-       if (lr_desc.state & LR_EOI_INT)
-               lr_val |= GICH_LR_EOI;
-
-       if (lr_desc.state & LR_HW) {
-               lr_val |= GICH_LR_HW;
-               lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
-       }
-
-       if (lr_desc.irq < VGIC_NR_SGIS)
-               lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
-
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
-
-       if (!(lr_desc.state & LR_STATE_MASK))
-               vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
-       else
-               vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr);
-}
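For reference, a standalone sketch of the list-register encoding performed by vgic_v2_set_lr() above, assuming the usual GICv2 GICH_LR layout implied by the masks (virtual ID in bits [9:0], SGI source CPU in bits [12:10], pending state in bit 28); the resulting value is just a worked example, not kernel code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t irq = 3, source = 1;	/* pending SGI 3 from CPU 1 */
	uint32_t lr_val = irq;

	lr_val |= 1U << 28;		/* LR_STATE_PENDING */
	lr_val |= source << 10;		/* SGI source CPU id */

	printf("GICH_LR value: 0x%08x\n", lr_val);	/* 0x10000403 */
	return 0;
}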
-
-static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr;
-}
-
-static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr;
-}
-
-static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0;
-}
-
-static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
-       u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr;
-       u32 ret = 0;
-
-       if (misr & GICH_MISR_EOI)
-               ret |= INT_STATUS_EOI;
-       if (misr & GICH_MISR_U)
-               ret |= INT_STATUS_UNDERFLOW;
-
-       return ret;
-}
-
-static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE;
-}
-
-static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE;
-}
-
-static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
-
-       vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT;
-       vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT;
-       vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT;
-       vmcrp->pmr  = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT;
-}
-
-static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       u32 vmcr;
-
-       vmcr  = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
-       vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK;
-       vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK;
-       vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
-
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
-}
-
-static void vgic_v2_enable(struct kvm_vcpu *vcpu)
-{
-       /*
-        * By forcing VMCR to zero, the GIC will restore the binary
-        * points to their reset values. Anything else resets to zero
-        * anyway.
-        */
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
-
-       /* Get the show on the road... */
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v2_ops = {
-       .get_lr                 = vgic_v2_get_lr,
-       .set_lr                 = vgic_v2_set_lr,
-       .get_elrsr              = vgic_v2_get_elrsr,
-       .get_eisr               = vgic_v2_get_eisr,
-       .clear_eisr             = vgic_v2_clear_eisr,
-       .get_interrupt_status   = vgic_v2_get_interrupt_status,
-       .enable_underflow       = vgic_v2_enable_underflow,
-       .disable_underflow      = vgic_v2_disable_underflow,
-       .get_vmcr               = vgic_v2_get_vmcr,
-       .set_vmcr               = vgic_v2_set_vmcr,
-       .enable                 = vgic_v2_enable,
-};
-
-struct vgic_params __section(.hyp.text) vgic_v2_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
-       struct vgic_params *vgic = params;
-       int i;
-
-       for (i = 0; i < vgic->nr_lr; i++)
-               writel_relaxed(0, vgic->vctrl_base + GICH_LR0 + (i * 4));
-}
-
-/**
- * vgic_v2_probe - probe for a GICv2 compatible interrupt controller
- * @gic_kvm_info:      pointer to the GIC description
- * @ops:               address of a pointer to the GICv2 operations
- * @params:            address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv2 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
-                  const struct vgic_ops **ops,
-                  const struct vgic_params **params)
-{
-       int ret;
-       struct vgic_params *vgic = &vgic_v2_params;
-       const struct resource *vctrl_res = &gic_kvm_info->vctrl;
-       const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
-       memset(vgic, 0, sizeof(*vgic));
-
-       if (!gic_kvm_info->maint_irq) {
-               kvm_err("error getting vgic maintenance irq\n");
-               ret = -ENXIO;
-               goto out;
-       }
-       vgic->maint_irq = gic_kvm_info->maint_irq;
-
-       if (!gic_kvm_info->vctrl.start) {
-               kvm_err("GICH not present in the firmware table\n");
-               ret = -ENXIO;
-               goto out;
-       }
-
-       vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start,
-                                  resource_size(&gic_kvm_info->vctrl));
-       if (!vgic->vctrl_base) {
-               kvm_err("Cannot ioremap GICH\n");
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR);
-       vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;
-
-       ret = create_hyp_io_mappings(vgic->vctrl_base,
-                                    vgic->vctrl_base + resource_size(vctrl_res),
-                                    vctrl_res->start);
-       if (ret) {
-               kvm_err("Cannot map VCTRL into hyp\n");
-               goto out_unmap;
-       }
-
-       if (!PAGE_ALIGNED(vcpu_res->start)) {
-               kvm_err("GICV physical address 0x%llx not page aligned\n",
-                       (unsigned long long)vcpu_res->start);
-               ret = -ENXIO;
-               goto out_unmap;
-       }
-
-       if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
-               kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-                       (unsigned long long)resource_size(vcpu_res),
-                       PAGE_SIZE);
-               ret = -ENXIO;
-               goto out_unmap;
-       }
-
-       vgic->can_emulate_gicv2 = true;
-       kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
-
-       vgic->vcpu_base = vcpu_res->start;
-
-       kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n",
-                gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq);
-
-       vgic->type = VGIC_V2;
-       vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS;
-
-       on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
-       *ops = &vgic_v2_ops;
-       *params = vgic;
-       goto out;
-
-out_unmap:
-       iounmap(vgic->vctrl_base);
-out:
-       return ret;
-}
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
deleted file mode 100644 (file)
index e661e7f..0000000
+++ /dev/null
@@ -1,1074 +0,0 @@
-/*
- * GICv3 distributor and redistributor emulation
- *
- * GICv3 emulation is currently only supported on a GICv3 host (because
- * we rely on the hardware's CPU interface virtualization support), but
- * it supports hardware both with and without the optional GICv2 backwards
- * compatibility features.
- *
- * Limitations of the emulation:
- * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
- * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
- * - We do not support the message based interrupts (MBIs) triggered by
- *   writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
- * - We do not support the (optional) backwards compatibility feature.
- *   GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
- *   the compatibility feature, you can use a GICv2 in the guest, though.
- * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
- * - Priorities are not emulated (same as the GICv2 emulation). Linux
- *   as a guest is fine with this, because it does not use priorities.
- * - We only support Group1 interrupts. Again Linux uses only those.
- *
- * Copyright (C) 2014 ARM Ltd.
- * Author: Andre Przywara <andre.przywara@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <kvm/arm_vgic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
-                              struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg = 0xffffffff;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-       return false;
-}
-
-static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
-                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg = 0;
-
-       /*
-        * Force ARE and DS to 1, the guest cannot change this.
-        * For the time being we only support Group1 interrupts.
-        */
-       if (vcpu->kvm->arch.vgic.enabled)
-               reg = GICD_CTLR_ENABLE_SS_G1;
-       reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       if (mmio->is_write) {
-               vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
-               vgic_update_state(vcpu->kvm);
-               return true;
-       }
-       return false;
-}
-
-/*
- * As this implementation does not provide compatibility
- * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
- * LPIs and MBIs are not supported either, so we set the respective bits to 0.
- * We also report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
- */
-#define INTERRUPT_ID_BITS 10
-static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
-                             struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg;
-
-       reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
-
-       reg |= (INTERRUPT_ID_BITS - 1) << 19;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-       return false;
-}
-
-static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
-                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg;
-
-       reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-       return false;
-}
-
-static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
-                                           struct kvm_exit_mmio *mmio,
-                                           phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                             vcpu->vcpu_id,
-                                             ACCESS_WRITE_SETBIT);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
-                                             struct kvm_exit_mmio *mmio,
-                                             phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                             vcpu->vcpu_id,
-                                             ACCESS_WRITE_CLEARBIT);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
-                                            struct kvm_exit_mmio *mmio,
-                                            phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-                                                  vcpu->vcpu_id);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
-                                              struct kvm_exit_mmio *mmio,
-                                              phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-                                                    vcpu->vcpu_id);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu,
-                                           struct kvm_exit_mmio *mmio,
-                                           phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-                                                  vcpu->vcpu_id);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu,
-                                             struct kvm_exit_mmio *mmio,
-                                             phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-                                                   vcpu->vcpu_id);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
-                                         struct kvm_exit_mmio *mmio,
-                                         phys_addr_t offset)
-{
-       u32 *reg;
-
-       if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
-               vgic_reg_access(mmio, NULL, offset,
-                               ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-                                  vcpu->vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-               ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       return false;
-}
-
-static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
-                                    struct kvm_exit_mmio *mmio,
-                                    phys_addr_t offset)
-{
-       u32 *reg;
-
-       if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
-               vgic_reg_access(mmio, NULL, offset,
-                               ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-                                 vcpu->vcpu_id, offset >> 1);
-
-       return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-/*
- * We use a compressed version of the MPIDR (the four affinity levels packed
- * into one 32-bit word) when we store the target MPIDR written by the guest.
- */
-static u32 compress_mpidr(unsigned long mpidr)
-{
-       u32 ret;
-
-       ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-       ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
-       ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
-       ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
-
-       return ret;
-}
-
-static unsigned long uncompress_mpidr(u32 value)
-{
-       unsigned long mpidr;
-
-       mpidr  = ((value >>  0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
-       mpidr |= ((value >>  8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
-       mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
-       mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
-
-       return mpidr;
-}
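A standalone sketch of the round trip above, assuming the arm64 MPIDR layout (Aff0-Aff2 in bits [23:0], Aff3 in bits [39:32]); compress() mirrors compress_mpidr() by packing the four 8-bit affinity fields into consecutive bytes of a 32-bit word.

#include <stdint.h>
#include <stdio.h>

static uint32_t compress(uint64_t mpidr)
{
	return ( (mpidr        & 0xff)       ) |	/* Aff0 -> bits [7:0]   */
	       (((mpidr >>  8) & 0xff) <<  8) |		/* Aff1 -> bits [15:8]  */
	       (((mpidr >> 16) & 0xff) << 16) |		/* Aff2 -> bits [23:16] */
	       (((mpidr >> 32) & 0xff) << 24);		/* Aff3 -> bits [31:24] */
}

int main(void)
{
	/* Aff3=0x01, Aff2=0x00, Aff1=0x02, Aff0=0x03 */
	uint64_t mpidr = 0x0000000100000203ULL;

	printf("compressed: 0x%08x\n", (unsigned int)compress(mpidr));	/* 0x01000203 */
	return 0;
}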
-
-/*
- * Lookup the given MPIDR value to get the vcpu_id (if there is one)
- * and store that in the irq_spi_cpu[] array.
- * This limits the number of VCPUs to 255 for now; extending the data
- * type (or storing kvm_vcpu pointers) would lift the limit.
- * Store the original MPIDR value in an extra array to support read-as-written.
- * Unallocated MPIDRs are translated to a special value and caught
- * before any array accesses.
- */
-static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int spi;
-       u32 reg;
-       int vcpu_id;
-       unsigned long *bmap, mpidr;
-
-       /*
-        * The upper 32 bits of each 64 bit register are zero,
-        * as we don't support Aff3.
-        */
-       if ((offset & 4)) {
-               vgic_reg_access(mmio, NULL, offset,
-                               ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       /* This region only covers SPIs, so no handling of private IRQs here. */
-       spi = offset / 8;
-
-       /* get the stored MPIDR for this IRQ */
-       mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
-       reg = mpidr;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-
-       if (!mmio->is_write)
-               return false;
-
-       /*
-        * Now clear the currently assigned vCPU from the map, making room
-        * for the new one to be written below
-        */
-       vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
-       if (likely(vcpu)) {
-               vcpu_id = vcpu->vcpu_id;
-               bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
-               __clear_bit(spi, bmap);
-       }
-
-       dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
-       vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
-
-       /*
-        * The spec says that non-existent MPIDR values should not be
-        * forwarded to any existing (v)CPU, but should be able to become
-        * pending anyway. We simply keep the irq_spi_target[] array empty, so
-        * the interrupt will never be injected.
-        * irq_spi_cpu[irq] gets a magic value in this case.
-        */
-       if (likely(vcpu)) {
-               vcpu_id = vcpu->vcpu_id;
-               dist->irq_spi_cpu[spi] = vcpu_id;
-               bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
-               __set_bit(spi, bmap);
-       } else {
-               dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
-       }
-
-       vgic_update_state(kvm);
-
-       return true;
-}
-
-/*
- * We should be careful about promising too much when a guest reads
- * this register. Don't claim to be like any hardware implementation,
- * but just report the GIC as version 3 - which is what a Linux guest
- * would check.
- */
-static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
-                              struct kvm_exit_mmio *mmio,
-                              phys_addr_t offset)
-{
-       u32 reg = 0;
-
-       switch (offset + GICD_IDREGS) {
-       case GICD_PIDR2:
-               reg = 0x3b;
-               break;
-       }
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-       return false;
-}
-
-static const struct vgic_io_range vgic_v3_dist_ranges[] = {
-       {
-               .base           = GICD_CTLR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_ctlr,
-       },
-       {
-               .base           = GICD_TYPER,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_typer,
-       },
-       {
-               .base           = GICD_IIDR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_iidr,
-       },
-       {
-               /* this register is optional, it is RAZ/WI if not implemented */
-               .base           = GICD_STATUSR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this write only register is WI when TYPER.MBIS=0 */
-               .base           = GICD_SETSPI_NSR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this write only register is WI when TYPER.MBIS=0 */
-               .base           = GICD_CLRSPI_NSR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when DS=1 */
-               .base           = GICD_SETSPI_SR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when DS=1 */
-               .base           = GICD_CLRSPI_SR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICD_IGROUPR,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_rao_wi,
-       },
-       {
-               .base           = GICD_ISENABLER,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_enable_reg_dist,
-       },
-       {
-               .base           = GICD_ICENABLER,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_enable_reg_dist,
-       },
-       {
-               .base           = GICD_ISPENDR,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_pending_reg_dist,
-       },
-       {
-               .base           = GICD_ICPENDR,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_pending_reg_dist,
-       },
-       {
-               .base           = GICD_ISACTIVER,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_active_reg_dist,
-       },
-       {
-               .base           = GICD_ICACTIVER,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_active_reg_dist,
-       },
-       {
-               .base           = GICD_IPRIORITYR,
-               .len            = 0x400,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_priority_reg_dist,
-       },
-       {
-               /* TARGETSRn is RES0 when ARE=1 */
-               .base           = GICD_ITARGETSR,
-               .len            = 0x400,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICD_ICFGR,
-               .len            = 0x100,
-               .bits_per_irq   = 2,
-               .handle_mmio    = handle_mmio_cfg_reg_dist,
-       },
-       {
-               /* this is RAZ/WI when DS=1 */
-               .base           = GICD_IGRPMODR,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when DS=1 */
-               .base           = GICD_NSACR,
-               .len            = 0x100,
-               .bits_per_irq   = 2,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when ARE=1 */
-               .base           = GICD_SGIR,
-               .len            = 0x04,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when ARE=1 */
-               .base           = GICD_CPENDSGIR,
-               .len            = 0x10,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when ARE=1 */
-               .base           = GICD_SPENDSGIR,
-               .len            = 0x10,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICD_IROUTER + 0x100,
-               .len            = 0x1ee0,
-               .bits_per_irq   = 64,
-               .handle_mmio    = handle_mmio_route_reg,
-       },
-       {
-               .base           = GICD_IDREGS,
-               .len            = 0x30,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_idregs,
-       },
-       {},
-};
-
-static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
-                                   struct kvm_exit_mmio *mmio,
-                                   phys_addr_t offset)
-{
-       /* since we don't support LPIs, this register is zero for now */
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
-                                    struct kvm_exit_mmio *mmio,
-                                    phys_addr_t offset)
-{
-       u32 reg;
-       u64 mpidr;
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-       int target_vcpu_id = redist_vcpu->vcpu_id;
-
-       /* the upper 32 bits contain the affinity value */
-       if ((offset & ~3) == 4) {
-               mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
-               reg = compress_mpidr(mpidr);
-
-               vgic_reg_access(mmio, &reg, offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = redist_vcpu->vcpu_id << 8;
-       if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
-               reg |= GICR_TYPER_LAST;
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
-                                             struct kvm_exit_mmio *mmio,
-                                             phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                     redist_vcpu->vcpu_id,
-                                     ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
-                                               struct kvm_exit_mmio *mmio,
-                                               phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                     redist_vcpu->vcpu_id,
-                                     ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu,
-                                             struct kvm_exit_mmio *mmio,
-                                             phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-                                         redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu,
-                                               struct kvm_exit_mmio *mmio,
-                                               phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-                                            redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
-                                              struct kvm_exit_mmio *mmio,
-                                              phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-                                          redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
-                                                struct kvm_exit_mmio *mmio,
-                                                phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-                                            redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
-                                           struct kvm_exit_mmio *mmio,
-                                           phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-       u32 *reg;
-
-       reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-                                  redist_vcpu->vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       return false;
-}
-
-static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
-                                      struct kvm_exit_mmio *mmio,
-                                      phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-                                      redist_vcpu->vcpu_id, offset >> 1);
-
-       return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-#define SGI_base(x) ((x) + SZ_64K)
-
-static const struct vgic_io_range vgic_redist_ranges[] = {
-       {
-               .base           = GICR_CTLR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_ctlr_redist,
-       },
-       {
-               .base           = GICR_TYPER,
-               .len            = 0x08,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_typer_redist,
-       },
-       {
-               .base           = GICR_IIDR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_iidr,
-       },
-       {
-               .base           = GICR_WAKER,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICR_IDREGS,
-               .len            = 0x30,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_idregs,
-       },
-       {
-               .base           = SGI_base(GICR_IGROUPR0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_rao_wi,
-       },
-       {
-               .base           = SGI_base(GICR_ISENABLER0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_enable_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ICENABLER0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_enable_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ISPENDR0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_pending_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ICPENDR0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_pending_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ISACTIVER0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_active_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ICACTIVER0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_active_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_IPRIORITYR0),
-               .len            = 0x20,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_priority_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ICFGR0),
-               .len            = 0x08,
-               .bits_per_irq   = 2,
-               .handle_mmio    = handle_mmio_cfg_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_IGRPMODR0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = SGI_base(GICR_NSACR),
-               .len            = 0x04,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {},
-};
-
-static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-       if (vgic_queue_irq(vcpu, 0, irq)) {
-               vgic_dist_irq_clear_pending(vcpu, irq);
-               vgic_cpu_irq_clear(vcpu, irq);
-               return true;
-       }
-
-       return false;
-}
-
-static int vgic_v3_map_resources(struct kvm *kvm,
-                                const struct vgic_params *params)
-{
-       int ret = 0;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       gpa_t rdbase = dist->vgic_redist_base;
-       struct vgic_io_device *iodevs = NULL;
-       int i;
-
-       if (!irqchip_in_kernel(kvm))
-               return 0;
-
-       mutex_lock(&kvm->lock);
-
-       if (vgic_ready(kvm))
-               goto out;
-
-       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-           IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
-               kvm_err("Need to set vgic distributor addresses first\n");
-               ret = -ENXIO;
-               goto out;
-       }
-
-       /*
-        * For a VGICv3 we require userspace to explicitly initialize
-        * the VGIC before it can be used.
-        */
-       if (!vgic_initialized(kvm)) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
-                                      GIC_V3_DIST_SIZE, vgic_v3_dist_ranges,
-                                      -1, &dist->dist_iodev);
-       if (ret)
-               goto out;
-
-       iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL);
-       if (!iodevs) {
-               ret = -ENOMEM;
-               goto out_unregister;
-       }
-
-       for (i = 0; i < dist->nr_cpus; i++) {
-               ret = vgic_register_kvm_io_dev(kvm, rdbase,
-                                              SZ_128K, vgic_redist_ranges,
-                                              i, &iodevs[i]);
-               if (ret)
-                       goto out_unregister;
-               rdbase += GIC_V3_REDIST_SIZE;
-       }
-
-       dist->redist_iodevs = iodevs;
-       dist->ready = true;
-       goto out;
-
-out_unregister:
-       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
-       if (iodevs) {
-               for (i = 0; i < dist->nr_cpus; i++) {
-                       if (iodevs[i].dev.ops)
-                               kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-                                                         &iodevs[i].dev);
-               }
-       }
-
-out:
-       if (ret)
-               kvm_vgic_destroy(kvm);
-       mutex_unlock(&kvm->lock);
-       return ret;
-}
-
-static int vgic_v3_init_model(struct kvm *kvm)
-{
-       int i;
-       u32 mpidr;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
-       dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
-                                     GFP_KERNEL);
-
-       if (!dist->irq_spi_mpidr)
-               return -ENOMEM;
-
-       /* Initialize the target VCPUs for each IRQ to VCPU 0 */
-       mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
-       for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
-               dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
-               dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
-               vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
-       }
-
-       return 0;
-}
-
-/* GICv3 does not keep track of SGI sources anymore. */
-static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-}
-
-void vgic_v3_init_emulation(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
-       dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
-       dist->vm_ops.init_model = vgic_v3_init_model;
-       dist->vm_ops.map_resources = vgic_v3_map_resources;
-
-       kvm->arch.max_vcpus = KVM_MAX_VCPUS;
-}
-
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
-       unsigned long affinity;
-       int level0;
-
-       /*
-        * Split the current VCPU's MPIDR into affinity level 0 and the
-        * rest as this is what we have to compare against.
-        */
-       affinity = kvm_vcpu_get_mpidr_aff(vcpu);
-       level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
-       affinity &= ~MPIDR_LEVEL_MASK;
-
-       /* bail out if the upper three levels don't match */
-       if (sgi_aff != affinity)
-               return -1;
-
-       /* Is this VCPU's bit set in the mask? */
-       if (!(sgi_cpu_mask & BIT(level0)))
-               return -1;
-
-       return level0;
-}
-
-#define SGI_AFFINITY_LEVEL(reg, level) \
-       ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
-       >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/**
- * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
- * @vcpu: The VCPU requesting a SGI
- * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
- *
- * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
- * This will trap in sys_regs.c and call this function.
- * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
- * target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all VCPUs except
- * the calling one.
- */
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct kvm_vcpu *c_vcpu;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       u16 target_cpus;
-       u64 mpidr;
-       int sgi, c;
-       int vcpu_id = vcpu->vcpu_id;
-       bool broadcast;
-       int updated = 0;
-
-       sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
-       broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
-       target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
-       mpidr = SGI_AFFINITY_LEVEL(reg, 3);
-       mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
-       mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
-       /*
-        * We take the dist lock here, because we come from the sysregs
-        * code path and not from the MMIO one (which already takes the lock).
-        */
-       spin_lock(&dist->lock);
-
-       /*
-        * We iterate over all VCPUs to find the MPIDRs matching the request.
-        * If we have handled one CPU, we clear its bit to detect early
-        * if we are already finished. This avoids iterating through all
-        * VCPUs when most of the time we just signal a single VCPU.
-        */
-       kvm_for_each_vcpu(c, c_vcpu, kvm) {
-
-               /* Exit early if we have dealt with all requested CPUs */
-               if (!broadcast && target_cpus == 0)
-                       break;
-
-               /* Don't signal the calling VCPU */
-               if (broadcast && c == vcpu_id)
-                       continue;
-
-               if (!broadcast) {
-                       int level0;
-
-                       level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
-                       if (level0 == -1)
-                               continue;
-
-                       /* remove this matching VCPU from the mask */
-                       target_cpus &= ~BIT(level0);
-               }
-
-               /* Flag the SGI as pending */
-               vgic_dist_irq_set_pending(c_vcpu, sgi);
-               updated = 1;
-               kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
-       }
-       if (updated)
-               vgic_update_state(vcpu->kvm);
-       spin_unlock(&dist->lock);
-       if (updated)
-               vgic_kick_vcpus(vcpu->kvm);
-}
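A standalone sketch decoding an ICC_SGI1R_EL1 value along the lines of the function above, assuming the GICv3 field layout used by the ICC_SGI1R_* masks (TargetList in bits [15:0], Aff1 in bits [23:16], SGI number in bits [27:24], routing-mode bit 40); the sample value is purely illustrative.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* SGI 5 to CPUs with Aff3.Aff2.Aff1 = 0.0.1, Aff0 in {0, 2} */
	uint64_t reg = (5ULL << 24) | (1ULL << 16) | 0x5;

	unsigned int sgi     = (reg >> 24) & 0xf;
	unsigned int aff1    = (reg >> 16) & 0xff;
	unsigned int targets = reg & 0xffff;
	int broadcast        = (reg >> 40) & 0x1;

	printf("SGI%u, Aff1=%u, target mask=0x%04x, broadcast=%d\n",
	       sgi, aff1, targets, broadcast);
	return 0;
}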
-
-static int vgic_v3_create(struct kvm_device *dev, u32 type)
-{
-       return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v3_destroy(struct kvm_device *dev)
-{
-       kfree(dev);
-}
-
-static int vgic_v3_set_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_set_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               return -ENXIO;
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v3_get_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_get_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               return -ENXIO;
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v3_has_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR:
-               switch (attr->attr) {
-               case KVM_VGIC_V2_ADDR_TYPE_DIST:
-               case KVM_VGIC_V2_ADDR_TYPE_CPU:
-                       return -ENXIO;
-               case KVM_VGIC_V3_ADDR_TYPE_DIST:
-               case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-                       return 0;
-               }
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               return -ENXIO;
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-               return 0;
-       case KVM_DEV_ARM_VGIC_GRP_CTRL:
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       return 0;
-               }
-       }
-       return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v3_ops = {
-       .name = "kvm-arm-vgic-v3",
-       .create = vgic_v3_create,
-       .destroy = vgic_v3_destroy,
-       .set_attr = vgic_v3_set_attr,
-       .get_attr = vgic_v3_get_attr,
-       .has_attr = vgic_v3_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
deleted file mode 100644 (file)
index 75b02fa..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (C) 2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-
-static u32 ich_vtr_el2;
-
-static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-       struct vgic_lr lr_desc;
-       u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr];
-
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-               lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK;
-       else
-               lr_desc.irq = val & GICH_LR_VIRTUALID;
-
-       lr_desc.source = 0;
-       if (lr_desc.irq <= 15 &&
-           vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
-               lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
-
-       lr_desc.state = 0;
-
-       if (val & ICH_LR_PENDING_BIT)
-               lr_desc.state |= LR_STATE_PENDING;
-       if (val & ICH_LR_ACTIVE_BIT)
-               lr_desc.state |= LR_STATE_ACTIVE;
-       if (val & ICH_LR_EOI)
-               lr_desc.state |= LR_EOI_INT;
-       if (val & ICH_LR_HW) {
-               lr_desc.state |= LR_HW;
-               lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
-       }
-
-       return lr_desc;
-}
-
-static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
-                          struct vgic_lr lr_desc)
-{
-       u64 lr_val;
-
-       lr_val = lr_desc.irq;
-
-       /*
-        * Currently all guest IRQs are Group1, as Group0 would result
-        * in a FIQ in the guest, which it wouldn't expect.
-        * Eventually we want to make this configurable, so we may revisit
-        * this in the future.
-        */
-       switch (vcpu->kvm->arch.vgic.vgic_model) {
-       case KVM_DEV_TYPE_ARM_VGIC_V3:
-               lr_val |= ICH_LR_GROUP;
-               break;
-       case  KVM_DEV_TYPE_ARM_VGIC_V2:
-               if (lr_desc.irq < VGIC_NR_SGIS)
-                       lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
-               break;
-       default:
-               BUG();
-       }
-
-       if (lr_desc.state & LR_STATE_PENDING)
-               lr_val |= ICH_LR_PENDING_BIT;
-       if (lr_desc.state & LR_STATE_ACTIVE)
-               lr_val |= ICH_LR_ACTIVE_BIT;
-       if (lr_desc.state & LR_EOI_INT)
-               lr_val |= ICH_LR_EOI;
-       if (lr_desc.state & LR_HW) {
-               lr_val |= ICH_LR_HW;
-               lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
-       }
-
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = lr_val;
-
-       if (!(lr_desc.state & LR_STATE_MASK))
-               vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
-       else
-               vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr);
-}
-
-static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr;
-}
-
-static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr;
-}
-
-static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0;
-}
-
-static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
-       u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr;
-       u32 ret = 0;
-
-       if (misr & ICH_MISR_EOI)
-               ret |= INT_STATUS_EOI;
-       if (misr & ICH_MISR_U)
-               ret |= INT_STATUS_UNDERFLOW;
-
-       return ret;
-}
-
-static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
-
-       vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
-       vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-       vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-       vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-}
-
-static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE;
-}
-
-static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE;
-}
-
-static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       u32 vmcr;
-
-       vmcr  = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
-       vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
-       vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
-       vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
-
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
-}
-
-static void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       /*
-        * By forcing VMCR to zero, the GIC will restore the binary
-        * points to their reset values. Anything else resets to zero
-        * anyway.
-        */
-       vgic_v3->vgic_vmcr = 0;
-       vgic_v3->vgic_elrsr = ~0;
-
-       /*
-        * If we are emulating a GICv3, we do it in a non-GICv2-compatible
-        * way, so we force SRE to 1 to demonstrate this to the guest.
-        * This goes with the spec allowing the value to be RAO/WI.
-        */
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-               vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
-       else
-               vgic_v3->vgic_sre = 0;
-
-       /* Get the show on the road... */
-       vgic_v3->vgic_hcr = ICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v3_ops = {
-       .get_lr                 = vgic_v3_get_lr,
-       .set_lr                 = vgic_v3_set_lr,
-       .get_elrsr              = vgic_v3_get_elrsr,
-       .get_eisr               = vgic_v3_get_eisr,
-       .clear_eisr             = vgic_v3_clear_eisr,
-       .get_interrupt_status   = vgic_v3_get_interrupt_status,
-       .enable_underflow       = vgic_v3_enable_underflow,
-       .disable_underflow      = vgic_v3_disable_underflow,
-       .get_vmcr               = vgic_v3_get_vmcr,
-       .set_vmcr               = vgic_v3_set_vmcr,
-       .enable                 = vgic_v3_enable,
-};
-
-static struct vgic_params vgic_v3_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
-       kvm_call_hyp(__vgic_v3_init_lrs);
-}
-
-/**
- * vgic_v3_probe - probe for a GICv3 compatible interrupt controller
- * @gic_kvm_info:      pointer to the GIC description
- * @ops:               address of a pointer to the GICv3 operations
- * @params:            address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv3 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-                 const struct vgic_ops **ops,
-                 const struct vgic_params **params)
-{
-       int ret = 0;
-       struct vgic_params *vgic = &vgic_v3_params;
-       const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
-       vgic->maint_irq = gic_kvm_info->maint_irq;
-
-       ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
-
-       /*
-        * The ListRegs field is 5 bits, but there is an architectural
-        * maximum of 16 list registers. Just ignore bit 4...
-        */
-       vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
-       vgic->can_emulate_gicv2 = false;
-
-       if (!vcpu_res->start) {
-               kvm_info("GICv3: no GICV resource entry\n");
-               vgic->vcpu_base = 0;
-       } else if (!PAGE_ALIGNED(vcpu_res->start)) {
-               pr_warn("GICV physical address 0x%llx not page aligned\n",
-                       (unsigned long long)vcpu_res->start);
-               vgic->vcpu_base = 0;
-       } else if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
-               pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-                       (unsigned long long)resource_size(vcpu_res),
-                       PAGE_SIZE);
-       } else {
-               vgic->vcpu_base = vcpu_res->start;
-               vgic->can_emulate_gicv2 = true;
-               kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-                                       KVM_DEV_TYPE_ARM_VGIC_V2);
-       }
-       if (vgic->vcpu_base == 0)
-               kvm_info("disabling GICv2 emulation\n");
-       kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3);
-
-       vgic->vctrl_base = NULL;
-       vgic->type = VGIC_V3;
-       vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS;
-
-       kvm_info("GICV base=0x%llx, IRQ=%d\n",
-                vgic->vcpu_base, vgic->maint_irq);
-
-       on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
-       *ops = &vgic_v3_ops;
-       *params = vgic;
-
-       return ret;
-}
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
deleted file mode 100644 (file)
index 67cb5e9..0000000
+++ /dev/null
@@ -1,2417 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <trace/events/kvm.h>
-#include <asm/kvm.h>
-#include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-/*
- * How the whole thing works (courtesy of Christoffer Dall):
- *
- * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
- *   something is pending on the CPU interface.
- * - Interrupts that are pending on the distributor are stored on the
- *   vgic.irq_pending vgic bitmap (this bitmap is updated by both user land
- *   ioctls and guest mmio ops, and other in-kernel peripherals such as the
- *   arch. timers).
- * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
- *   recalculated
- * - To calculate the oracle, we need info for each cpu from
- *   compute_pending_for_cpu, which considers:
- *   - PPI: dist->irq_pending & dist->irq_enable
- *   - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target
- *   - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn
- *     registers, stored on each vcpu. We only keep one bit of
- *     information per interrupt, making sure that only one vcpu can
- *     accept the interrupt.
- * - If any of the above state changes, we must recalculate the oracle.
- * - The same is true when injecting an interrupt, except that we only
- *   consider a single interrupt at a time. The irq_spi_cpu array
- *   contains the target CPU for each SPI.
- *
- * The handling of level interrupts adds some extra complexity. We
- * need to track when the interrupt has been EOIed, so we can sample
- * the 'line' again. This is achieved as such:
- *
- * - When a level interrupt is moved onto a vcpu, the corresponding
- *   bit in irq_queued is set. As long as this bit is set, the line
- *   will be ignored for further interrupts. The interrupt is injected
- *   into the vcpu with the GICH_LR_EOI bit set (generate a
- *   maintenance interrupt on EOI).
- * - When the interrupt is EOIed, the maintenance interrupt fires,
- *   and clears the corresponding bit in irq_queued. This allows the
- *   interrupt line to be sampled again.
- * - Note that level-triggered interrupts can also be set to pending from
- *   writes to GICD_ISPENDRn and lowering the external input line does not
- *   cause the interrupt to become inactive in such a situation.
- *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
- *   inactive as long as the external input line is held high.
- *
- *
- * Initialization rules: there are multiple stages to the vgic
- * initialization, both for the distributor and the CPU interfaces.
- *
- * Distributor:
- *
- * - kvm_vgic_early_init(): initialization of static data that doesn't
- *   depend on any sizing information or emulation type. No allocation
- *   is allowed there.
- *
- * - vgic_init(): allocation and initialization of the generic data
- *   structures that depend on sizing information (number of CPUs,
- *   number of interrupts). Also initializes the vcpu specific data
- *   structures. Can be executed lazily for GICv2.
- *   [to be renamed to kvm_vgic_init??]
- *
- * CPU Interface:
- *
- * - kvm_vgic_cpu_early_init(): initialization of static data that
- *   doesn't depend on any sizing information or emulation type. No
- *   allocation is allowed there.
- */
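
A minimal sketch of the pending "oracle" described above, assuming plain
fixed-width bitmaps instead of the kernel's vgic_bitmap structures (the
toy_* names are illustrative only, not part of this file):

    #include <stdbool.h>
    #include <stdint.h>

    struct toy_dist {
            uint32_t ppi_pending, ppi_enabled;      /* private IRQs of one vcpu */
            uint32_t spi_pending, spi_enabled;      /* shared IRQs */
            uint32_t spi_target[8];                 /* SPI target bits, per vcpu */
    };

    /* The "oracle": does this vcpu have anything pending on its CPU interface? */
    static bool toy_pending_for_cpu(const struct toy_dist *d, int vcpu_id)
    {
            uint32_t ppi = d->ppi_pending & d->ppi_enabled;
            uint32_t spi = d->spi_pending & d->spi_enabled & d->spi_target[vcpu_id];

            return ppi || spi;
    }

    int main(void)
    {
            struct toy_dist d = {
                    .ppi_pending = 1u << 3, .ppi_enabled = 1u << 3,
                    .spi_target  = { [0] = ~0u },   /* route every SPI to vcpu 0 */
            };

            return toy_pending_for_cpu(&d, 0) ? 0 : 1;
    }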
-
-#include "vgic.h"
-
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
-static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
-                                               int virt_irq);
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
-
-static const struct vgic_ops *vgic_ops;
-static const struct vgic_params *vgic;
-
-static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-       vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source);
-}
-
-static bool queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-       return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq);
-}
-
-int kvm_vgic_map_resources(struct kvm *kvm)
-{
-       return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic);
-}
-
-/*
- * struct vgic_bitmap contains a bitmap made of unsigned longs, but
- * extracts u32s out of them.
- *
- * This does not work on 64-bit BE systems, because the bitmap access
- * will store two consecutive 32-bit words with the higher-addressed
- * register's bits at the lower index and the lower-addressed register's
- * bits at the higher index.
- *
- * Therefore, swizzle the register index when accessing the 32-bit word
- * registers to access the right register's value.
- */
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64
-#define REG_OFFSET_SWIZZLE     1
-#else
-#define REG_OFFSET_SWIZZLE     0
-#endif
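
A small, self-contained sketch of what REG_OFFSET_SWIZZLE buys: with the
swizzle set to 1 (64-bit big-endian hosts), 32-bit register indices are
XOR-ed with 1 so that registers 0/1, 2/3, ... land on the u32 word that
actually holds their bits inside each 64-bit long. The program below only
illustrates the index mapping and is not part of this file:

    #include <stdio.h>

    int main(void)
    {
            for (int swizzle = 0; swizzle <= 1; swizzle++)
                    for (int reg = 0; reg < 4; reg++)
                            printf("swizzle=%d: register %d -> u32 word %d\n",
                                   swizzle, reg, reg ^ swizzle);
            return 0;
    }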
-
-static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs)
-{
-       int nr_longs;
-
-       nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
-
-       b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL);
-       if (!b->private)
-               return -ENOMEM;
-
-       b->shared = b->private + nr_cpus;
-
-       return 0;
-}
-
-static void vgic_free_bitmap(struct vgic_bitmap *b)
-{
-       kfree(b->private);
-       b->private = NULL;
-       b->shared = NULL;
-}
-
-/*
- * Call this function to convert a u64 value to an unsigned long * bitmask
- * in a way that works on both 32-bit and 64-bit LE and BE platforms.
- *
- * Warning: Calling this function may modify *val.
- */
-static unsigned long *u64_to_bitmask(u64 *val)
-{
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
-       *val = (*val >> 32) | (*val << 32);
-#endif
-       return (unsigned long *)val;
-}
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset)
-{
-       offset >>= 2;
-       if (!offset)
-               return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE;
-       else
-               return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE);
-}
-
-static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
-                                  int cpuid, int irq)
-{
-       if (irq < VGIC_NR_PRIVATE_IRQS)
-               return test_bit(irq, x->private + cpuid);
-
-       return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared);
-}
-
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
-                            int irq, int val)
-{
-       unsigned long *reg;
-
-       if (irq < VGIC_NR_PRIVATE_IRQS) {
-               reg = x->private + cpuid;
-       } else {
-               reg = x->shared;
-               irq -= VGIC_NR_PRIVATE_IRQS;
-       }
-
-       if (val)
-               set_bit(irq, reg);
-       else
-               clear_bit(irq, reg);
-}
-
-static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
-{
-       return x->private + cpuid;
-}
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
-{
-       return x->shared;
-}
-
-static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs)
-{
-       int size;
-
-       size  = nr_cpus * VGIC_NR_PRIVATE_IRQS;
-       size += nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
-       x->private = kzalloc(size, GFP_KERNEL);
-       if (!x->private)
-               return -ENOMEM;
-
-       x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32);
-       return 0;
-}
-
-static void vgic_free_bytemap(struct vgic_bytemap *b)
-{
-       kfree(b->private);
-       b->private = NULL;
-       b->shared = NULL;
-}
-
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
-{
-       u32 *reg;
-
-       if (offset < VGIC_NR_PRIVATE_IRQS) {
-               reg = x->private;
-               offset += cpuid * VGIC_NR_PRIVATE_IRQS;
-       } else {
-               reg = x->shared;
-               offset -= VGIC_NR_PRIVATE_IRQS;
-       }
-
-       return reg + (offset / sizeof(u32));
-}
-
-#define VGIC_CFG_LEVEL 0
-#define VGIC_CFG_EDGE  1
-
-static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       int irq_val;
-
-       irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq);
-       return irq_val == VGIC_CFG_EDGE;
-}
-
-static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
-}
-
-static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
-       if (!vgic_dist_irq_get_level(vcpu, irq)) {
-               vgic_dist_irq_clear_pending(vcpu, irq);
-               if (!compute_pending_for_cpu(vcpu))
-                       clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-       }
-}
-
-static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq);
-}
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1);
-}
-
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
-{
-       if (irq < VGIC_NR_PRIVATE_IRQS)
-               set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
-       else
-               set_bit(irq - VGIC_NR_PRIVATE_IRQS,
-                       vcpu->arch.vgic_cpu.pending_shared);
-}
-
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
-{
-       if (irq < VGIC_NR_PRIVATE_IRQS)
-               clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
-       else
-               clear_bit(irq - VGIC_NR_PRIVATE_IRQS,
-                         vcpu->arch.vgic_cpu.pending_shared);
-}
-
-static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
-{
-       return !vgic_irq_is_queued(vcpu, irq);
-}
-
-/**
- * vgic_reg_access - access vgic register
- * @mmio:   pointer to the data describing the mmio access
- * @reg:    pointer to the virtual backing of vgic distributor data
- * @offset: least significant 2 bits used for word offset
- * @mode:   ACCESS_ mode (see defines above)
- *
- * Helper to make vgic register access easier using one of the access
- * modes defined for vgic register access
- * (read, raz, write-ignored, setbit, clearbit, write)
- */
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
-                    phys_addr_t offset, int mode)
-{
-       int word_offset = (offset & 3) * 8;
-       u32 mask = (1UL << (mmio->len * 8)) - 1;
-       u32 regval;
-
-       /*
-        * Any alignment fault should have been delivered to the guest
-        * directly (ARM ARM B3.12.7 "Prioritization of aborts").
-        */
-
-       if (reg) {
-               regval = *reg;
-       } else {
-               BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED));
-               regval = 0;
-       }
-
-       if (mmio->is_write) {
-               u32 data = mmio_data_read(mmio, mask) << word_offset;
-               switch (ACCESS_WRITE_MASK(mode)) {
-               case ACCESS_WRITE_IGNORED:
-                       return;
-
-               case ACCESS_WRITE_SETBIT:
-                       regval |= data;
-                       break;
-
-               case ACCESS_WRITE_CLEARBIT:
-                       regval &= ~data;
-                       break;
-
-               case ACCESS_WRITE_VALUE:
-                       regval = (regval & ~(mask << word_offset)) | data;
-                       break;
-               }
-               *reg = regval;
-       } else {
-               switch (ACCESS_READ_MASK(mode)) {
-               case ACCESS_READ_RAZ:
-                       regval = 0;
-                       /* fall through */
-
-               case ACCESS_READ_VALUE:
-                       mmio_data_write(mmio, mask, regval >> word_offset);
-               }
-       }
-}
-
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-                       phys_addr_t offset)
-{
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-                           phys_addr_t offset, int vcpu_id, int access)
-{
-       u32 *reg;
-       int mode = ACCESS_READ_VALUE | access;
-       struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id);
-
-       reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset, mode);
-       if (mmio->is_write) {
-               if (access & ACCESS_WRITE_CLEARBIT) {
-                       if (offset < 4) /* Force SGI enabled */
-                               *reg |= 0xffff;
-                       vgic_retire_disabled_irqs(target_vcpu);
-               }
-               vgic_update_state(kvm);
-               return true;
-       }
-
-       return false;
-}
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm,
-                                struct kvm_exit_mmio *mmio,
-                                phys_addr_t offset, int vcpu_id)
-{
-       u32 *reg, orig;
-       u32 level_mask;
-       int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset);
-       level_mask = (~(*reg));
-
-       /* Mark both level and edge triggered irqs as pending */
-       reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-       orig = *reg;
-       vgic_reg_access(mmio, reg, offset, mode);
-
-       if (mmio->is_write) {
-               /* Set the soft-pending flag only for level-triggered irqs */
-               reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
-                                         vcpu_id, offset);
-               vgic_reg_access(mmio, reg, offset, mode);
-               *reg &= level_mask;
-
-               /* Ignore writes to SGIs */
-               if (offset < 2) {
-                       *reg &= ~0xffff;
-                       *reg |= orig & 0xffff;
-               }
-
-               vgic_update_state(kvm);
-               return true;
-       }
-
-       return false;
-}
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm,
-                                  struct kvm_exit_mmio *mmio,
-                                  phys_addr_t offset, int vcpu_id)
-{
-       u32 *level_active;
-       u32 *reg, orig;
-       int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-       orig = *reg;
-       vgic_reg_access(mmio, reg, offset, mode);
-       if (mmio->is_write) {
-               /* Re-set level triggered level-active interrupts */
-               level_active = vgic_bitmap_get_reg(&dist->irq_level,
-                                         vcpu_id, offset);
-               reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-               *reg |= *level_active;
-
-               /* Ignore writes to SGIs */
-               if (offset < 2) {
-                       *reg &= ~0xffff;
-                       *reg |= orig & 0xffff;
-               }
-
-               /* Clear soft-pending flags */
-               reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
-                                         vcpu_id, offset);
-               vgic_reg_access(mmio, reg, offset, mode);
-
-               vgic_update_state(kvm);
-               return true;
-       }
-       return false;
-}
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
-                               struct kvm_exit_mmio *mmio,
-                               phys_addr_t offset, int vcpu_id)
-{
-       u32 *reg;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
-
-       if (mmio->is_write) {
-               vgic_update_state(kvm);
-               return true;
-       }
-
-       return false;
-}
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset, int vcpu_id)
-{
-       u32 *reg;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
-
-       if (mmio->is_write) {
-               vgic_update_state(kvm);
-               return true;
-       }
-
-       return false;
-}
-
-static u32 vgic_cfg_expand(u16 val)
-{
-       u32 res = 0;
-       int i;
-
-       /*
-        * Turn a 16bit value like abcd...mnop into a 32bit word
-        * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is.
-        */
-       for (i = 0; i < 16; i++)
-               res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1);
-
-       return res;
-}
-
-static u16 vgic_cfg_compress(u32 val)
-{
-       u16 res = 0;
-       int i;
-
-       /*
-        * Turn a 32bit word a0b0c0d0...m0n0o0p0 into a 16bit value like
-        * abcd...mnop which is what we really care about.
-        */
-       for (i = 0; i < 16; i++)
-               res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i;
-
-       return res;
-}
-
-/*
- * The distributor uses 2 bits per IRQ for the CFG register, but the
- * LSB is always 0. As such, we only keep the upper bit, and use the
- * two above functions to compress/expand the bits
- */
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
-                        phys_addr_t offset)
-{
-       u32 val;
-
-       if (offset & 4)
-               val = *reg >> 16;
-       else
-               val = *reg & 0xffff;
-
-       val = vgic_cfg_expand(val);
-       vgic_reg_access(mmio, &val, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       if (mmio->is_write) {
-               /* Ignore writes to read-only SGI and PPI bits */
-               if (offset < 8)
-                       return false;
-
-               val = vgic_cfg_compress(val);
-               if (offset & 4) {
-                       *reg &= 0xffff;
-                       *reg |= val << 16;
-               } else {
-                       *reg &= 0xffff << 16;
-                       *reg |= val;
-               }
-       }
-
-       return false;
-}
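
The expand/compress pair above can be exercised stand-alone; the sketch
below uses local copies of the two helpers (not the kernel functions
themselves) to check the encoding and the round trip:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t cfg_expand(uint16_t val)
    {
            uint32_t res = 0;

            for (int i = 0; i < 16; i++)            /* bit i -> bit 2*i+1 */
                    res |= ((uint32_t)((val >> i) & 1)) << (2 * i + 1);
            return res;
    }

    static uint16_t cfg_compress(uint32_t val)
    {
            uint16_t res = 0;

            for (int i = 0; i < 16; i++)            /* bit 2*i+1 -> bit i */
                    res |= (uint16_t)(((val >> (2 * i + 1)) & 1) << i);
            return res;
    }

    int main(void)
    {
            assert(cfg_expand(0x0003) == 0x0000000A);       /* bits 0,1 -> bits 1,3 */
            assert(cfg_compress(cfg_expand(0xABCD)) == 0xABCD);
            return 0;
    }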
-
-/**
- * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
- * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
- *
- * Move any IRQs that have already been assigned to LRs back to the
- * emulated distributor state so that the complete emulated state can be read
- * from the main emulation structures without investigating the LRs.
- */
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
-{
-       u64 elrsr = vgic_get_elrsr(vcpu);
-       unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-       int i;
-
-       for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) {
-               struct vgic_lr lr = vgic_get_lr(vcpu, i);
-
-               /*
-                * There are three options for the state bits:
-                *
-                * 01: pending
-                * 10: active
-                * 11: pending and active
-                */
-               BUG_ON(!(lr.state & LR_STATE_MASK));
-
-               /* Reestablish SGI source for pending and active IRQs */
-               if (lr.irq < VGIC_NR_SGIS)
-                       add_sgi_source(vcpu, lr.irq, lr.source);
-
-               /*
-                * If the LR holds an active (10) or a pending and active (11)
-                * interrupt then move the active state to the
-                * distributor tracking bit.
-                */
-               if (lr.state & LR_STATE_ACTIVE)
-                       vgic_irq_set_active(vcpu, lr.irq);
-
-               /*
-                * Reestablish the pending state on the distributor and the
-                * CPU interface and mark the LR as free for other use.
-                */
-               vgic_retire_lr(i, vcpu);
-
-               /* Finally update the VGIC state. */
-               vgic_update_state(vcpu->kvm);
-       }
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
-                                     int len, gpa_t offset)
-{
-       while (ranges->len) {
-               if (offset >= ranges->base &&
-                   (offset + len) <= (ranges->base + ranges->len))
-                       return ranges;
-               ranges++;
-       }
-
-       return NULL;
-}
-
-static bool vgic_validate_access(const struct vgic_dist *dist,
-                                const struct vgic_io_range *range,
-                                unsigned long offset)
-{
-       int irq;
-
-       if (!range->bits_per_irq)
-               return true;    /* Not an irq-based access */
-
-       irq = offset * 8 / range->bits_per_irq;
-       if (irq >= dist->nr_irqs)
-               return false;
-
-       return true;
-}
-
-/*
- * Call the respective handler function for the given range.
- * We split up any 64 bit accesses into two consecutive 32 bit
- * handler calls and merge the result afterwards.
- * We do this in a little endian fashion regardless of the host's
- * or guest's endianness, because the GIC is always LE and the rest of
- * the code (vgic_reg_access) also puts it in a LE fashion already.
- * At this point we have already identified the handle function, so
- * range points to that one entry and offset is relative to this.
- */
-static bool call_range_handler(struct kvm_vcpu *vcpu,
-                              struct kvm_exit_mmio *mmio,
-                              unsigned long offset,
-                              const struct vgic_io_range *range)
-{
-       struct kvm_exit_mmio mmio32;
-       bool ret;
-
-       if (likely(mmio->len <= 4))
-               return range->handle_mmio(vcpu, mmio, offset);
-
-       /*
-        * Any access bigger than 4 bytes (that we currently handle in KVM)
-        * is actually 8 bytes long, caused by a 64-bit access
-        */
-
-       mmio32.len = 4;
-       mmio32.is_write = mmio->is_write;
-       mmio32.private = mmio->private;
-
-       mmio32.phys_addr = mmio->phys_addr + 4;
-       mmio32.data = &((u32 *)mmio->data)[1];
-       ret = range->handle_mmio(vcpu, &mmio32, offset + 4);
-
-       mmio32.phys_addr = mmio->phys_addr;
-       mmio32.data = &((u32 *)mmio->data)[0];
-       ret |= range->handle_mmio(vcpu, &mmio32, offset);
-
-       return ret;
-}
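
The 64-bit split performed by call_range_handler() relies on the data
buffer being laid out little-endian (the GIC is always LE): the low 32-bit
word is handled at the original offset and the high word at offset + 4.
A hedged sketch of that layout, using an assumed helper le32_at() that is
not part of this file:

    #include <assert.h>
    #include <stdint.h>

    /* Read a 32-bit little-endian word out of an MMIO data buffer. */
    static uint32_t le32_at(const uint8_t *data, unsigned int off)
    {
            return (uint32_t)data[off] |
                   ((uint32_t)data[off + 1] << 8) |
                   ((uint32_t)data[off + 2] << 16) |
                   ((uint32_t)data[off + 3] << 24);
    }

    int main(void)
    {
            /* A guest 64-bit write of 0x1122334455667788 as seen on the bus. */
            uint8_t data[8] = { 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11 };

            assert(le32_at(data, 0) == 0x55667788);         /* handled at offset     */
            assert(le32_at(data, 4) == 0x11223344);         /* handled at offset + 4 */
            return 0;
    }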
-
-/**
- * vgic_handle_mmio_access - handle an in-kernel MMIO access
- * @vcpu:      pointer to the vcpu performing the access
- * @this:      pointer to the KVM IO device in charge
- * @addr:      guest physical address of the access
- * @len:       size of the access
- * @val:       pointer to the data region
- * @is_write:  read or write access
- *
- * This is called by the read/write KVM IO device wrappers below.
- * Returns 0 if the MMIO access could be performed, a negative error code
- * otherwise.
- */
-static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
-                                  struct kvm_io_device *this, gpa_t addr,
-                                  int len, void *val, bool is_write)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct vgic_io_device *iodev = container_of(this,
-                                                   struct vgic_io_device, dev);
-       const struct vgic_io_range *range;
-       struct kvm_exit_mmio mmio;
-       bool updated_state;
-       gpa_t offset;
-
-       offset = addr - iodev->addr;
-       range = vgic_find_range(iodev->reg_ranges, len, offset);
-       if (unlikely(!range || !range->handle_mmio)) {
-               pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len);
-               return -ENXIO;
-       }
-
-       mmio.phys_addr = addr;
-       mmio.len = len;
-       mmio.is_write = is_write;
-       mmio.data = val;
-       mmio.private = iodev->redist_vcpu;
-
-       spin_lock(&dist->lock);
-       offset -= range->base;
-       if (vgic_validate_access(dist, range, offset)) {
-               updated_state = call_range_handler(vcpu, &mmio, offset, range);
-       } else {
-               if (!is_write)
-                       memset(val, 0, len);
-               updated_state = false;
-       }
-       spin_unlock(&dist->lock);
-
-       if (updated_state)
-               vgic_kick_vcpus(vcpu->kvm);
-
-       return 0;
-}
-
-static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu,
-                                struct kvm_io_device *this,
-                                gpa_t addr, int len, void *val)
-{
-       return vgic_handle_mmio_access(vcpu, this, addr, len, val, false);
-}
-
-static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu,
-                                 struct kvm_io_device *this,
-                                 gpa_t addr, int len, const void *val)
-{
-       return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val,
-                                      true);
-}
-
-static struct kvm_io_device_ops vgic_io_ops = {
-       .read   = vgic_handle_mmio_read,
-       .write  = vgic_handle_mmio_write,
-};
-
-/**
- * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus
- * @kvm:            The VM structure pointer
- * @base:           The (guest) base address for the register frame
- * @len:            Length of the register frame window
- * @ranges:         Describing the handler functions for each register
- * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call
- * @iodev:          Points to memory to be passed on to the handler
- *
- * @iodev stores the parameters of this function so that the handler and the
- * dispatcher function can use them (since the KVM I/O bus framework lacks
- * an opaque parameter). Initialization is done in this function, but the
- * reference should be valid and unique for the whole VGIC lifetime.
- * If the register frame is not mapped for a specific VCPU, pass -1 to
- * @redist_vcpu_id.
- */
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
-                            const struct vgic_io_range *ranges,
-                            int redist_vcpu_id,
-                            struct vgic_io_device *iodev)
-{
-       struct kvm_vcpu *vcpu = NULL;
-       int ret;
-
-       if (redist_vcpu_id >= 0)
-               vcpu = kvm_get_vcpu(kvm, redist_vcpu_id);
-
-       iodev->addr             = base;
-       iodev->len              = len;
-       iodev->reg_ranges       = ranges;
-       iodev->redist_vcpu      = vcpu;
-
-       kvm_iodevice_init(&iodev->dev, &vgic_io_ops);
-
-       mutex_lock(&kvm->slots_lock);
-
-       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len,
-                                     &iodev->dev);
-       mutex_unlock(&kvm->slots_lock);
-
-       /* Mark the iodev as invalid if registration fails. */
-       if (ret)
-               iodev->dev.ops = NULL;
-
-       return ret;
-}
-
-static int vgic_nr_shared_irqs(struct vgic_dist *dist)
-{
-       return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-}
-
-static int compute_active_for_cpu(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long *active, *enabled, *act_percpu, *act_shared;
-       unsigned long active_private, active_shared;
-       int nr_shared = vgic_nr_shared_irqs(dist);
-       int vcpu_id;
-
-       vcpu_id = vcpu->vcpu_id;
-       act_percpu = vcpu->arch.vgic_cpu.active_percpu;
-       act_shared = vcpu->arch.vgic_cpu.active_shared;
-
-       active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id);
-       enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
-       bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS);
-
-       active = vgic_bitmap_get_shared_map(&dist->irq_active);
-       enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
-       bitmap_and(act_shared, active, enabled, nr_shared);
-       bitmap_and(act_shared, act_shared,
-                  vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
-                  nr_shared);
-
-       active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS);
-       active_shared = find_first_bit(act_shared, nr_shared);
-
-       return (active_private < VGIC_NR_PRIVATE_IRQS ||
-               active_shared < nr_shared);
-}
-
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
-       unsigned long pending_private, pending_shared;
-       int nr_shared = vgic_nr_shared_irqs(dist);
-       int vcpu_id;
-
-       vcpu_id = vcpu->vcpu_id;
-       pend_percpu = vcpu->arch.vgic_cpu.pending_percpu;
-       pend_shared = vcpu->arch.vgic_cpu.pending_shared;
-
-       if (!dist->enabled) {
-               bitmap_zero(pend_percpu, VGIC_NR_PRIVATE_IRQS);
-               bitmap_zero(pend_shared, nr_shared);
-               return 0;
-       }
-
-       pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
-       enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
-       bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
-
-       pending = vgic_bitmap_get_shared_map(&dist->irq_pending);
-       enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
-       bitmap_and(pend_shared, pending, enabled, nr_shared);
-       bitmap_and(pend_shared, pend_shared,
-                  vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
-                  nr_shared);
-
-       pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
-       pending_shared = find_first_bit(pend_shared, nr_shared);
-       return (pending_private < VGIC_NR_PRIVATE_IRQS ||
-               pending_shared < vgic_nr_shared_irqs(dist));
-}
-
-/*
- * Update the interrupt state and determine which CPUs have pending
- * or active interrupts. Must be called with distributor lock held.
- */
-void vgic_update_state(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int c;
-
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               if (compute_pending_for_cpu(vcpu))
-                       set_bit(c, dist->irq_pending_on_cpu);
-
-               if (compute_active_for_cpu(vcpu))
-                       set_bit(c, dist->irq_active_on_cpu);
-               else
-                       clear_bit(c, dist->irq_active_on_cpu);
-       }
-}
-
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-       return vgic_ops->get_lr(vcpu, lr);
-}
-
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
-                              struct vgic_lr vlr)
-{
-       vgic_ops->set_lr(vcpu, lr, vlr);
-}
-
-static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
-{
-       return vgic_ops->get_elrsr(vcpu);
-}
-
-static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu)
-{
-       return vgic_ops->get_eisr(vcpu);
-}
-
-static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu)
-{
-       vgic_ops->clear_eisr(vcpu);
-}
-
-static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu)
-{
-       return vgic_ops->get_interrupt_status(vcpu);
-}
-
-static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu)
-{
-       vgic_ops->enable_underflow(vcpu);
-}
-
-static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu)
-{
-       vgic_ops->disable_underflow(vcpu);
-}
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-       vgic_ops->get_vmcr(vcpu, vmcr);
-}
-
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-       vgic_ops->set_vmcr(vcpu, vmcr);
-}
-
-static inline void vgic_enable(struct kvm_vcpu *vcpu)
-{
-       vgic_ops->enable(vcpu);
-}
-
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
-{
-       struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
-
-       vgic_irq_clear_queued(vcpu, vlr.irq);
-
-       /*
-        * We must transfer the pending state back to the distributor before
-        * retiring the LR, otherwise we may lose edge-triggered interrupts.
-        */
-       if (vlr.state & LR_STATE_PENDING) {
-               vgic_dist_irq_set_pending(vcpu, vlr.irq);
-               vlr.hwirq = 0;
-       }
-
-       vlr.state = 0;
-       vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-static bool dist_active_irq(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
-}
-
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
-       int i;
-
-       for (i = 0; i < vgic->nr_lr; i++) {
-               struct vgic_lr vlr = vgic_get_lr(vcpu, i);
-
-               if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE)
-                       return true;
-       }
-
-       return vgic_irq_is_active(vcpu, virt_irq);
-}
-
-/*
- * An interrupt may have been disabled after being made pending on the
- * CPU interface (the classic case is a timer running while we're
- * rebooting the guest - the interrupt would kick as soon as the CPU
- * interface gets enabled, with deadly consequences).
- *
- * The solution is to examine already active LRs, and check the
- * interrupt is still enabled. If not, just retire it.
- */
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
-{
-       u64 elrsr = vgic_get_elrsr(vcpu);
-       unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-       int lr;
-
-       for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
-               struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-               if (!vgic_irq_is_enabled(vcpu, vlr.irq))
-                       vgic_retire_lr(lr, vcpu);
-       }
-}
-
-static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
-                                int lr_nr, struct vgic_lr vlr)
-{
-       if (vgic_irq_is_active(vcpu, irq)) {
-               vlr.state |= LR_STATE_ACTIVE;
-               kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
-               vgic_irq_clear_active(vcpu, irq);
-               vgic_update_state(vcpu->kvm);
-       } else {
-               WARN_ON(!vgic_dist_irq_is_pending(vcpu, irq));
-               vlr.state |= LR_STATE_PENDING;
-               kvm_debug("Set pending: 0x%x\n", vlr.state);
-       }
-
-       if (!vgic_irq_is_edge(vcpu, irq))
-               vlr.state |= LR_EOI_INT;
-
-       if (vlr.irq >= VGIC_NR_SGIS) {
-               struct irq_phys_map *map;
-               map = vgic_irq_map_search(vcpu, irq);
-
-               if (map) {
-                       vlr.hwirq = map->phys_irq;
-                       vlr.state |= LR_HW;
-                       vlr.state &= ~LR_EOI_INT;
-
-                       /*
-                        * Make sure we're not going to sample this
-                        * again, as a HW-backed interrupt cannot be
-                        * in the PENDING_ACTIVE stage.
-                        */
-                       vgic_irq_set_queued(vcpu, irq);
-               }
-       }
-
-       vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-/*
- * Queue an interrupt to a CPU virtual interface. Return true on success,
- * or false if it wasn't possible to queue it.
- * sgi_source must be zero for any non-SGI interrupts.
- */
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       u64 elrsr = vgic_get_elrsr(vcpu);
-       unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-       struct vgic_lr vlr;
-       int lr;
-
-       /* Sanitize the input... */
-       BUG_ON(sgi_source_id & ~7);
-       BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
-       BUG_ON(irq >= dist->nr_irqs);
-
-       kvm_debug("Queue IRQ%d\n", irq);
-
-       /* Do we have an active interrupt for the same CPUID? */
-       for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
-               vlr = vgic_get_lr(vcpu, lr);
-               if (vlr.irq == irq && vlr.source == sgi_source_id) {
-                       kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
-                       vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
-                       return true;
-               }
-       }
-
-       /* Try to use another LR for this interrupt */
-       lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
-       if (lr >= vgic->nr_lr)
-               return false;
-
-       kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
-
-       vlr.irq = irq;
-       vlr.source = sgi_source_id;
-       vlr.state = 0;
-       vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
-
-       return true;
-}
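
vgic_queue_irq() above follows a simple allocation policy: piggyback on an
LR that already holds the same irq/source pair, otherwise take the first
empty LR reported by the ELRSR bitmap, and fail if none is left. A minimal
sketch of that policy alone, with an assumed toy_lr structure standing in
for the real list registers:

    #include <stdio.h>

    struct toy_lr {
            int irq;
            int source;
            int in_use;
    };

    /* Pick an LR for (irq, source); returns -1 when every LR is busy. */
    static int toy_pick_lr(const struct toy_lr *lrs, int nr_lr, int irq, int source)
    {
            for (int i = 0; i < nr_lr; i++)         /* piggyback on a matching LR */
                    if (lrs[i].in_use && lrs[i].irq == irq && lrs[i].source == source)
                            return i;
            for (int i = 0; i < nr_lr; i++)         /* otherwise the first empty LR */
                    if (!lrs[i].in_use)
                            return i;
            return -1;                              /* overflow: caller retries later */
    }

    int main(void)
    {
            struct toy_lr lrs[4] = { { .irq = 27, .source = 0, .in_use = 1 } };

            printf("IRQ 27 again -> LR %d\n", toy_pick_lr(lrs, 4, 27, 0));      /* 0 */
            printf("IRQ 33       -> LR %d\n", toy_pick_lr(lrs, 4, 33, 0));      /* 1 */
            return 0;
    }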
-
-static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
-{
-       if (!vgic_can_sample_irq(vcpu, irq))
-               return true; /* level interrupt, already queued */
-
-       if (vgic_queue_irq(vcpu, 0, irq)) {
-               if (vgic_irq_is_edge(vcpu, irq)) {
-                       vgic_dist_irq_clear_pending(vcpu, irq);
-                       vgic_cpu_irq_clear(vcpu, irq);
-               } else {
-                       vgic_irq_set_queued(vcpu, irq);
-               }
-
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Fill the list registers with pending interrupts before running the
- * guest.
- */
-static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long *pa_percpu, *pa_shared;
-       int i, vcpu_id;
-       int overflow = 0;
-       int nr_shared = vgic_nr_shared_irqs(dist);
-
-       vcpu_id = vcpu->vcpu_id;
-
-       pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu;
-       pa_shared = vcpu->arch.vgic_cpu.pend_act_shared;
-
-       bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu,
-                 VGIC_NR_PRIVATE_IRQS);
-       bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared,
-                 nr_shared);
-       /*
-        * We may not have any pending interrupt, or the interrupts
-        * may have been serviced from another vcpu. In all cases,
-        * move along.
-        */
-       if (!kvm_vgic_vcpu_pending_irq(vcpu) && !dist_active_irq(vcpu))
-               goto epilog;
-
-       /* SGIs */
-       for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) {
-               if (!queue_sgi(vcpu, i))
-                       overflow = 1;
-       }
-
-       /* PPIs */
-       for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) {
-               if (!vgic_queue_hwirq(vcpu, i))
-                       overflow = 1;
-       }
-
-       /* SPIs */
-       for_each_set_bit(i, pa_shared, nr_shared) {
-               if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
-                       overflow = 1;
-       }
-
-epilog:
-       if (overflow) {
-               vgic_enable_underflow(vcpu);
-       } else {
-               vgic_disable_underflow(vcpu);
-               /*
-                * We're about to run this VCPU, and we've consumed
-                * everything the distributor had in store for
-                * us. Claim we don't have anything pending. We'll
-                * adjust that if needed while exiting.
-                */
-               clear_bit(vcpu_id, dist->irq_pending_on_cpu);
-       }
-}
-
-static int process_queued_irq(struct kvm_vcpu *vcpu,
-                                  int lr, struct vgic_lr vlr)
-{
-       int pending = 0;
-
-       /*
-        * If the IRQ was EOIed (called from vgic_process_maintenance) or it
-        * went from active to non-active (called from vgic_sync_hwirq), it was
-        * also ACKed and we therefore assume we can clear the soft pending
-        * state (should it have been set) for this interrupt.
-        *
-        * Note: if the IRQ soft pending state was set after the IRQ was
-        * acked, it actually shouldn't be cleared, but we have no way of
-        * knowing that unless we start trapping ACKs when the soft-pending
-        * state is set.
-        */
-       vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
-
-       /*
-        * Tell the gic to start sampling this interrupt again.
-        */
-       vgic_irq_clear_queued(vcpu, vlr.irq);
-
-       /* Any additional pending interrupt? */
-       if (vgic_irq_is_edge(vcpu, vlr.irq)) {
-               BUG_ON(!(vlr.state & LR_HW));
-               pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
-       } else {
-               if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
-                       vgic_cpu_irq_set(vcpu, vlr.irq);
-                       pending = 1;
-               } else {
-                       vgic_dist_irq_clear_pending(vcpu, vlr.irq);
-                       vgic_cpu_irq_clear(vcpu, vlr.irq);
-               }
-       }
-
-       /*
-        * Despite being EOIed, the LR may not have
-        * been marked as empty.
-        */
-       vlr.state = 0;
-       vlr.hwirq = 0;
-       vgic_set_lr(vcpu, lr, vlr);
-
-       return pending;
-}
-
-static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
-{
-       u32 status = vgic_get_interrupt_status(vcpu);
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct kvm *kvm = vcpu->kvm;
-       int level_pending = 0;
-
-       kvm_debug("STATUS = %08x\n", status);
-
-       if (status & INT_STATUS_EOI) {
-               /*
-                * Some level interrupts have been EOIed. Clear their
-                * active bit.
-                */
-               u64 eisr = vgic_get_eisr(vcpu);
-               unsigned long *eisr_ptr = u64_to_bitmask(&eisr);
-               int lr;
-
-               for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
-                       struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-                       WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
-                       WARN_ON(vlr.state & LR_STATE_MASK);
-
-                       /*
-                        * kvm_notify_acked_irq calls kvm_set_irq()
-                        * to reset the IRQ level, which grabs the dist->lock
-                        * so we call this before taking the dist->lock.
-                        */
-                       kvm_notify_acked_irq(kvm, 0,
-                                            vlr.irq - VGIC_NR_PRIVATE_IRQS);
-
-                       spin_lock(&dist->lock);
-                       level_pending |= process_queued_irq(vcpu, lr, vlr);
-                       spin_unlock(&dist->lock);
-               }
-       }
-
-       if (status & INT_STATUS_UNDERFLOW)
-               vgic_disable_underflow(vcpu);
-
-       /*
-        * In the next iterations of the vcpu loop, if we sync the vgic state
-        * after flushing it, but before entering the guest (this happens for
-        * pending signals and vmid rollovers), then make sure we don't pick
-        * up any old maintenance interrupts here.
-        */
-       vgic_clear_eisr(vcpu);
-
-       return level_pending;
-}
-
-/*
- * Save the physical active state, and reset it to inactive.
- *
- * Return true if there's a pending forwarded interrupt to queue.
- */
-static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       bool level_pending;
-
-       if (!(vlr.state & LR_HW))
-               return false;
-
-       if (vlr.state & LR_STATE_ACTIVE)
-               return false;
-
-       spin_lock(&dist->lock);
-       level_pending = process_queued_irq(vcpu, lr, vlr);
-       spin_unlock(&dist->lock);
-       return level_pending;
-}
-
-/* Sync back the VGIC state after a guest run */
-static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       u64 elrsr;
-       unsigned long *elrsr_ptr;
-       int lr, pending;
-       bool level_pending;
-
-       level_pending = vgic_process_maintenance(vcpu);
-
-       /* Deal with HW interrupts, and clear mappings for empty LRs */
-       for (lr = 0; lr < vgic->nr_lr; lr++) {
-               struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-               level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
-               BUG_ON(vlr.irq >= dist->nr_irqs);
-       }
-
-       /* Check if we still have something up our sleeve... */
-       elrsr = vgic_get_elrsr(vcpu);
-       elrsr_ptr = u64_to_bitmask(&elrsr);
-       pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
-       if (level_pending || pending < vgic->nr_lr)
-               set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return;
-
-       spin_lock(&dist->lock);
-       __kvm_vgic_flush_hwstate(vcpu);
-       spin_unlock(&dist->lock);
-}
-
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return;
-
-       __kvm_vgic_sync_hwstate(vcpu);
-}
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return 0;
-
-       return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void vgic_kick_vcpus(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       int c;
-
-       /*
-        * We've injected an interrupt, time to find out who deserves
-        * a good kick...
-        */
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               if (kvm_vgic_vcpu_pending_irq(vcpu))
-                       kvm_vcpu_kick(vcpu);
-       }
-}
-
-static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
-{
-       int edge_triggered = vgic_irq_is_edge(vcpu, irq);
-
-       /*
-        * Only inject an interrupt if:
-        * - edge triggered and we have a rising edge
-        * - level triggered and we change level
-        */
-       if (edge_triggered) {
-               int state = vgic_dist_irq_is_pending(vcpu, irq);
-               return level > state;
-       } else {
-               int state = vgic_dist_irq_get_level(vcpu, irq);
-               return level != state;
-       }
-}
-
-static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-                                  unsigned int irq_num, bool level)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int edge_triggered, level_triggered;
-       int enabled;
-       bool ret = true, can_inject = true;
-
-       trace_vgic_update_irq_pending(cpuid, irq_num, level);
-
-       if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
-               return -EINVAL;
-
-       spin_lock(&dist->lock);
-
-       vcpu = kvm_get_vcpu(kvm, cpuid);
-       edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
-       level_triggered = !edge_triggered;
-
-       if (!vgic_validate_injection(vcpu, irq_num, level)) {
-               ret = false;
-               goto out;
-       }
-
-       if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
-               cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
-               if (cpuid == VCPU_NOT_ALLOCATED) {
-                       /* Pretend we use CPU0, and prevent injection */
-                       cpuid = 0;
-                       can_inject = false;
-               }
-               vcpu = kvm_get_vcpu(kvm, cpuid);
-       }
-
-       kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid);
-
-       if (level) {
-               if (level_triggered)
-                       vgic_dist_irq_set_level(vcpu, irq_num);
-               vgic_dist_irq_set_pending(vcpu, irq_num);
-       } else {
-               if (level_triggered) {
-                       vgic_dist_irq_clear_level(vcpu, irq_num);
-                       if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) {
-                               vgic_dist_irq_clear_pending(vcpu, irq_num);
-                               vgic_cpu_irq_clear(vcpu, irq_num);
-                               if (!compute_pending_for_cpu(vcpu))
-                                       clear_bit(cpuid, dist->irq_pending_on_cpu);
-                       }
-               }
-
-               ret = false;
-               goto out;
-       }
-
-       enabled = vgic_irq_is_enabled(vcpu, irq_num);
-
-       if (!enabled || !can_inject) {
-               ret = false;
-               goto out;
-       }
-
-       if (!vgic_can_sample_irq(vcpu, irq_num)) {
-               /*
-                * Level interrupt in progress, will be picked up
-                * when EOId.
-                */
-               ret = false;
-               goto out;
-       }
-
-       if (level) {
-               vgic_cpu_irq_set(vcpu, irq_num);
-               set_bit(cpuid, dist->irq_pending_on_cpu);
-       }
-
-out:
-       spin_unlock(&dist->lock);
-
-       if (ret) {
-               /* kick the specified vcpu */
-               kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
-       }
-
-       return 0;
-}
-
-static int vgic_lazy_init(struct kvm *kvm)
-{
-       int ret = 0;
-
-       if (unlikely(!vgic_initialized(kvm))) {
-               /*
-                * We only provide the automatic initialization of the VGIC
-                * for the legacy case of a GICv2. Any other type must
-                * be explicitly initialized once setup with the respective
-                * KVM device call.
-                */
-               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
-                       return -EBUSY;
-
-               mutex_lock(&kvm->lock);
-               ret = vgic_init(kvm);
-               mutex_unlock(&kvm->lock);
-       }
-
-       return ret;
-}
-
-/**
- * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device. This IRQ
- *           must not be mapped to a HW interrupt.
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *                           false: to ignore the call
- *          Level-sensitive  true:  raise the input signal
- *                           false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
-                       bool level)
-{
-       struct irq_phys_map *map;
-       int ret;
-
-       ret = vgic_lazy_init(kvm);
-       if (ret)
-               return ret;
-
-       map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
-       if (map)
-               return -EINVAL;
-
-       return vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-}
-
-/**
- * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @virt_irq: The virtual IRQ to be injected
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *                           false: to ignore the call
- *          Level-sensitive  true:  raise the input signal
- *                           false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-                              unsigned int virt_irq, bool level)
-{
-       int ret;
-
-       ret = vgic_lazy_init(kvm);
-       if (ret)
-               return ret;
-
-       return vgic_update_irq_pending(kvm, cpuid, virt_irq, level);
-}
-
-static irqreturn_t vgic_maintenance_handler(int irq, void *data)
-{
-       /*
-        * We cannot rely on the vgic maintenance interrupt to be
-        * delivered synchronously. This means we can only use it to
-        * exit the VM, and we perform the handling of EOIed
-        * interrupts on the exit path (see vgic_process_maintenance).
-        */
-       return IRQ_HANDLED;
-}
-
-static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
-                                                   int virt_irq)
-{
-       if (virt_irq < VGIC_NR_PRIVATE_IRQS)
-               return &vcpu->arch.vgic_cpu.irq_phys_map_list;
-       else
-               return &vcpu->kvm->arch.vgic.irq_phys_map_list;
-}
-
-/**
- * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number for the guest
- * @phys_irq: The hardware IRQ number of the host
- *
- * Establish a mapping between a guest visible irq (@virt_irq) and a
- * hardware irq (@phys_irq). On injection, @virt_irq will be associated with
- * the physical interrupt represented by @phys_irq. This mapping can be
- * established multiple times as long as the parameters are the same.
- *
- * Returns 0 on success or an error value otherwise.
- */
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-       struct irq_phys_map *map;
-       struct irq_phys_map_entry *entry;
-       int ret = 0;
-
-       /* Create a new mapping */
-       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-       if (!entry)
-               return -ENOMEM;
-
-       spin_lock(&dist->irq_phys_map_lock);
-
-       /* Try to match an existing mapping */
-       map = vgic_irq_map_search(vcpu, virt_irq);
-       if (map) {
-               /* Make sure this mapping matches */
-               if (map->phys_irq != phys_irq)
-                       ret = -EINVAL;
-
-               /* Found an existing, valid mapping */
-               goto out;
-       }
-
-       map           = &entry->map;
-       map->virt_irq = virt_irq;
-       map->phys_irq = phys_irq;
-
-       list_add_tail_rcu(&entry->entry, root);
-
-out:
-       spin_unlock(&dist->irq_phys_map_lock);
-       /* If we've found a hit in the existing list, free the useless
-        * entry */
-       if (ret || map != &entry->map)
-               kfree(entry);
-       return ret;
-}
-
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
-                                               int virt_irq)
-{
-       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-       struct irq_phys_map_entry *entry;
-       struct irq_phys_map *map;
-
-       rcu_read_lock();
-
-       list_for_each_entry_rcu(entry, root, entry) {
-               map = &entry->map;
-               if (map->virt_irq == virt_irq) {
-                       rcu_read_unlock();
-                       return map;
-               }
-       }
-
-       rcu_read_unlock();
-
-       return NULL;
-}
-
-static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
-{
-       struct irq_phys_map_entry *entry;
-
-       entry = container_of(rcu, struct irq_phys_map_entry, rcu);
-       kfree(entry);
-}
-
-/**
- * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number to be unmapped
- *
- * Remove an existing mapping between virtual and physical interrupts.
- */
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct irq_phys_map_entry *entry;
-       struct list_head *root;
-
-       root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-
-       spin_lock(&dist->irq_phys_map_lock);
-
-       list_for_each_entry(entry, root, entry) {
-               if (entry->map.virt_irq == virt_irq) {
-                       list_del_rcu(&entry->entry);
-                       call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
-                       break;
-               }
-       }
-
-       spin_unlock(&dist->irq_phys_map_lock);
-
-       return 0;
-}
-
-static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct irq_phys_map_entry *entry;
-
-       spin_lock(&dist->irq_phys_map_lock);
-
-       list_for_each_entry(entry, root, entry) {
-               list_del_rcu(&entry->entry);
-               call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
-       }
-
-       spin_unlock(&dist->irq_phys_map_lock);
-}
-
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-       kfree(vgic_cpu->pending_shared);
-       kfree(vgic_cpu->active_shared);
-       kfree(vgic_cpu->pend_act_shared);
-       vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
-       vgic_cpu->pending_shared = NULL;
-       vgic_cpu->active_shared = NULL;
-       vgic_cpu->pend_act_shared = NULL;
-}
-
-static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       int nr_longs = BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
-       int sz = nr_longs * sizeof(unsigned long);
-       vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
-       vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
-       vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
-
-       if (!vgic_cpu->pending_shared
-               || !vgic_cpu->active_shared
-               || !vgic_cpu->pend_act_shared) {
-               kvm_vgic_vcpu_destroy(vcpu);
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-/**
- * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
-}
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum number of VCPUs a guest
- * can use.
- */
-int kvm_vgic_get_max_vcpus(void)
-{
-       return vgic->max_gic_vcpus;
-}
-
-void kvm_vgic_destroy(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               kvm_vgic_vcpu_destroy(vcpu);
-
-       vgic_free_bitmap(&dist->irq_enabled);
-       vgic_free_bitmap(&dist->irq_level);
-       vgic_free_bitmap(&dist->irq_pending);
-       vgic_free_bitmap(&dist->irq_soft_pend);
-       vgic_free_bitmap(&dist->irq_queued);
-       vgic_free_bitmap(&dist->irq_cfg);
-       vgic_free_bytemap(&dist->irq_priority);
-       if (dist->irq_spi_target) {
-               for (i = 0; i < dist->nr_cpus; i++)
-                       vgic_free_bitmap(&dist->irq_spi_target[i]);
-       }
-       kfree(dist->irq_sgi_sources);
-       kfree(dist->irq_spi_cpu);
-       kfree(dist->irq_spi_mpidr);
-       kfree(dist->irq_spi_target);
-       kfree(dist->irq_pending_on_cpu);
-       kfree(dist->irq_active_on_cpu);
-       vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
-       dist->irq_sgi_sources = NULL;
-       dist->irq_spi_cpu = NULL;
-       dist->irq_spi_target = NULL;
-       dist->irq_pending_on_cpu = NULL;
-       dist->irq_active_on_cpu = NULL;
-       dist->nr_cpus = 0;
-}
-
-/*
- * Allocate and initialize the various data structures. Must be called
- * with kvm->lock held!
- */
-int vgic_init(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int nr_cpus, nr_irqs;
-       int ret, i, vcpu_id;
-
-       if (vgic_initialized(kvm))
-               return 0;
-
-       nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
-       if (!nr_cpus)           /* No vcpus? Can't be good... */
-               return -ENODEV;
-
-       /*
-        * If nobody configured the number of interrupts, use the
-        * legacy one.
-        */
-       if (!dist->nr_irqs)
-               dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
-
-       nr_irqs = dist->nr_irqs;
-
-       ret  = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs);
-       ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs);
-
-       if (ret)
-               goto out;
-
-       dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL);
-       dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL);
-       dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus,
-                                      GFP_KERNEL);
-       dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
-                                          GFP_KERNEL);
-       dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
-                                          GFP_KERNEL);
-       if (!dist->irq_sgi_sources ||
-           !dist->irq_spi_cpu ||
-           !dist->irq_spi_target ||
-           !dist->irq_pending_on_cpu ||
-           !dist->irq_active_on_cpu) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       for (i = 0; i < nr_cpus; i++)
-               ret |= vgic_init_bitmap(&dist->irq_spi_target[i],
-                                       nr_cpus, nr_irqs);
-
-       if (ret)
-               goto out;
-
-       ret = kvm->arch.vgic.vm_ops.init_model(kvm);
-       if (ret)
-               goto out;
-
-       kvm_for_each_vcpu(vcpu_id, vcpu, kvm) {
-               ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
-               if (ret) {
-                       kvm_err("VGIC: Failed to allocate vcpu memory\n");
-                       break;
-               }
-
-               /*
-                * Enable and configure all SGIs to be edge-triggered and
-                * configure all PPIs as level-triggered.
-                */
-               for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-                       if (i < VGIC_NR_SGIS) {
-                               /* SGIs */
-                               vgic_bitmap_set_irq_val(&dist->irq_enabled,
-                                                       vcpu->vcpu_id, i, 1);
-                               vgic_bitmap_set_irq_val(&dist->irq_cfg,
-                                                       vcpu->vcpu_id, i,
-                                                       VGIC_CFG_EDGE);
-                       } else if (i < VGIC_NR_PRIVATE_IRQS) {
-                               /* PPIs */
-                               vgic_bitmap_set_irq_val(&dist->irq_cfg,
-                                                       vcpu->vcpu_id, i,
-                                                       VGIC_CFG_LEVEL);
-                       }
-               }
-
-               vgic_enable(vcpu);
-       }
-
-out:
-       if (ret)
-               kvm_vgic_destroy(kvm);
-
-       return ret;
-}
-
-static int init_vgic_model(struct kvm *kvm, int type)
-{
-       switch (type) {
-       case KVM_DEV_TYPE_ARM_VGIC_V2:
-               vgic_v2_init_emulation(kvm);
-               break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-       case KVM_DEV_TYPE_ARM_VGIC_V3:
-               vgic_v3_init_emulation(kvm);
-               break;
-#endif
-       default:
-               return -ENODEV;
-       }
-
-       if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus)
-               return -E2BIG;
-
-       return 0;
-}
-
-/**
- * kvm_vgic_early_init - Earliest possible vgic initialization stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_early_init(struct kvm *kvm)
-{
-       spin_lock_init(&kvm->arch.vgic.lock);
-       spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
-       INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
-}
-
-int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
-       int i, vcpu_lock_idx = -1, ret;
-       struct kvm_vcpu *vcpu;
-
-       mutex_lock(&kvm->lock);
-
-       if (irqchip_in_kernel(kvm)) {
-               ret = -EEXIST;
-               goto out;
-       }
-
-       /*
-        * This function is also called by the KVM_CREATE_IRQCHIP handler,
-        * which had no chance yet to check the availability of the GICv2
-        * emulation. So check this here again. KVM_CREATE_DEVICE does
-        * the proper checks already.
-        */
-       if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) {
-               ret = -ENODEV;
-               goto out;
-       }
-
-       /*
-        * Any time a vcpu is run, vcpu_load is called which tries to grab the
-        * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
-        * that no other VCPUs are run while we create the vgic.
-        */
-       ret = -EBUSY;
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!mutex_trylock(&vcpu->mutex))
-                       goto out_unlock;
-               vcpu_lock_idx = i;
-       }
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (vcpu->arch.has_run_once)
-                       goto out_unlock;
-       }
-       ret = 0;
-
-       ret = init_vgic_model(kvm, type);
-       if (ret)
-               goto out_unlock;
-
-       kvm->arch.vgic.in_kernel = true;
-       kvm->arch.vgic.vgic_model = type;
-       kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
-       kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
-       kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
-       kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
-
-out_unlock:
-       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-               vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
-               mutex_unlock(&vcpu->mutex);
-       }
-
-out:
-       mutex_unlock(&kvm->lock);
-       return ret;
-}
-
-static int vgic_ioaddr_overlap(struct kvm *kvm)
-{
-       phys_addr_t dist = kvm->arch.vgic.vgic_dist_base;
-       phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base;
-
-       if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu))
-               return 0;
-       if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) ||
-           (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist))
-               return -EBUSY;
-       return 0;
-}
-
-static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
-                             phys_addr_t addr, phys_addr_t size)
-{
-       int ret;
-
-       if (addr & ~KVM_PHYS_MASK)
-               return -E2BIG;
-
-       if (addr & (SZ_4K - 1))
-               return -EINVAL;
-
-       if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
-               return -EEXIST;
-       if (addr + size < addr)
-               return -EINVAL;
-
-       *ioaddr = addr;
-       ret = vgic_ioaddr_overlap(kvm);
-       if (ret)
-               *ioaddr = VGIC_ADDR_UNDEF;
-
-       return ret;
-}
-
-/**
- * kvm_vgic_addr - set or get vgic VM base addresses
- * @kvm:   pointer to the vm struct
- * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr:  pointer to address value
- * @write: if true set the address in the VM address space, if false read the
- *          address
- *
- * Set or get the vgic base addresses for the distributor and the virtual CPU
- * interface in the VM physical address space.  These addresses are properties
- * of the emulated core/SoC and therefore user space initially knows this
- * information.
- */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
-       int r = 0;
-       struct vgic_dist *vgic = &kvm->arch.vgic;
-       int type_needed;
-       phys_addr_t *addr_ptr, block_size;
-       phys_addr_t alignment;
-
-       mutex_lock(&kvm->lock);
-       switch (type) {
-       case KVM_VGIC_V2_ADDR_TYPE_DIST:
-               type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
-               addr_ptr = &vgic->vgic_dist_base;
-               block_size = KVM_VGIC_V2_DIST_SIZE;
-               alignment = SZ_4K;
-               break;
-       case KVM_VGIC_V2_ADDR_TYPE_CPU:
-               type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
-               addr_ptr = &vgic->vgic_cpu_base;
-               block_size = KVM_VGIC_V2_CPU_SIZE;
-               alignment = SZ_4K;
-               break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-       case KVM_VGIC_V3_ADDR_TYPE_DIST:
-               type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
-               addr_ptr = &vgic->vgic_dist_base;
-               block_size = KVM_VGIC_V3_DIST_SIZE;
-               alignment = SZ_64K;
-               break;
-       case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-               type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
-               addr_ptr = &vgic->vgic_redist_base;
-               block_size = KVM_VGIC_V3_REDIST_SIZE;
-               alignment = SZ_64K;
-               break;
-#endif
-       default:
-               r = -ENODEV;
-               goto out;
-       }
-
-       if (vgic->vgic_model != type_needed) {
-               r = -ENODEV;
-               goto out;
-       }
-
-       if (write) {
-               if (!IS_ALIGNED(*addr, alignment))
-                       r = -EINVAL;
-               else
-                       r = vgic_ioaddr_assign(kvm, addr_ptr, *addr,
-                                              block_size);
-       } else {
-               *addr = *addr_ptr;
-       }
-
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-       int r;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 addr;
-               unsigned long type = (unsigned long)attr->attr;
-
-               if (copy_from_user(&addr, uaddr, sizeof(addr)))
-                       return -EFAULT;
-
-               r = kvm_vgic_addr(dev->kvm, type, &addr, true);
-               return (r == -ENODEV) ? -ENXIO : r;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 val;
-               int ret = 0;
-
-               if (get_user(val, uaddr))
-                       return -EFAULT;
-
-               /*
-                * We require:
-                * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
-                * - at most 1024 interrupts
-                * - a multiple of 32 interrupts
-                */
-               if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
-                   val > VGIC_MAX_IRQS ||
-                   (val & 31))
-                       return -EINVAL;
-
-               mutex_lock(&dev->kvm->lock);
-
-               if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
-                       ret = -EBUSY;
-               else
-                       dev->kvm->arch.vgic.nr_irqs = val;
-
-               mutex_unlock(&dev->kvm->lock);
-
-               return ret;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       r = vgic_init(dev->kvm);
-                       return r;
-               }
-               break;
-       }
-       }
-
-       return -ENXIO;
-}
-
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-       int r = -ENXIO;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 addr;
-               unsigned long type = (unsigned long)attr->attr;
-
-               r = kvm_vgic_addr(dev->kvm, type, &addr, false);
-               if (r)
-                       return (r == -ENODEV) ? -ENXIO : r;
-
-               if (copy_to_user(uaddr, &addr, sizeof(addr)))
-                       return -EFAULT;
-               break;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
-               r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
-               break;
-       }
-
-       }
-
-       return r;
-}
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset)
-{
-       if (vgic_find_range(ranges, 4, offset))
-               return 0;
-       else
-               return -ENXIO;
-}
-
-static int vgic_starting_cpu(unsigned int cpu)
-{
-       enable_percpu_irq(vgic->maint_irq, 0);
-       return 0;
-}
-
-static int vgic_dying_cpu(unsigned int cpu)
-{
-       disable_percpu_irq(vgic->maint_irq);
-       return 0;
-}
-
-static int kvm_vgic_probe(void)
-{
-       const struct gic_kvm_info *gic_kvm_info;
-       int ret;
-
-       gic_kvm_info = gic_get_kvm_info();
-       if (!gic_kvm_info)
-               return -ENODEV;
-
-       switch (gic_kvm_info->type) {
-       case GIC_V2:
-               ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic);
-               break;
-       case GIC_V3:
-               ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic);
-               break;
-       default:
-               ret = -ENODEV;
-       }
-
-       return ret;
-}
-
-int kvm_vgic_hyp_init(void)
-{
-       int ret;
-
-       ret = kvm_vgic_probe();
-       if (ret) {
-               kvm_err("error: KVM vGIC probing failed\n");
-               return ret;
-       }
-
-       ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler,
-                                "vgic", kvm_get_running_vcpus());
-       if (ret) {
-               kvm_err("Cannot register interrupt %d\n", vgic->maint_irq);
-               return ret;
-       }
-
-       cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_STARTING,
-                         "AP_KVM_ARM_VGIC_STARTING", vgic_starting_cpu,
-                         vgic_dying_cpu);
-       return 0;
-}
-
-int kvm_irq_map_gsi(struct kvm *kvm,
-                   struct kvm_kernel_irq_routing_entry *entries,
-                   int gsi)
-{
-       return 0;
-}
-
-int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
-       return pin;
-}
-
-int kvm_set_irq(struct kvm *kvm, int irq_source_id,
-               u32 irq, int level, bool line_status)
-{
-       unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS;
-
-       trace_kvm_set_irq(irq, level, irq_source_id);
-
-       BUG_ON(!vgic_initialized(kvm));
-
-       return kvm_vgic_inject_irq(kvm, 0, spi, level);
-}
-
-/* MSI not implemented yet */
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-               struct kvm *kvm, int irq_source_id,
-               int level, bool line_status)
-{
-       return 0;
-}
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
deleted file mode 100644
index 0df74cb..0000000
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2012-2014 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * Derived from virt/kvm/arm/vgic.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __KVM_VGIC_H__
-#define __KVM_VGIC_H__
-
-#include <kvm/iodev.h>
-
-#define VGIC_ADDR_UNDEF                (-1)
-#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
-
-#define PRODUCT_ID_KVM         0x4b    /* ASCII code K */
-#define IMPLEMENTER_ARM                0x43b
-
-#define ACCESS_READ_VALUE      (1 << 0)
-#define ACCESS_READ_RAZ                (0 << 0)
-#define ACCESS_READ_MASK(x)    ((x) & (1 << 0))
-#define ACCESS_WRITE_IGNORED   (0 << 1)
-#define ACCESS_WRITE_SETBIT    (1 << 1)
-#define ACCESS_WRITE_CLEARBIT  (2 << 1)
-#define ACCESS_WRITE_VALUE     (3 << 1)
-#define ACCESS_WRITE_MASK(x)   ((x) & (3 << 1))
-
-#define VCPU_NOT_ALLOCATED     ((u8)-1)
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
-
-void vgic_update_state(struct kvm *kvm);
-int vgic_init_common_maps(struct kvm *kvm);
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset);
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset);
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq);
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
-                            int irq, int val);
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq);
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu);
-
-struct kvm_exit_mmio {
-       phys_addr_t     phys_addr;
-       void            *data;
-       u32             len;
-       bool            is_write;
-       void            *private;
-};
-
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
-                    phys_addr_t offset, int mode);
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-                       phys_addr_t offset);
-
-static inline
-u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
-{
-       return le32_to_cpu(*((u32 *)mmio->data)) & mask;
-}
-
-static inline
-void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
-{
-       *((u32 *)mmio->data) = cpu_to_le32(value) & mask;
-}
-
-struct vgic_io_range {
-       phys_addr_t base;
-       unsigned long len;
-       int bits_per_irq;
-       bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-                           phys_addr_t offset);
-};
-
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
-                            const struct vgic_io_range *ranges,
-                            int redist_id,
-                            struct vgic_io_device *iodev);
-
-static inline bool is_in_range(phys_addr_t addr, unsigned long len,
-                              phys_addr_t baseaddr, unsigned long size)
-{
-       return (addr >= baseaddr) && (addr + len <= baseaddr + size);
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
-                                     int len, gpa_t offset);
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-                           phys_addr_t offset, int vcpu_id, int access);
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-                                phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-                                  phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
-                               struct kvm_exit_mmio *mmio,
-                               phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
-                        phys_addr_t offset);
-
-void vgic_kick_vcpus(struct kvm *kvm);
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset);
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-
-int vgic_init(struct kvm *kvm);
-void vgic_v2_init_emulation(struct kvm *kvm);
-void vgic_v3_init_emulation(struct kvm *kvm);
-
-#endif
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 2c7f0d5..1e30ce0 100644
@@ -157,6 +157,9 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
        struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
        int i;
 
+       INIT_LIST_HEAD(&dist->lpi_list_head);
+       spin_lock_init(&dist->lpi_list_lock);
+
        dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
        if (!dist->spis)
                return  -ENOMEM;
@@ -177,6 +180,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
                spin_lock_init(&irq->irq_lock);
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu0;
+               kref_init(&irq->refcount);
                if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
                        irq->targets = 0;
                else
@@ -211,6 +215,7 @@ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu;
                irq->targets = 1U << vcpu->vcpu_id;
+               kref_init(&irq->refcount);
                if (vgic_irq_is_sgi(i)) {
                        /* SGIs */
                        irq->enabled = 1;
@@ -253,6 +258,9 @@ int vgic_init(struct kvm *kvm)
        if (ret)
                goto out;
 
+       if (vgic_has_its(kvm))
+               dist->msis_require_devid = true;
+
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vgic_vcpu_init(vcpu);
 
@@ -271,7 +279,6 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
        dist->initialized = false;
 
        kfree(dist->spis);
-       kfree(dist->redist_iodevs);
        dist->nr_spis = 0;
 
        mutex_unlock(&kvm->lock);
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
new file mode 100644
index 0000000..07411cf
--- /dev/null
@@ -0,0 +1,1500 @@
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015,2016 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * Creates a new (reference to a) struct vgic_irq for a given LPI.
+ * If this LPI is already mapped on another ITS, we increase its refcount
+ * and return a pointer to the existing structure.
+ * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
+ * This function returns a pointer to the _unlocked_ structure.
+ */
+static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+
+       /* In this case there is no put, since we keep the reference. */
+       if (irq)
+               return irq;
+
+       irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
+       if (!irq)
+               return NULL;
+
+       INIT_LIST_HEAD(&irq->lpi_list);
+       INIT_LIST_HEAD(&irq->ap_list);
+       spin_lock_init(&irq->irq_lock);
+
+       irq->config = VGIC_CONFIG_EDGE;
+       kref_init(&irq->refcount);
+       irq->intid = intid;
+
+       spin_lock(&dist->lpi_list_lock);
+
+       /*
+        * There could be a race with another vgic_add_lpi(), so we need to
+        * check that we don't add a second list entry with the same LPI.
+        */
+       list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
+               if (oldirq->intid != intid)
+                       continue;
+
+               /* Someone was faster with adding this LPI, let's use that. */
+               kfree(irq);
+               irq = oldirq;
+
+               /*
+                * This increases the refcount, the caller is expected to
+                * call vgic_put_irq() on the returned pointer once it's
+                * finished with the IRQ.
+                */
+               vgic_get_irq_kref(irq);
+
+               goto out_unlock;
+       }
+
+       list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
+       dist->lpi_list_count++;
+
+out_unlock:
+       spin_unlock(&dist->lpi_list_lock);
+
+       return irq;
+}
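
/*
 * Illustrative usage sketch (editorial; a minimal outline, not part of this
 * patch): the reference taken by vgic_add_lpi() above is dropped with
 * vgic_put_irq() once the caller is done with the interrupt, as
 * its_free_itte() does further down. A hypothetical caller would look like:
 *
 *	irq = vgic_add_lpi(kvm, intid);
 *	if (!irq)
 *		return -ENOMEM;
 *	...program the LPI...
 *	vgic_put_irq(kvm, irq);
 */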
+
+struct its_device {
+       struct list_head dev_list;
+
+       /* the head for the list of ITTEs */
+       struct list_head itt_head;
+       u32 device_id;
+};
+
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+struct its_collection {
+       struct list_head coll_list;
+
+       u32 collection_id;
+       u32 target_addr;
+};
+
+#define its_is_collection_mapped(coll) ((coll) && \
+                               ((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+struct its_itte {
+       struct list_head itte_list;
+
+       struct vgic_irq *irq;
+       struct its_collection *collection;
+       u32 lpi;
+       u32 event_id;
+};
+
+/*
+ * Finds and returns a device in the device table for an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
+{
+       struct its_device *device;
+
+       list_for_each_entry(device, &its->device_list, dev_list)
+               if (device_id == device->device_id)
+                       return device;
+
+       return NULL;
+}
+
+/*
+ * Finds and returns an interrupt translation table entry (ITTE) for a given
+ * Device ID/Event ID pair on an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_itte *find_itte(struct vgic_its *its, u32 device_id,
+                                 u32 event_id)
+{
+       struct its_device *device;
+       struct its_itte *itte;
+
+       device = find_its_device(its, device_id);
+       if (device == NULL)
+               return NULL;
+
+       list_for_each_entry(itte, &device->itt_head, itte_list)
+               if (itte->event_id == event_id)
+                       return itte;
+
+       return NULL;
+}
+
+/* To be usable as an iterator, this macro omits the enclosing parentheses */
+#define for_each_lpi_its(dev, itte, its) \
+       list_for_each_entry(dev, &(its)->device_list, dev_list) \
+               list_for_each_entry(itte, &(dev)->itt_head, itte_list)
+
+/*
+ * We only implement 48 bits of PA at the moment, although the ITS
+ * supports more. Let's be restrictive here.
+ */
+#define BASER_ADDRESS(x)       ((x) & GENMASK_ULL(47, 16))
+#define CBASER_ADDRESS(x)      ((x) & GENMASK_ULL(47, 12))
+#define PENDBASER_ADDRESS(x)   ((x) & GENMASK_ULL(47, 16))
+#define PROPBASER_ADDRESS(x)   ((x) & GENMASK_ULL(47, 12))
+
+#define GIC_LPI_OFFSET 8192
+
+/*
+ * Finds and returns a collection in the ITS collection table.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
+{
+       struct its_collection *collection;
+
+       list_for_each_entry(collection, &its->collection_list, coll_list) {
+               if (coll_id == collection->collection_id)
+                       return collection;
+       }
+
+       return NULL;
+}
+
+#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p)   ((p) & 0xfc)
+
+/*
+ * Reads the configuration data for a given LPI from guest memory and
+ * updates the fields in struct vgic_irq.
+ * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
+ * VCPU. Unconditionally applies if filter_vcpu is NULL.
+ */
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+                            struct kvm_vcpu *filter_vcpu)
+{
+       u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
+       u8 prop;
+       int ret;
+
+       ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
+                            &prop, 1);
+
+       if (ret)
+               return ret;
+
+       spin_lock(&irq->irq_lock);
+
+       if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
+               irq->priority = LPI_PROP_PRIORITY(prop);
+               irq->enabled = LPI_PROP_ENABLE_BIT(prop);
+
+               vgic_queue_irq_unlock(kvm, irq);
+       } else {
+               spin_unlock(&irq->irq_lock);
+       }
+
+       return 0;
+}
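
/*
 * Illustrative sketch (editorial; a minimal user-space example, not part
 * of this patch): how a single property byte read from the guest's
 * PROPBASER table splits into the enable bit and the priority, mirroring
 * LPI_PROP_ENABLE_BIT() and LPI_PROP_PRIORITY() above.
 */
#include <stdint.h>

/* e.g. prop = 0xa1 -> enabled = 1, priority = 0xa0 */
void example_decode_lpi_prop(uint8_t prop, uint8_t *enabled, uint8_t *priority)
{
	*enabled  = prop & 0x01;	/* bit 0: LPI enabled   */
	*priority = prop & 0xfc;	/* bits [7:2]: priority */
}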
+
+/*
+ * Create a snapshot of the current LPI list, so that we can enumerate all
+ * LPIs without holding any lock.
+ * Returns the array length and puts the kmalloc'ed array into intid_ptr.
+ */
+static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq;
+       u32 *intids;
+       int irq_count = dist->lpi_list_count, i = 0;
+
+       /*
+        * We use the current value of the list length, which may change
+        * after the kmalloc. We don't care, because the guest shouldn't
+        * change anything while the command handling is still running,
+        * and in the worst case we would miss a new IRQ, which one wouldn't
+        * expect to be covered by this command anyway.
+        */
+       intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
+       if (!intids)
+               return -ENOMEM;
+
+       spin_lock(&dist->lpi_list_lock);
+       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+               /* We don't need to "get" the IRQ, as we hold the list lock. */
+               intids[i] = irq->intid;
+               if (++i == irq_count)
+                       break;
+       }
+       spin_unlock(&dist->lpi_list_lock);
+
+       *intid_ptr = intids;
+       return irq_count;
+}
+
+/*
+ * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
+ * is targeting) to the VGIC's view, which deals with target VCPUs.
+ * Needs to be called whenever either the collection of an LPI has
+ * changed or the collection itself got retargeted.
+ */
+static void update_affinity_itte(struct kvm *kvm, struct its_itte *itte)
+{
+       struct kvm_vcpu *vcpu;
+
+       if (!its_is_collection_mapped(itte->collection))
+               return;
+
+       vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+
+       spin_lock(&itte->irq->irq_lock);
+       itte->irq->target_vcpu = vcpu;
+       spin_unlock(&itte->irq->irq_lock);
+}
+
+/*
+ * Updates the target VCPU for every LPI targeting this collection.
+ * Must be called with the its_lock mutex held.
+ */
+static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
+                                      struct its_collection *coll)
+{
+       struct its_device *device;
+       struct its_itte *itte;
+
+       for_each_lpi_its(device, itte, its) {
+               if (!itte->collection || coll != itte->collection)
+                       continue;
+
+               update_affinity_itte(kvm, itte);
+       }
+}
+
+static u32 max_lpis_propbaser(u64 propbaser)
+{
+       int nr_idbits = (propbaser & 0x1f) + 1;
+
+       return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
+}
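
/*
 * Illustrative sketch (editorial; a minimal user-space example, not part
 * of this patch): the low five bits of PROPBASER hold "IDBits - 1", and
 * max_lpis_propbaser() above caps the result at INTERRUPT_ID_BITS_ITS.
 * The cap of 16 ID bits used below is an assumption for the example.
 */
#include <stdint.h>

/* e.g. PROPBASER IDBits field 0x0f -> 16 ID bits -> 65536 interrupt IDs */
uint32_t example_max_lpis(uint64_t propbaser)
{
	int nr_idbits = (propbaser & 0x1f) + 1;

	if (nr_idbits > 16)		/* assumed INTERRUPT_ID_BITS_ITS */
		nr_idbits = 16;
	return 1U << nr_idbits;
}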
+
+/*
+ * Scan the whole LPI pending table and sync the pending bit in there
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ */
+static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+       gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+       struct vgic_irq *irq;
+       int last_byte_offset = -1;
+       int ret = 0;
+       u32 *intids;
+       int nr_irqs, i;
+
+       nr_irqs = vgic_copy_lpi_list(vcpu->kvm, &intids);
+       if (nr_irqs < 0)
+               return nr_irqs;
+
+       for (i = 0; i < nr_irqs; i++) {
+               int byte_offset, bit_nr;
+               u8 pendmask;
+
+               byte_offset = intids[i] / BITS_PER_BYTE;
+               bit_nr = intids[i] % BITS_PER_BYTE;
+
+               /*
+                * For contiguously allocated LPIs chances are we just read
+                * this very same byte in the last iteration. Reuse that.
+                */
+               if (byte_offset != last_byte_offset) {
+                       ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset,
+                                            &pendmask, 1);
+                       if (ret) {
+                               kfree(intids);
+                               return ret;
+                       }
+                       last_byte_offset = byte_offset;
+               }
+
+               irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+               spin_lock(&irq->irq_lock);
+               irq->pending = pendmask & (1U << bit_nr);
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       kfree(intids);
+
+       return ret;
+}
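
/*
 * Illustrative sketch (editorial; a minimal user-space example, not part
 * of this patch): how an LPI INTID maps onto a byte offset and a bit
 * number within the guest's pending table, matching the arithmetic in the
 * loop above (BITS_PER_BYTE is 8).
 */
#include <stdint.h>

/* e.g. INTID 8195 -> byte_offset 1024, bit_nr 3 */
void example_pending_slot(uint32_t intid, uint32_t *byte_offset,
			  uint32_t *bit_nr)
{
	*byte_offset = intid / 8;
	*bit_nr      = intid % 8;
}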
+
+static unsigned long vgic_mmio_read_its_ctlr(struct kvm *kvm,
+                                            struct vgic_its *its,
+                                            gpa_t addr, unsigned int len)
+{
+       u32 reg = 0;
+
+       mutex_lock(&its->cmd_lock);
+       if (its->creadr == its->cwriter)
+               reg |= GITS_CTLR_QUIESCENT;
+       if (its->enabled)
+               reg |= GITS_CTLR_ENABLE;
+       mutex_unlock(&its->cmd_lock);
+
+       return reg;
+}
+
+static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       its->enabled = !!(val & GITS_CTLR_ENABLE);
+}
+
+static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
+                                             struct vgic_its *its,
+                                             gpa_t addr, unsigned int len)
+{
+       u64 reg = GITS_TYPER_PLPIS;
+
+       /*
+        * We use linear CPU numbers for redistributor addressing,
+        * so GITS_TYPER.PTA is 0.
+        * Also we force all PROPBASER registers to be the same, so
+        * CommonLPIAff is 0 as well.
+        * To avoid memory waste in the guest, we keep the number of IDBits and
+        * DevBits low - at least for the time being.
+        */
+       reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
+       reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
+
+       return extract_bytes(reg, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
+                                            struct vgic_its *its,
+                                            gpa_t addr, unsigned int len)
+{
+       return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       switch (addr & 0xffff) {
+       case GITS_PIDR0:
+               return 0x92;    /* part number, bits[7:0] */
+       case GITS_PIDR1:
+               return 0xb4;    /* part number, bits[11:8] */
+       case GITS_PIDR2:
+               return GIC_PIDR2_ARCH_GICv3 | 0x0b;
+       case GITS_PIDR4:
+               return 0x40;    /* This is a 64K software visible page */
+       /* The following are the ID registers for (any) GIC. */
+       case GITS_CIDR0:
+               return 0x0d;
+       case GITS_CIDR1:
+               return 0xf0;
+       case GITS_CIDR2:
+               return 0x05;
+       case GITS_CIDR3:
+               return 0xb1;
+       }
+
+       return 0;
+}
+
+/*
+ * Find the target VCPU and the LPI number for a given devid/eventid pair
+ * and make this IRQ pending, possibly injecting it.
+ * Must be called with the its_lock mutex held.
+ */
+static void vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
+                                u32 devid, u32 eventid)
+{
+       struct its_itte *itte;
+
+       if (!its->enabled)
+               return;
+
+       itte = find_itte(its, devid, eventid);
+       /* Triggering an unmapped IRQ gets silently dropped. */
+       if (itte && its_is_collection_mapped(itte->collection)) {
+               struct kvm_vcpu *vcpu;
+
+               vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+               if (vcpu && vcpu->arch.vgic_cpu.lpis_enabled) {
+                       spin_lock(&itte->irq->irq_lock);
+                       itte->irq->pending = true;
+                       vgic_queue_irq_unlock(kvm, itte->irq);
+               }
+       }
+}
+
+/*
+ * Queries the KVM IO bus framework to get the ITS pointer from the given
+ * doorbell address.
+ * We then call vgic_its_trigger_msi() with the decoded data.
+ */
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+       u64 address;
+       struct kvm_io_device *kvm_io_dev;
+       struct vgic_io_device *iodev;
+
+       if (!vgic_has_its(kvm))
+               return -ENODEV;
+
+       if (!(msi->flags & KVM_MSI_VALID_DEVID))
+               return -EINVAL;
+
+       address = (u64)msi->address_hi << 32 | msi->address_lo;
+
+       kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
+       if (!kvm_io_dev)
+               return -ENODEV;
+
+       iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+
+       mutex_lock(&iodev->its->its_lock);
+       vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data);
+       mutex_unlock(&iodev->its->its_lock);
+
+       return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void its_free_itte(struct kvm *kvm, struct its_itte *itte)
+{
+       list_del(&itte->itte_list);
+
+       /* This put matches the get in vgic_add_lpi. */
+       vgic_put_irq(kvm, itte->irq);
+
+       kfree(itte);
+}
+
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+       return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
+}
+
+#define its_cmd_get_command(cmd)       its_cmd_mask_field(cmd, 0,  0,  8)
+#define its_cmd_get_deviceid(cmd)      its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_id(cmd)            its_cmd_mask_field(cmd, 1,  0, 32)
+#define its_cmd_get_physical_id(cmd)   its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd)    its_cmd_mask_field(cmd, 2,  0, 16)
+#define its_cmd_get_target_addr(cmd)   its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd)      its_cmd_mask_field(cmd, 2, 63,  1)
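
/*
 * Illustrative sketch (editorial; a minimal user-space example, not part
 * of this patch): pulling one field out of a 32-byte ITS command,
 * mirroring its_cmd_mask_field() above once the four 64-bit words have
 * been converted from little endian. For instance, word 0, shift 32,
 * size 32 yields the device ID.
 */
#include <stdint.h>

uint64_t example_cmd_field(const uint64_t *cmd, int word, int shift, int size)
{
	return (cmd[word] >> shift) & ((1ULL << size) - 1);
}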
+
+/*
+ * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
+                                      u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_itte *itte;
+
+
+       itte = find_itte(its, device_id, event_id);
+       if (itte && itte->collection) {
+               /*
+                * Though the spec talks about removing the pending state, we
+                * don't bother here since we clear the ITTE anyway and the
+                * pending state is a property of the ITTE struct.
+                */
+               its_free_itte(kvm, itte);
+               return 0;
+       }
+
+       return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+}
+
+/*
+ * The MOVI command moves an ITTE to a different collection.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct kvm_vcpu *vcpu;
+       struct its_itte *itte;
+       struct its_collection *collection;
+
+       itte = find_itte(its, device_id, event_id);
+       if (!itte)
+               return E_ITS_MOVI_UNMAPPED_INTERRUPT;
+
+       if (!its_is_collection_mapped(itte->collection))
+               return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+       collection = find_collection(its, coll_id);
+       if (!its_is_collection_mapped(collection))
+               return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+       itte->collection = collection;
+       vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+       spin_lock(&itte->irq->irq_lock);
+       itte->irq->target_vcpu = vcpu;
+       spin_unlock(&itte->irq->irq_lock);
+
+       return 0;
+}
+
+/*
+ * Check whether an ID can be stored into the corresponding guest table.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessible).
+ * For this we have to read the respective first level entry.
+ */
+static bool vgic_its_check_id(struct vgic_its *its, u64 baser, int id)
+{
+       int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+       int index;
+       u64 indirect_ptr;
+       gfn_t gfn;
+
+       if (!(baser & GITS_BASER_INDIRECT)) {
+               phys_addr_t addr;
+
+               if (id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(baser)))
+                       return false;
+
+               addr = BASER_ADDRESS(baser) + id * GITS_BASER_ENTRY_SIZE(baser);
+               gfn = addr >> PAGE_SHIFT;
+
+               return kvm_is_visible_gfn(its->dev->kvm, gfn);
+       }
+
+       /* calculate and check the index into the 1st level */
+       index = id / (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+       if (index >= (l1_tbl_size / sizeof(u64)))
+               return false;
+
+       /* Each 1st level entry is represented by a 64-bit value. */
+       if (kvm_read_guest(its->dev->kvm,
+                          BASER_ADDRESS(baser) + index * sizeof(indirect_ptr),
+                          &indirect_ptr, sizeof(indirect_ptr)))
+               return false;
+
+       indirect_ptr = le64_to_cpu(indirect_ptr);
+
+       /* check the valid bit of the first level entry */
+       if (!(indirect_ptr & BIT_ULL(63)))
+               return false;
+
+       /*
+        * Mask the guest physical address and calculate the frame number.
+        * Any address beyond our supported 48 bits of PA will be caught
+        * by the actual check in the final step.
+        */
+       indirect_ptr &= GENMASK_ULL(51, 16);
+
+       /* Find the address of the actual entry */
+       index = id % (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+       indirect_ptr += index * GITS_BASER_ENTRY_SIZE(baser);
+       gfn = indirect_ptr >> PAGE_SHIFT;
+
+       return kvm_is_visible_gfn(its->dev->kvm, gfn);
+}
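
/*
 * Illustrative sketch (editorial; a minimal user-space example, not part
 * of this patch): the two-level index arithmetic used above for indirect
 * tables. With 64K level-2 pages, an ID selects a level-1 slot and a byte
 * offset within the level-2 page that slot points to; entry_size
 * corresponds to GITS_BASER_ENTRY_SIZE(baser).
 */
#include <stdint.h>

void example_indirect_index(uint32_t id, uint32_t entry_size,
			    uint32_t *l1_index, uint32_t *l2_offset)
{
	uint32_t entries_per_page = (64 * 1024) / entry_size;

	*l1_index  = id / entries_per_page;
	*l2_offset = (id % entries_per_page) * entry_size;
}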
+
+static int vgic_its_alloc_collection(struct vgic_its *its,
+                                    struct its_collection **colp,
+                                    u32 coll_id)
+{
+       struct its_collection *collection;
+
+       if (!vgic_its_check_id(its, its->baser_coll_table, coll_id))
+               return E_ITS_MAPC_COLLECTION_OOR;
+
+       collection = kzalloc(sizeof(*collection), GFP_KERNEL);
+
+       collection->collection_id = coll_id;
+       collection->target_addr = COLLECTION_NOT_MAPPED;
+
+       list_add_tail(&collection->coll_list, &its->collection_list);
+       *colp = collection;
+
+       return 0;
+}
+
+static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
+{
+       struct its_collection *collection;
+       struct its_device *device;
+       struct its_itte *itte;
+
+       /*
+        * Clearing the mapping for that collection ID removes the
+        * entry from the list. If there wasn't any before, we can
+        * go home early.
+        */
+       collection = find_collection(its, coll_id);
+       if (!collection)
+               return;
+
+       for_each_lpi_its(device, itte, its)
+               if (itte->collection &&
+                   itte->collection->collection_id == coll_id)
+                       itte->collection = NULL;
+
+       list_del(&collection->coll_list);
+       kfree(collection);
+}
+
+/*
+ * The MAPTI and MAPI commands map LPIs to ITTEs.
+ * Must be called with its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct its_itte *itte;
+       struct its_device *device;
+       struct its_collection *collection, *new_coll = NULL;
+       int lpi_nr;
+
+       device = find_its_device(its, device_id);
+       if (!device)
+               return E_ITS_MAPTI_UNMAPPED_DEVICE;
+
+       if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
+               lpi_nr = its_cmd_get_physical_id(its_cmd);
+       else
+               lpi_nr = event_id;
+       if (lpi_nr < GIC_LPI_OFFSET ||
+           lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
+               return E_ITS_MAPTI_PHYSICALID_OOR;
+
+       collection = find_collection(its, coll_id);
+       if (!collection) {
+               int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+               if (ret)
+                       return ret;
+               new_coll = collection;
+       }
+
+       itte = find_itte(its, device_id, event_id);
+       if (!itte) {
+               itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL);
+               if (!itte) {
+                       if (new_coll)
+                               vgic_its_free_collection(its, coll_id);
+                       return -ENOMEM;
+               }
+
+               itte->event_id  = event_id;
+               list_add_tail(&itte->itte_list, &device->itt_head);
+       }
+
+       itte->collection = collection;
+       itte->lpi = lpi_nr;
+       itte->irq = vgic_add_lpi(kvm, lpi_nr);
+       update_affinity_itte(kvm, itte);
+
+       /*
+        * We "cache" the configuration table entries in out struct vgic_irq's.
+        * However we only have those structs for mapped IRQs, so we read in
+        * the respective config data from memory here upon mapping the LPI.
+        */
+       update_lpi_config(kvm, itte->irq, NULL);
+
+       return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
+{
+       struct its_itte *itte, *temp;
+
+       /*
+        * The spec says that unmapping a device while it still has valid
+        * ITTEs associated with it is UNPREDICTABLE. We remove all ITTEs
+        * here, since leaving them behind would leak their memory.
+        */
+       list_for_each_entry_safe(itte, temp, &device->itt_head, itte_list)
+               its_free_itte(kvm, itte);
+
+       list_del(&device->dev_list);
+       kfree(device);
+}
+
+/*
+ * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       bool valid = its_cmd_get_validbit(its_cmd);
+       struct its_device *device;
+
+       if (!vgic_its_check_id(its, its->baser_device_table, device_id))
+               return E_ITS_MAPD_DEVICE_OOR;
+
+       device = find_its_device(its, device_id);
+
+       /*
+        * The spec says that calling MAPD on an already mapped device
+        * invalidates all cached data for this device. We implement this
+        * by removing the mapping and re-establishing it.
+        */
+       if (device)
+               vgic_its_unmap_device(kvm, device);
+
+       /*
+        * The spec does not say whether unmapping an unmapped device
+        * is an error, so we are done in any case.
+        */
+       if (!valid)
+               return 0;
+
+       device = kzalloc(sizeof(struct its_device), GFP_KERNEL);
+       if (!device)
+               return -ENOMEM;
+
+       device->device_id = device_id;
+       INIT_LIST_HEAD(&device->itt_head);
+
+       list_add_tail(&device->dev_list, &its->device_list);
+
+       return 0;
+}
+
+/*
+ * The MAPC command maps collection IDs to redistributors.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u16 coll_id;
+       u32 target_addr;
+       struct its_collection *collection;
+       bool valid;
+
+       valid = its_cmd_get_validbit(its_cmd);
+       coll_id = its_cmd_get_collection(its_cmd);
+       target_addr = its_cmd_get_target_addr(its_cmd);
+
+       if (target_addr >= atomic_read(&kvm->online_vcpus))
+               return E_ITS_MAPC_PROCNUM_OOR;
+
+       if (!valid) {
+               vgic_its_free_collection(its, coll_id);
+       } else {
+               collection = find_collection(its, coll_id);
+
+               if (!collection) {
+                       int ret;
+
+                       ret = vgic_its_alloc_collection(its, &collection,
+                                                       coll_id);
+                       if (ret)
+                               return ret;
+                       collection->target_addr = target_addr;
+               } else {
+                       collection->target_addr = target_addr;
+                       update_affinity_collection(kvm, its, collection);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * The CLEAR command removes the pending state for a particular LPI.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
+                                    u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_itte *itte;
+
+       itte = find_itte(its, device_id, event_id);
+       if (!itte)
+               return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
+
+       itte->irq->pending = false;
+
+       return 0;
+}
+
+/*
+ * The INV command syncs the configuration bits from the memory table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_itte *itte;
+
+       itte = find_itte(its, device_id, event_id);
+       if (!itte)
+               return E_ITS_INV_UNMAPPED_INTERRUPT;
+
+       return update_lpi_config(kvm, itte->irq, NULL);
+}
+
+/*
+ * The INVALL command requests flushing of all IRQ data in this collection.
+ * Find the VCPU mapped to that collection, then iterate over the VM's list
+ * of mapped LPIs and update the configuration for each IRQ which targets
+ * the specified vcpu. The configuration will be read from the in-memory
+ * configuration table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
+                                     u64 *its_cmd)
+{
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct its_collection *collection;
+       struct kvm_vcpu *vcpu;
+       struct vgic_irq *irq;
+       u32 *intids;
+       int irq_count, i;
+
+       collection = find_collection(its, coll_id);
+       if (!its_is_collection_mapped(collection))
+               return E_ITS_INVALL_UNMAPPED_COLLECTION;
+
+       vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+       irq_count = vgic_copy_lpi_list(kvm, &intids);
+       if (irq_count < 0)
+               return irq_count;
+
+       for (i = 0; i < irq_count; i++) {
+               irq = vgic_get_irq(kvm, NULL, intids[i]);
+               if (!irq)
+                       continue;
+               update_lpi_config(kvm, irq, vcpu);
+               vgic_put_irq(kvm, irq);
+       }
+
+       kfree(intids);
+
+       return 0;
+}
+
+/*
+ * The MOVALL command moves the pending state of all IRQs targeting one
+ * redistributor to another. We don't hold the pending state in the VCPUs,
+ * but in the IRQs instead, so there is really not much to do for us here.
+ * However the spec says that no IRQ must target the old redistributor
+ * afterwards, so we make sure that no LPI is using the associated target_vcpu.
+ * This command affects all LPIs in the system that target that redistributor.
+ */
+static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
+                                     u64 *its_cmd)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       u32 target1_addr = its_cmd_get_target_addr(its_cmd);
+       u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+       struct kvm_vcpu *vcpu1, *vcpu2;
+       struct vgic_irq *irq;
+
+       if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
+           target2_addr >= atomic_read(&kvm->online_vcpus))
+               return E_ITS_MOVALL_PROCNUM_OOR;
+
+       if (target1_addr == target2_addr)
+               return 0;
+
+       vcpu1 = kvm_get_vcpu(kvm, target1_addr);
+       vcpu2 = kvm_get_vcpu(kvm, target2_addr);
+
+       spin_lock(&dist->lpi_list_lock);
+
+       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+               spin_lock(&irq->irq_lock);
+
+               if (irq->target_vcpu == vcpu1)
+                       irq->target_vcpu = vcpu2;
+
+               spin_unlock(&irq->irq_lock);
+       }
+
+       spin_unlock(&dist->lpi_list_lock);
+
+       return 0;
+}
+
+/*
+ * The INT command injects the LPI associated with that DevID/EvID pair.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       u32 msi_data = its_cmd_get_id(its_cmd);
+       u64 msi_devid = its_cmd_get_deviceid(its_cmd);
+
+       vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
+
+       return 0;
+}
+
+/*
+ * This function is called with the its_cmd lock held, but the ITS data
+ * structure lock dropped.
+ */
+static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       int ret = -ENODEV;
+
+       mutex_lock(&its->its_lock);
+       switch (its_cmd_get_command(its_cmd)) {
+       case GITS_CMD_MAPD:
+               ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MAPC:
+               ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MAPI:
+               ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MAPTI:
+               ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MOVI:
+               ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_DISCARD:
+               ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_CLEAR:
+               ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MOVALL:
+               ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INT:
+               ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INV:
+               ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INVALL:
+               ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_SYNC:
+               /* we ignore this command: we are in sync all of the time */
+               ret = 0;
+               break;
+       }
+       mutex_unlock(&its->its_lock);
+
+       return ret;
+}
+
+static u64 vgic_sanitise_its_baser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
+                                 GITS_BASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
+                                 GITS_BASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
+                                 GITS_BASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */
+       reg &= ~GENMASK_ULL(15, 12);
+
+       /* We support only one (ITS) page size: 64K */
+       reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
+
+       return reg;
+}
+
+static u64 vgic_sanitise_its_cbaser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
+                                 GITS_CBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
+                                 GITS_CBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
+                                 GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       /*
+        * Sanitise the physical address to be 64k aligned.
+        * Also limit the physical addresses to 48 bits.
+        */
+       reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12));
+
+       return reg;
+}
+
+static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       return extract_bytes(its->cbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
+                                      gpa_t addr, unsigned int len,
+                                      unsigned long val)
+{
+       /* When GITS_CTLR.Enable is 1, this register is RO. */
+       if (its->enabled)
+               return;
+
+       mutex_lock(&its->cmd_lock);
+       its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
+       its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
+       its->creadr = 0;
+       /*
+        * CWRITER is architecturally UNKNOWN on reset, but we need to reset
+        * it to CREADR to make sure we start with an empty command buffer.
+        */
+       its->cwriter = its->creadr;
+       mutex_unlock(&its->cmd_lock);
+}
+
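+/*
+ * CBASER[7:0] encodes the number of 4K pages allocated to the command
+ * queue, minus one. Each ITS command is 32 bytes long, and CWRITER/CREADR
+ * hold byte offsets into that buffer with bits [4:0] treated as zero.
+ */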
+#define ITS_CMD_BUFFER_SIZE(baser)     ((((baser) & 0xff) + 1) << 12)
+#define ITS_CMD_SIZE                   32
+#define ITS_CMD_OFFSET(reg)            ((reg) & GENMASK(19, 5))
+
+/*
+ * By writing to CWRITER the guest announces new commands to be processed.
+ * To avoid any races in the first place, we take the its_cmd lock, which
+ * protects our ring buffer variables, so that there is only one user
+ * per ITS handling commands at a given time.
+ */
+static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
+                                       gpa_t addr, unsigned int len,
+                                       unsigned long val)
+{
+       gpa_t cbaser;
+       u64 cmd_buf[4];
+       u32 reg;
+
+       if (!its)
+               return;
+
+       mutex_lock(&its->cmd_lock);
+
+       reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
+       reg = ITS_CMD_OFFSET(reg);
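+       /* Writes pointing outside the command buffer are simply ignored. */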
+       if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+               mutex_unlock(&its->cmd_lock);
+               return;
+       }
+
+       its->cwriter = reg;
+       cbaser = CBASER_ADDRESS(its->cbaser);
+
+       while (its->cwriter != its->creadr) {
+               int ret = kvm_read_guest(kvm, cbaser + its->creadr,
+                                        cmd_buf, ITS_CMD_SIZE);
+               /*
+                * If kvm_read_guest() fails, this could be due to the guest
+                * programming a bogus value in CBASER or something else going
+                * wrong from which we cannot easily recover.
+                * According to section 6.3.2 in the GICv3 spec we can just
+                * ignore that command then.
+                */
+               if (!ret)
+                       vgic_its_handle_command(kvm, its, cmd_buf);
+
+               its->creadr += ITS_CMD_SIZE;
+               if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
+                       its->creadr = 0;
+       }
+
+       mutex_unlock(&its->cmd_lock);
+}
+
+static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
+                                               struct vgic_its *its,
+                                               gpa_t addr, unsigned int len)
+{
+       return extract_bytes(its->cwriter, addr & 0x7, len);
+}
+
+static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       return extract_bytes(its->creadr, addr & 0x7, len);
+}
+
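+/* The GITS_BASER<n> registers are consecutive 64-bit registers, 8 bytes apart. */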
+#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
+static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
+                                             struct vgic_its *its,
+                                             gpa_t addr, unsigned int len)
+{
+       u64 reg;
+
+       switch (BASER_INDEX(addr)) {
+       case 0:
+               reg = its->baser_device_table;
+               break;
+       case 1:
+               reg = its->baser_coll_table;
+               break;
+       default:
+               reg = 0;
+               break;
+       }
+
+       return extract_bytes(reg, addr & 7, len);
+}
+
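+/* The Type (bits [58:56]) and Entry_Size (bits [52:48]) fields are read-only. */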
+#define GITS_BASER_RO_MASK     (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
+static void vgic_mmio_write_its_baser(struct kvm *kvm,
+                                     struct vgic_its *its,
+                                     gpa_t addr, unsigned int len,
+                                     unsigned long val)
+{
+       u64 entry_size, device_type;
+       u64 reg, *regptr, clearbits = 0;
+
+       /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
+       if (its->enabled)
+               return;
+
+       switch (BASER_INDEX(addr)) {
+       case 0:
+               regptr = &its->baser_device_table;
+               entry_size = 8;
+               device_type = GITS_BASER_TYPE_DEVICE;
+               break;
+       case 1:
+               regptr = &its->baser_coll_table;
+               entry_size = 8;
+               device_type = GITS_BASER_TYPE_COLLECTION;
+               clearbits = GITS_BASER_INDIRECT;
+               break;
+       default:
+               return;
+       }
+
+       reg = update_64bit_reg(*regptr, addr & 7, len, val);
+       reg &= ~GITS_BASER_RO_MASK;
+       reg &= ~clearbits;
+
+       reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
+       reg |= device_type << GITS_BASER_TYPE_SHIFT;
+       reg = vgic_sanitise_its_baser(reg);
+
+       *regptr = reg;
+}
+
+#define REGISTER_ITS_DESC(off, rd, wr, length, acc)            \
+{                                                              \
+       .reg_offset = off,                                      \
+       .len = length,                                          \
+       .access_flags = acc,                                    \
+       .its_read = rd,                                         \
+       .its_write = wr,                                        \
+}
+
+static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
+                             gpa_t addr, unsigned int len, unsigned long val)
+{
+       /* Ignore */
+}
+
+static struct vgic_register_region its_registers[] = {
+       REGISTER_ITS_DESC(GITS_CTLR,
+               vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_IIDR,
+               vgic_mmio_read_its_iidr, its_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_TYPER,
+               vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_CBASER,
+               vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_CWRITER,
+               vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_CREADR,
+               vgic_mmio_read_its_creadr, its_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_BASER,
+               vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_IDREGS_BASE,
+               vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
+               VGIC_ACCESS_32bit),
+};
+
+/* This is called on setting the LPI enable bit in the redistributor. */
+void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
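+       /* PTZ (Pending Table Zero) set means there is no pending state to read. */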
+       if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
+               its_sync_lpi_pending_table(vcpu);
+}
+
+static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
+{
+       struct vgic_io_device *iodev = &its->iodev;
+       int ret;
+
+       if (its->initialized)
+               return 0;
+
+       if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base))
+               return -ENXIO;
+
+       iodev->regions = its_registers;
+       iodev->nr_regions = ARRAY_SIZE(its_registers);
+       kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
+
+       iodev->base_addr = its->vgic_its_base;
+       iodev->iodev_type = IODEV_ITS;
+       iodev->its = its;
+       mutex_lock(&kvm->slots_lock);
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
+                                     KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
+       mutex_unlock(&kvm->slots_lock);
+
+       if (!ret)
+               its->initialized = true;
+
+       return ret;
+}
+
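+/*
+ * Default attributes for the device and collection tables: 8-byte entries
+ * (encoded as size minus one), 64K pages, inner-shareable, read-allocate
+ * write-back inner cacheability and outer cacheability same as inner.
+ */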
+#define INITIAL_BASER_VALUE                                              \
+       (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)                | \
+        GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)         | \
+        GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)             | \
+        ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT)                    | \
+        GITS_BASER_PAGE_SIZE_64K)
+
+#define INITIAL_PROPBASER_VALUE                                                  \
+       (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)            | \
+        GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner)     | \
+        GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
+
+static int vgic_its_create(struct kvm_device *dev, u32 type)
+{
+       struct vgic_its *its;
+
+       if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
+               return -ENODEV;
+
+       its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
+       if (!its)
+               return -ENOMEM;
+
+       mutex_init(&its->its_lock);
+       mutex_init(&its->cmd_lock);
+
+       its->vgic_its_base = VGIC_ADDR_UNDEF;
+
+       INIT_LIST_HEAD(&its->device_list);
+       INIT_LIST_HEAD(&its->collection_list);
+
+       dev->kvm->arch.vgic.has_its = true;
+       its->initialized = false;
+       its->enabled = false;
+       its->dev = dev;
+
+       its->baser_device_table = INITIAL_BASER_VALUE                   |
+               ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
+       its->baser_coll_table = INITIAL_BASER_VALUE |
+               ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
+       dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
+
+       dev->private = its;
+
+       return 0;
+}
+
+static void vgic_its_destroy(struct kvm_device *kvm_dev)
+{
+       struct kvm *kvm = kvm_dev->kvm;
+       struct vgic_its *its = kvm_dev->private;
+       struct its_device *dev;
+       struct its_itte *itte;
+       struct list_head *dev_cur, *dev_temp;
+       struct list_head *cur, *temp;
+
+       /*
+        * We may end up here without the lists ever having been initialized.
+        * Check this and bail out early to avoid dereferencing a NULL pointer.
+        */
+       if (!its->device_list.next)
+               return;
+
+       mutex_lock(&its->its_lock);
+       list_for_each_safe(dev_cur, dev_temp, &its->device_list) {
+               dev = container_of(dev_cur, struct its_device, dev_list);
+               list_for_each_safe(cur, temp, &dev->itt_head) {
+                       itte = container_of(cur, struct its_itte, itte_list);
+                       its_free_itte(kvm, itte);
+               }
+               list_del(dev_cur);
+               kfree(dev);
+       }
+
+       list_for_each_safe(cur, temp, &its->collection_list) {
+               list_del(cur);
+               kfree(container_of(cur, struct its_collection, coll_list));
+       }
+       mutex_unlock(&its->its_lock);
+
+       kfree(its);
+}
+
+static int vgic_its_has_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR:
+               switch (attr->attr) {
+               case KVM_VGIC_ITS_ADDR_TYPE:
+                       return 0;
+               }
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       return 0;
+               }
+               break;
+       }
+       return -ENXIO;
+}
+
+static int vgic_its_set_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       struct vgic_its *its = dev->private;
+       int ret;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               unsigned long type = (unsigned long)attr->attr;
+               u64 addr;
+
+               if (type != KVM_VGIC_ITS_ADDR_TYPE)
+                       return -ENODEV;
+
+               if (its->initialized)
+                       return -EBUSY;
+
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
+               ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
+                                       addr, SZ_64K);
+               if (ret)
+                       return ret;
+
+               its->vgic_its_base = addr;
+
+               return 0;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       return vgic_its_init_its(dev->kvm, its);
+               }
+               break;
+       }
+       return -ENXIO;
+}
+
+static int vgic_its_get_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               struct vgic_its *its = dev->private;
+               u64 addr = its->vgic_its_base;
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               if (type != KVM_VGIC_ITS_ADDR_TYPE)
+                       return -ENODEV;
+
+               if (copy_to_user(uaddr, &addr, sizeof(addr)))
+                       return -EFAULT;
+               break;
+       }
+       default:
+               return -ENXIO;
+       }
+
+       return 0;
+}
+
+static struct kvm_device_ops kvm_arm_vgic_its_ops = {
+       .name = "kvm-arm-vgic-its",
+       .create = vgic_its_create,
+       .destroy = vgic_its_destroy,
+       .set_attr = vgic_its_set_attr,
+       .get_attr = vgic_its_get_attr,
+       .has_attr = vgic_its_has_attr,
+};
+
+int kvm_vgic_register_its_device(void)
+{
+       return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
+                                      KVM_DEV_TYPE_ARM_VGIC_ITS);
+}
index 0130c4b..1813f93 100644 (file)
@@ -21,8 +21,8 @@
 
 /* common helpers */
 
-static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-                            phys_addr_t addr, phys_addr_t alignment)
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                     phys_addr_t addr, phys_addr_t alignment)
 {
        if (addr & ~KVM_PHYS_MASK)
                return -E2BIG;
@@ -210,20 +210,27 @@ static void vgic_destroy(struct kvm_device *dev)
        kfree(dev);
 }
 
-void kvm_register_vgic_device(unsigned long type)
+int kvm_register_vgic_device(unsigned long type)
 {
+       int ret = -ENODEV;
+
        switch (type) {
        case KVM_DEV_TYPE_ARM_VGIC_V2:
-               kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-                                       KVM_DEV_TYPE_ARM_VGIC_V2);
+               ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+                                             KVM_DEV_TYPE_ARM_VGIC_V2);
                break;
 #ifdef CONFIG_KVM_ARM_VGIC_V3
        case KVM_DEV_TYPE_ARM_VGIC_V3:
-               kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
-                                       KVM_DEV_TYPE_ARM_VGIC_V3);
+               ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+                                             KVM_DEV_TYPE_ARM_VGIC_V3);
+               if (ret)
+                       break;
+               ret = kvm_vgic_register_its_device();
                break;
 #endif
        }
+
+       return ret;
 }
 
 /** vgic_attr_regs_access: allows user space to read/write VGIC registers
@@ -428,4 +435,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
 };
 
 #endif /* CONFIG_KVM_ARM_VGIC_V3 */
-
index a213936..b44b359 100644 (file)
@@ -102,6 +102,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
                irq->source |= 1U << source_vcpu->vcpu_id;
 
                vgic_queue_irq_unlock(source_vcpu->kvm, irq);
+               vgic_put_irq(source_vcpu->kvm, irq);
        }
 }
 
@@ -116,6 +117,8 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
                val |= (u64)irq->targets << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return val;
@@ -143,6 +146,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
                irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -157,6 +161,8 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
                val |= (u64)irq->source << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
        return val;
 }
@@ -178,6 +184,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
                        irq->pending = false;
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -201,6 +208,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
                } else {
                        spin_unlock(&irq->irq_lock);
                }
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -429,6 +437,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
        struct vgic_io_device dev = {
                .regions = vgic_v2_cpu_registers,
                .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+               .iodev_type = IODEV_CPUIF,
        };
 
        return vgic_uaccess(vcpu, &dev, is_write, offset, val);
@@ -440,6 +449,7 @@ int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
        struct vgic_io_device dev = {
                .regions = vgic_v2_dist_registers,
                .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+               .iodev_type = IODEV_DIST,
        };
 
        return vgic_uaccess(vcpu, &dev, is_write, offset, val);
index a0c515a..ff668e0 100644 (file)
 #include "vgic-mmio.h"
 
 /* extract @num bytes at @offset bytes offset in data */
-static unsigned long extract_bytes(unsigned long data, unsigned int offset,
-                                  unsigned int num)
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+                           unsigned int num)
 {
        return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
 }
 
+/* allows updates of any half of a 64-bit register (or the whole thing) */
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+                    unsigned long val)
+{
+       int lower = (offset & 4) * 8;
+       int upper = lower + 8 * len - 1;
+
+       reg &= ~GENMASK_ULL(upper, lower);
+       val &= GENMASK_ULL(len * 8 - 1, 0);
+
+       return reg | ((u64)val << lower);
+}
+
+bool vgic_has_its(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+               return false;
+
+       return dist->has_its;
+}
+
 static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
                                            gpa_t addr, unsigned int len)
 {
@@ -43,7 +66,12 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
        case GICD_TYPER:
                value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
                value = (value >> 5) - 1;
-               value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+               if (vgic_has_its(vcpu->kvm)) {
+                       value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+                       value |= GICD_TYPER_LPIS;
+               } else {
+                       value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+               }
                break;
        case GICD_IIDR:
                value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
@@ -80,15 +108,17 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
 {
        int intid = VGIC_ADDR_TO_INTID(addr, 64);
        struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+       unsigned long ret = 0;
 
        if (!irq)
                return 0;
 
        /* The upper word is RAZ for us. */
-       if (addr & 4)
-               return 0;
+       if (!(addr & 4))
+               ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
 
-       return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+       vgic_put_irq(vcpu->kvm, irq);
+       return ret;
 }
 
 static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
@@ -96,15 +126,17 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
                                    unsigned long val)
 {
        int intid = VGIC_ADDR_TO_INTID(addr, 64);
-       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-
-       if (!irq)
-               return;
+       struct vgic_irq *irq;
 
        /* The upper word is WI for us since we don't implement Aff3. */
        if (addr & 4)
                return;
 
+       irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+       if (!irq)
+               return;
+
        spin_lock(&irq->irq_lock);
 
        /* We only care about and preserve Aff0, Aff1 and Aff2. */
@@ -112,6 +144,32 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
        irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
 
        spin_unlock(&irq->irq_lock);
+       vgic_put_irq(vcpu->kvm, irq);
+}
+
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       bool was_enabled = vgic_cpu->lpis_enabled;
+
+       if (!vgic_has_its(vcpu->kvm))
+               return;
+
+       vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+       if (!was_enabled && vgic_cpu->lpis_enabled)
+               vgic_enable_lpis(vcpu);
 }
 
 static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
@@ -125,6 +183,8 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
        value |= ((target_vcpu_id & 0xffff) << 8);
        if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
                value |= GICR_TYPER_LAST;
+       if (vgic_has_its(vcpu->kvm))
+               value |= GICR_TYPER_PLPIS;
 
        return extract_bytes(value, addr & 7, len);
 }
@@ -147,6 +207,142 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+/* We want to avoid outer shareable. */
+u64 vgic_sanitise_shareability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_OuterShareable:
+               return GIC_BASER_InnerShareable;
+       default:
+               return field;
+       }
+}
+
+/* Avoid any inner non-cacheable mapping. */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_CACHE_nCnB:
+       case GIC_BASER_CACHE_nC:
+               return GIC_BASER_CACHE_RaWb;
+       default:
+               return field;
+       }
+}
+
+/* Non-cacheable or same-as-inner are OK. */
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_CACHE_SameAsInner:
+       case GIC_BASER_CACHE_nC:
+               return field;
+       default:
+               return GIC_BASER_CACHE_nC;
+       }
+}
+
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+                       u64 (*sanitise_fn)(u64))
+{
+       u64 field = (reg & field_mask) >> field_shift;
+
+       field = sanitise_fn(field) << field_shift;
+       return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK                                            \
+       (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK                                            \
+       (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |      \
+        GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+                                 GICR_PENDBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+                                 GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+                                 GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       reg &= ~PENDBASER_RES0_MASK;
+       reg &= ~GENMASK_ULL(51, 48);
+
+       return reg;
+}
+
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+                                 GICR_PROPBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+                                 GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+                                 GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       reg &= ~PROPBASER_RES0_MASK;
+       reg &= ~GENMASK_ULL(51, 48);
+       return reg;
+}
+
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       return extract_bytes(dist->propbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       u64 propbaser = dist->propbaser;
+
+       /* Storing a value with LPIs already enabled is undefined */
+       if (vgic_cpu->lpis_enabled)
+               return;
+
+       propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+       propbaser = vgic_sanitise_propbaser(propbaser);
+
+       dist->propbaser = propbaser;
+}
+
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       return extract_bytes(vgic_cpu->pendbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       u64 pendbaser = vgic_cpu->pendbaser;
+
+       /* Storing a value with LPIs already enabled is undefined */
+       if (vgic_cpu->lpis_enabled)
+               return;
+
+       pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+       pendbaser = vgic_sanitise_pendbaser(pendbaser);
+
+       vgic_cpu->pendbaser = pendbaser;
+}
+
 /*
  * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
  * redistributors, while SPIs are covered by registers in the distributor
@@ -218,7 +414,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
 
 static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
        REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
                vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
@@ -227,10 +423,10 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
                vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
                vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
@@ -285,24 +481,18 @@ unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
 
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
 {
-       int nr_vcpus = atomic_read(&kvm->online_vcpus);
        struct kvm_vcpu *vcpu;
-       struct vgic_io_device *devices;
        int c, ret = 0;
 
-       devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2,
-                         GFP_KERNEL);
-       if (!devices)
-               return -ENOMEM;
-
        kvm_for_each_vcpu(c, vcpu, kvm) {
                gpa_t rd_base = redist_base_address + c * SZ_64K * 2;
                gpa_t sgi_base = rd_base + SZ_64K;
-               struct vgic_io_device *rd_dev = &devices[c * 2];
-               struct vgic_io_device *sgi_dev = &devices[c * 2 + 1];
+               struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+               struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev;
 
                kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
                rd_dev->base_addr = rd_base;
+               rd_dev->iodev_type = IODEV_REDIST;
                rd_dev->regions = vgic_v3_rdbase_registers;
                rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
                rd_dev->redist_vcpu = vcpu;
@@ -317,6 +507,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
 
                kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops);
                sgi_dev->base_addr = sgi_base;
+               sgi_dev->iodev_type = IODEV_REDIST;
                sgi_dev->regions = vgic_v3_sgibase_registers;
                sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers);
                sgi_dev->redist_vcpu = vcpu;
@@ -335,14 +526,15 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
        if (ret) {
                /* The current c failed, so we start with the previous one. */
                for (c--; c >= 0; c--) {
+                       struct vgic_cpu *vgic_cpu;
+
+                       vcpu = kvm_get_vcpu(kvm, c);
+                       vgic_cpu = &vcpu->arch.vgic_cpu;
                        kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-                                                 &devices[c * 2].dev);
+                                                 &vgic_cpu->rd_iodev.dev);
                        kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-                                                 &devices[c * 2 + 1].dev);
+                                                 &vgic_cpu->sgi_iodev.dev);
                }
-               kfree(devices);
-       } else {
-               kvm->arch.vgic.redist_iodevs = devices;
        }
 
        return ret;
@@ -451,5 +643,6 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
                irq->pending = true;
 
                vgic_queue_irq_unlock(vcpu->kvm, irq);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
index 9f6fab7..3bad3c5 100644 (file)
@@ -56,6 +56,8 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
 
                if (irq->enabled)
                        value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return value;
@@ -74,6 +76,8 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
                spin_lock(&irq->irq_lock);
                irq->enabled = true;
                vgic_queue_irq_unlock(vcpu->kvm, irq);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -92,6 +96,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
                irq->enabled = false;
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -108,6 +113,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
 
                if (irq->pending)
                        value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return value;
@@ -129,6 +136,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
                        irq->soft_pending = true;
 
                vgic_queue_irq_unlock(vcpu->kvm, irq);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -152,6 +160,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
                }
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -168,6 +177,8 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
 
                if (irq->active)
                        value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return value;
@@ -242,6 +253,7 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
        for_each_set_bit(i, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
                vgic_mmio_change_active(vcpu, irq, false);
+               vgic_put_irq(vcpu->kvm, irq);
        }
        vgic_change_active_finish(vcpu, intid);
 }
@@ -257,6 +269,7 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
        for_each_set_bit(i, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
                vgic_mmio_change_active(vcpu, irq, true);
+               vgic_put_irq(vcpu->kvm, irq);
        }
        vgic_change_active_finish(vcpu, intid);
 }
@@ -272,6 +285,8 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
                val |= (u64)irq->priority << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return val;
@@ -298,6 +313,8 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
                /* Narrow the priority range to what we actually support */
                irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
                spin_unlock(&irq->irq_lock);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -313,6 +330,8 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
 
                if (irq->config == VGIC_CONFIG_EDGE)
                        value |= (2U << (i * 2));
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return value;
@@ -326,7 +345,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
        int i;
 
        for (i = 0; i < len * 4; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               struct vgic_irq *irq;
 
                /*
                 * The configuration cannot be changed for SGIs in general,
@@ -337,14 +356,18 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
                if (intid + i < VGIC_NR_PRIVATE_IRQS)
                        continue;
 
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
                spin_lock(&irq->irq_lock);
+
                if (test_bit(i * 2 + 1, &val)) {
                        irq->config = VGIC_CONFIG_EDGE;
                } else {
                        irq->config = VGIC_CONFIG_LEVEL;
                        irq->pending = irq->line_level | irq->soft_pending;
                }
+
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -450,8 +473,7 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 {
        struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
-       struct kvm_vcpu *r_vcpu;
-       unsigned long data;
+       unsigned long data = 0;
 
        region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
                                       addr - iodev->base_addr);
@@ -460,8 +482,21 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                return 0;
        }
 
-       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-       data = region->read(r_vcpu, addr, len);
+       switch (iodev->iodev_type) {
+       case IODEV_CPUIF:
+               data = region->read(vcpu, addr, len);
+               break;
+       case IODEV_DIST:
+               data = region->read(vcpu, addr, len);
+               break;
+       case IODEV_REDIST:
+               data = region->read(iodev->redist_vcpu, addr, len);
+               break;
+       case IODEV_ITS:
+               data = region->its_read(vcpu->kvm, iodev->its, addr, len);
+               break;
+       }
+
        vgic_data_host_to_mmio_bus(val, len, data);
        return 0;
 }
@@ -471,7 +506,6 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 {
        struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
-       struct kvm_vcpu *r_vcpu;
        unsigned long data = vgic_data_mmio_bus_to_host(val, len);
 
        region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
@@ -482,8 +516,21 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
        if (!check_region(region, addr, len))
                return 0;
 
-       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-       region->write(r_vcpu, addr, len, data);
+       switch (iodev->iodev_type) {
+       case IODEV_CPUIF:
+               region->write(vcpu, addr, len, data);
+               break;
+       case IODEV_DIST:
+               region->write(vcpu, addr, len, data);
+               break;
+       case IODEV_REDIST:
+               region->write(iodev->redist_vcpu, addr, len, data);
+               break;
+       case IODEV_ITS:
+               region->its_write(vcpu->kvm, iodev->its, addr, len, data);
+               break;
+       }
+
        return 0;
 }
 
@@ -513,6 +560,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
        }
 
        io_device->base_addr = dist_base_address;
+       io_device->iodev_type = IODEV_DIST;
        io_device->redist_vcpu = NULL;
 
        mutex_lock(&kvm->slots_lock);
index 8509014..0b3ecf9 100644 (file)
@@ -21,10 +21,19 @@ struct vgic_register_region {
        unsigned int len;
        unsigned int bits_per_irq;
        unsigned int access_flags;
-       unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
-                             unsigned int len);
-       void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
-                     unsigned long val);
+       union {
+               unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+                                     unsigned int len);
+               unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
+                                         gpa_t addr, unsigned int len);
+       };
+       union {
+               void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
+                             unsigned int len, unsigned long val);
+               void (*its_write)(struct kvm *kvm, struct vgic_its *its,
+                                 gpa_t addr, unsigned int len,
+                                 unsigned long val);
+       };
 };
 
 extern struct kvm_io_device_ops kvm_io_gic_ops;
@@ -87,6 +96,12 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
 void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
                                unsigned long data);
 
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+                           unsigned int num);
+
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+                    unsigned long val);
+
 unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
                                 gpa_t addr, unsigned int len);
 
@@ -147,4 +162,12 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
 
 unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
 
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+u64 vgic_sanitise_outer_cacheability(u64 reg);
+u64 vgic_sanitise_inner_cacheability(u64 reg);
+u64 vgic_sanitise_shareability(u64 reg);
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+                       u64 (*sanitise_fn)(u64));
+#endif
+
 #endif
index e31405e..0bf6709 100644 (file)
@@ -124,6 +124,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
                }
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -332,20 +333,25 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
        vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
        kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
 
+       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+       if (ret) {
+               kvm_err("Cannot register GICv2 KVM device\n");
+               iounmap(kvm_vgic_global_state.vctrl_base);
+               return ret;
+       }
+
        ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
                                     kvm_vgic_global_state.vctrl_base +
                                         resource_size(&info->vctrl),
                                     info->vctrl.start);
-
        if (ret) {
                kvm_err("Cannot map VCTRL into hyp\n");
+               kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
                iounmap(kvm_vgic_global_state.vctrl_base);
                return ret;
        }
 
        kvm_vgic_global_state.can_emulate_gicv2 = true;
-       kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-
        kvm_vgic_global_state.vcpu_base = info->vcpu.start;
        kvm_vgic_global_state.type = VGIC_V2;
        kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
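
The probe rework above registers the KVM device first and, if the later hyp mapping fails, unregisters it again before bailing out. A small sketch of that register-then-unwind ordering follows; register_device(), map_region() and probe() are purely illustrative stubs, not kernel functions.

/*
 * Register, then map, and unwind the registration if the map fails.
 * The stubs always succeed here; in the real code either step can fail.
 */
static int register_device(void) { return 0; }
static void unregister_device(void) { }
static int map_region(void) { return 0; }

static int probe(void)
{
        int ret;

        ret = register_device();
        if (ret)
                return ret;             /* nothing to undo yet */

        ret = map_region();
        if (ret) {
                unregister_device();    /* undo the earlier step */
                return ret;
        }
        return 0;
}
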
index 346b4ad..0506543 100644 (file)
@@ -81,6 +81,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
                else
                        intid = val & GICH_LR_VIRTUALID;
                irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+               if (!irq)       /* An LPI could have been unmapped. */
+                       continue;
 
                spin_lock(&irq->irq_lock);
 
@@ -113,6 +115,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
                }
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -190,6 +193,11 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
        vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
 }
 
+#define INITIAL_PENDBASER_VALUE                                                  \
+       (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)            | \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)      | \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
+
 void vgic_v3_enable(struct kvm_vcpu *vcpu)
 {
        struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -207,10 +215,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
         * way, so we force SRE to 1 to demonstrate this to the guest.
         * This goes with the spec allowing the value to be RAO/WI.
         */
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
                vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
-       else
+               vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
+       } else {
                vgic_v3->vgic_sre = 0;
+       }
 
        /* Get the show on the road... */
        vgic_v3->vgic_hcr = ICH_HCR_EN;
@@ -296,6 +306,7 @@ out:
 int vgic_v3_probe(const struct gic_kvm_info *info)
 {
        u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+       int ret;
 
        /*
         * The ListRegs field is 5 bits, but there is an architectural
@@ -319,12 +330,22 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
        } else {
                kvm_vgic_global_state.vcpu_base = info->vcpu.start;
                kvm_vgic_global_state.can_emulate_gicv2 = true;
-               kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+               ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+               if (ret) {
+                       kvm_err("Cannot register GICv2 KVM device.\n");
+                       return ret;
+               }
                kvm_info("vgic-v2@%llx\n", info->vcpu.start);
        }
+       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+       if (ret) {
+               kvm_err("Cannot register GICv3 KVM device.\n");
+               kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
+               return ret;
+       }
+
        if (kvm_vgic_global_state.vcpu_base == 0)
                kvm_info("disabling GICv2 emulation\n");
-       kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
 
        kvm_vgic_global_state.vctrl_base = NULL;
        kvm_vgic_global_state.type = VGIC_V3;
index 69b61ab..39f3358 100644 (file)
@@ -33,10 +33,17 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
 
 /*
  * Locking order is always:
- *   vgic_cpu->ap_list_lock
- *     vgic_irq->irq_lock
+ * its->cmd_lock (mutex)
+ *   its->its_lock (mutex)
+ *     vgic_cpu->ap_list_lock
+ *       kvm->lpi_list_lock
+ *         vgic_irq->irq_lock
  *
- * (that is, always take the ap_list_lock before the struct vgic_irq lock).
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower ranking lock first and re-acquire it after having
+ * taken the upper one.
  *
  * When taking more than one ap_list_lock at the same time, always take the
  * lowest numbered VCPU's ap_list_lock first, so:
@@ -45,6 +52,41 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
  *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
  */
 
+/*
+ * Iterate over the VM's list of mapped LPIs to find the one with a
+ * matching interrupt ID and return a reference to the IRQ structure.
+ */
+static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq = NULL;
+
+       spin_lock(&dist->lpi_list_lock);
+
+       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+               if (irq->intid != intid)
+                       continue;
+
+               /*
+                * This increases the refcount, the caller is expected to
+                * call vgic_put_irq() later once it's finished with the IRQ.
+                */
+               vgic_get_irq_kref(irq);
+               goto out_unlock;
+       }
+       irq = NULL;
+
+out_unlock:
+       spin_unlock(&dist->lpi_list_lock);
+
+       return irq;
+}
+
+/*
+ * This looks up the virtual interrupt ID to get the corresponding
+ * struct vgic_irq. It also increases the refcount, so any caller is expected
+ * to call vgic_put_irq() once it's finished with this IRQ.
+ */
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
                              u32 intid)
 {
@@ -56,14 +98,43 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
        if (intid <= VGIC_MAX_SPI)
                return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
 
-       /* LPIs are not yet covered */
+       /* LPIs */
        if (intid >= VGIC_MIN_LPI)
-               return NULL;
+               return vgic_get_lpi(kvm, intid);
 
        WARN(1, "Looking up struct vgic_irq for reserved INTID");
        return NULL;
 }
 
+/*
+ * We can't do anything in here, because we lack the kvm pointer to
+ * lock and remove the item from the lpi_list. So we keep this function
+ * empty and use the return value of kref_put() to trigger the freeing.
+ */
+static void vgic_irq_release(struct kref *ref)
+{
+}
+
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+       struct vgic_dist *dist;
+
+       if (irq->intid < VGIC_MIN_LPI)
+               return;
+
+       if (!kref_put(&irq->refcount, vgic_irq_release))
+               return;
+
+       dist = &kvm->arch.vgic;
+
+       spin_lock(&dist->lpi_list_lock);
+       list_del(&irq->lpi_list);
+       dist->lpi_list_count--;
+       spin_unlock(&dist->lpi_list_lock);
+
+       kfree(irq);
+}
+
 /**
  * kvm_vgic_target_oracle - compute the target vcpu for an irq
  *
@@ -236,6 +307,11 @@ retry:
                goto retry;
        }
 
+       /*
+        * Grab a reference to the irq to reflect the fact that it is
+        * now in the ap_list.
+        */
+       vgic_get_irq_kref(irq);
        list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
        irq->vcpu = vcpu;
 
@@ -269,14 +345,17 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
        if (!irq)
                return -EINVAL;
 
-       if (irq->hw != mapped_irq)
+       if (irq->hw != mapped_irq) {
+               vgic_put_irq(kvm, irq);
                return -EINVAL;
+       }
 
        spin_lock(&irq->irq_lock);
 
        if (!vgic_validate_injection(irq, level)) {
                /* Nothing to see here, move along... */
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(kvm, irq);
                return 0;
        }
 
@@ -288,6 +367,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
        }
 
        vgic_queue_irq_unlock(kvm, irq);
+       vgic_put_irq(kvm, irq);
 
        return 0;
 }
@@ -330,25 +410,28 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
        irq->hwintid = phys_irq;
 
        spin_unlock(&irq->irq_lock);
+       vgic_put_irq(vcpu->kvm, irq);
 
        return 0;
 }
 
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
-       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
-
-       BUG_ON(!irq);
+       struct vgic_irq *irq;
 
        if (!vgic_initialized(vcpu->kvm))
                return -EAGAIN;
 
+       irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+       BUG_ON(!irq);
+
        spin_lock(&irq->irq_lock);
 
        irq->hw = false;
        irq->hwintid = 0;
 
        spin_unlock(&irq->irq_lock);
+       vgic_put_irq(vcpu->kvm, irq);
 
        return 0;
 }
@@ -386,6 +469,15 @@ retry:
                        list_del(&irq->ap_list);
                        irq->vcpu = NULL;
                        spin_unlock(&irq->irq_lock);
+
+                       /*
+                        * This vgic_put_irq call matches the
+                        * vgic_get_irq_kref in vgic_queue_irq_unlock,
+                        * where we added the LPI to the ap_list. As
+                        * we remove the irq from the list, we also
+                        * drop the refcount.
+                        */
+                       vgic_put_irq(vcpu->kvm, irq);
                        continue;
                }
 
@@ -614,6 +706,15 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
        spin_lock(&irq->irq_lock);
        map_is_active = irq->hw && irq->active;
        spin_unlock(&irq->irq_lock);
+       vgic_put_irq(vcpu->kvm, irq);
 
        return map_is_active;
 }
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+       if (vgic_has_its(kvm))
+               return vgic_its_inject_msi(kvm, msi);
+       else
+               return -ENODEV;
+}
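
The vgic.c changes above give LPIs a kref-based lifetime: vgic_get_irq()/vgic_get_lpi() take a reference under lpi_list_lock, and vgic_put_irq() unlinks and frees the entry once the last reference is dropped. Here is a standalone sketch of the same get/put pattern using C11 atomics and a pthread mutex in place of kref and spinlocks; all names (struct lpi, lpi_head, lpi_get, lpi_put) are illustrative, not kernel APIs.

/*
 * Userspace sketch of the LPI get/put lifetime: look an entry up under a
 * list lock and take a reference; free it when the last reference goes
 * away.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct lpi {
        unsigned int intid;
        atomic_int refcount;            /* plays the role of irq->refcount */
        struct lpi *next;               /* plays the role of lpi_list */
};

static struct lpi *lpi_head;            /* entries are added with refcount 1 */
static pthread_mutex_t lpi_lock = PTHREAD_MUTEX_INITIALIZER;

/* Look up an LPI and take a reference, as vgic_get_lpi() does above. */
static struct lpi *lpi_get(unsigned int intid)
{
        struct lpi *p;

        pthread_mutex_lock(&lpi_lock);
        for (p = lpi_head; p; p = p->next) {
                if (p->intid == intid) {
                        atomic_fetch_add(&p->refcount, 1);
                        break;
                }
        }
        pthread_mutex_unlock(&lpi_lock);
        return p;                       /* NULL if the LPI was never mapped */
}

/* Drop a reference; unlink and free the entry when it was the last one. */
static void lpi_put(struct lpi *lpi)
{
        struct lpi **pp;

        if (atomic_fetch_sub(&lpi->refcount, 1) != 1)
                return;                 /* other users still hold references */

        pthread_mutex_lock(&lpi_lock);
        for (pp = &lpi_head; *pp; pp = &(*pp)->next) {
                if (*pp == lpi) {
                        *pp = lpi->next;
                        break;
                }
        }
        pthread_mutex_unlock(&lpi_lock);
        free(lpi);
}

As in the kernel code, SGIs/PPIs/SPIs would not need this: only dynamically mapped interrupts (LPIs) can disappear while someone still holds a pointer to them.
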
index 7b300ca..1d8e21d 100644 (file)
@@ -25,6 +25,7 @@
 #define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
 
 #define INTERRUPT_ID_BITS_SPIS 10
+#define INTERRUPT_ID_BITS_ITS  16
 #define VGIC_PRI_BITS          5
 
 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
@@ -38,9 +39,13 @@ struct vgic_vmcr {
 
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
                              u32 intid);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
 void vgic_kick_vcpus(struct kvm *kvm);
 
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                     phys_addr_t addr, phys_addr_t alignment);
+
 void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
@@ -59,6 +64,14 @@ int vgic_v2_map_resources(struct kvm *kvm);
 int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
                             enum vgic_type);
 
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+       if (irq->intid < VGIC_MIN_LPI)
+               return;
+
+       kref_get(&irq->refcount);
+}
+
 #ifdef CONFIG_KVM_ARM_VGIC_V3
 void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
@@ -71,6 +84,10 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu);
 int vgic_v3_probe(const struct gic_kvm_info *info);
 int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
 #else
 static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
 {
@@ -122,9 +139,28 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm,
 {
        return -ENODEV;
 }
+
+static inline bool vgic_has_its(struct kvm *kvm)
+{
+       return false;
+}
+
+static inline int kvm_vgic_register_its_device(void)
+{
+       return -ENODEV;
+}
+
+static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+       return -ENODEV;
+}
 #endif
 
-void kvm_register_vgic_device(unsigned long type);
+int kvm_register_vgic_device(unsigned long type);
 int vgic_lazy_init(struct kvm *kvm);
 int vgic_init(struct kvm *kvm);
 
index 8db197b..df99e9c 100644 (file)
@@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm)
        free_irq_routing_table(rt);
 }
 
-static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+static int setup_routing_entry(struct kvm *kvm,
+                              struct kvm_irq_routing_table *rt,
                               struct kvm_kernel_irq_routing_entry *e,
                               const struct kvm_irq_routing_entry *ue)
 {
@@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
 
        e->gsi = ue->gsi;
        e->type = ue->type;
-       r = kvm_set_routing_entry(e, ue);
+       r = kvm_set_routing_entry(kvm, e, ue);
        if (r)
                goto out;
        if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
@@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        kfree(e);
                        goto out;
                }
-               r = setup_routing_entry(new, e, ue);
+               r = setup_routing_entry(kvm, new, e, ue);
                if (r) {
                        kfree(e);
                        goto out;
index 2e79136..cc081cc 100644 (file)
@@ -1444,6 +1444,52 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
        return true;
 }
 
+static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+                              unsigned long addr, bool *async,
+                              bool write_fault, kvm_pfn_t *p_pfn)
+{
+       unsigned long pfn;
+       int r;
+
+       r = follow_pfn(vma, addr, &pfn);
+       if (r) {
+               /*
+                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+                * not call the fault handler, so do it here.
+                */
+               bool unlocked = false;
+               r = fixup_user_fault(current, current->mm, addr,
+                                    (write_fault ? FAULT_FLAG_WRITE : 0),
+                                    &unlocked);
+               if (unlocked)
+                       return -EAGAIN;
+               if (r)
+                       return r;
+
+               r = follow_pfn(vma, addr, &pfn);
+               if (r)
+                       return r;
+
+       }
+
+
+       /*
+        * Get a reference here because callers of *hva_to_pfn* and
+        * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+        * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+        * simply do nothing for reserved pfns.
+        *
+        * Whoever called remap_pfn_range is also going to call e.g.
+        * unmap_mapping_range before the underlying pages are freed,
+        * causing a call to our MMU notifier.
+        */
+       kvm_get_pfn(pfn);
+
+       *p_pfn = pfn;
+       return 0;
+}
+
 /*
  * Pin guest page in memory and return its pfn.
  * @addr: host virtual address which maps memory to the guest
@@ -1463,7 +1509,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 {
        struct vm_area_struct *vma;
        kvm_pfn_t pfn = 0;
-       int npages;
+       int npages, r;
 
        /* we can do it either atomically or asynchronously, not both */
        BUG_ON(atomic && async);
@@ -1485,14 +1531,17 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
                goto exit;
        }
 
+retry:
        vma = find_vma_intersection(current->mm, addr, addr + 1);
 
        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
-       else if ((vma->vm_flags & VM_PFNMAP)) {
-               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-                       vma->vm_pgoff;
-               BUG_ON(!kvm_is_reserved_pfn(pfn));
+       else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+               r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+               if (r == -EAGAIN)
+                       goto retry;
+               if (r < 0)
+                       pfn = KVM_PFN_ERR_FAULT;
        } else {
                if (async && vma_is_valid(vma, write_fault))
                        *async = true;
@@ -2348,9 +2397,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (id >= KVM_MAX_VCPU_ID)
                return -EINVAL;
 
+       mutex_lock(&kvm->lock);
+       if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+               mutex_unlock(&kvm->lock);
+               return -EINVAL;
+       }
+
+       kvm->created_vcpus++;
+       mutex_unlock(&kvm->lock);
+
        vcpu = kvm_arch_vcpu_create(kvm, id);
-       if (IS_ERR(vcpu))
-               return PTR_ERR(vcpu);
+       if (IS_ERR(vcpu)) {
+               r = PTR_ERR(vcpu);
+               goto vcpu_decrement;
+       }
 
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 
@@ -2359,14 +2419,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
                goto vcpu_destroy;
 
        mutex_lock(&kvm->lock);
-       if (!kvm_vcpu_compatible(vcpu)) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
-       if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
@@ -2399,6 +2451,10 @@ unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
 vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
+vcpu_decrement:
+       mutex_lock(&kvm->lock);
+       kvm->created_vcpus--;
+       mutex_unlock(&kvm->lock);
        return r;
 }
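
The kvm_vm_ioctl_create_vcpu() change above reserves a created_vcpus slot under kvm->lock before the slow, sleeping architecture-specific create, and gives the slot back on failure, so concurrent creators can no longer race past KVM_MAX_VCPUS. A minimal sketch of that reserve-then-roll-back pattern with a plain mutex-protected counter; struct vm, MAX_VCPUS and slow_create() are hypothetical stand-ins, not KVM API.

/*
 * Reserve a slot first, do the expensive work unlocked, roll back the
 * reservation if the work fails.
 */
#include <errno.h>
#include <pthread.h>

#define MAX_VCPUS 8

struct vm {
        pthread_mutex_t lock;           /* assume pthread_mutex_init() ran */
        int created_vcpus;              /* reserved slots, incl. in-flight creates */
};

/* Stands in for the slow, sleeping architecture-specific vcpu creation. */
static int slow_create(struct vm *vm, int id) { (void)vm; (void)id; return 0; }

static int create_vcpu(struct vm *vm, int id)
{
        int r;

        /* Reserve a slot up front so concurrent creators cannot overshoot. */
        pthread_mutex_lock(&vm->lock);
        if (vm->created_vcpus == MAX_VCPUS) {
                pthread_mutex_unlock(&vm->lock);
                return -EINVAL;
        }
        vm->created_vcpus++;
        pthread_mutex_unlock(&vm->lock);

        /* Do the expensive work outside the lock. */
        r = slow_create(vm, id);
        if (r) {
                /* Creation failed: give the reserved slot back. */
                pthread_mutex_lock(&vm->lock);
                vm->created_vcpus--;
                pthread_mutex_unlock(&vm->lock);
        }
        return r;
}
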
 
@@ -3487,6 +3543,30 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
        return r;
 }
 
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                                        gpa_t addr)
+{
+       struct kvm_io_bus *bus;
+       int dev_idx, srcu_idx;
+       struct kvm_io_device *iodev = NULL;
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+
+       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+
+       dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
+       if (dev_idx < 0)
+               goto out_unlock;
+
+       iodev = bus->range[dev_idx].dev;
+
+out_unlock:
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+       return iodev;
+}
+EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+
 static int kvm_debugfs_open(struct inode *inode, struct file *file,
                           int (*get)(void *, u64 *), int (*set)(void *, u64),
                           const char *fmt)