Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/ide
author		Linus Torvalds <torvalds@linux-foundation.org>
		Fri, 15 Nov 2013 05:17:43 +0000 (14:17 +0900)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Fri, 15 Nov 2013 05:17:43 +0000 (14:17 +0900)
Pull IDE updates from David Miller:
 "Just some minor cleanups and simplifications"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/ide:
  ide: pmac: remove unnecessary pci_set_drvdata()
  ide: cs5536: use module_pci_driver()
  ide: pmac: Remove casting the return value which is a void pointer
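
The cs5536 change above is the standard module_pci_driver() conversion. As
a rough sketch (the IDs and callbacks below are hypothetical stand-ins, not
the actual cs5536 code), the helper collapses an init/exit pair that only
registers and unregisters a pci_driver:

/*
 * Minimal sketch of a module_pci_driver() conversion; "example" names
 * and the PCI IDs are illustrative, not the real cs5536 driver.
 */
#include <linux/module.h>
#include <linux/pci.h>

static const struct pci_device_id example_ids[] = {
	{ PCI_DEVICE(0x1234, 0x5678) },		/* hypothetical IDs */
	{ }
};
MODULE_DEVICE_TABLE(pci, example_ids);

static int example_probe(struct pci_dev *pdev,
			 const struct pci_device_id *id)
{
	return 0;		/* a real driver enables and sets up the device */
}

static void example_remove(struct pci_dev *pdev)
{
}

static struct pci_driver example_driver = {
	.name		= "example",
	.id_table	= example_ids,
	.probe		= example_probe,
	.remove		= example_remove,
};

/*
 * Expands to the module_init()/module_exit() pair that used to be
 * written by hand around pci_register_driver()/pci_unregister_driver().
 */
module_pci_driver(example_driver);

MODULE_LICENSE("GPL");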

497 files changed:
Documentation/virtual/kvm/00-INDEX [new file with mode: 0644]
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/cpuid.txt
Documentation/virtual/kvm/devices/vfio.txt [new file with mode: 0644]
Documentation/virtual/kvm/locking.txt
Documentation/vm/split_page_table_lock [new file with mode: 0644]
MAINTAINERS
arch/Kconfig
arch/alpha/Kconfig
arch/alpha/include/asm/pgalloc.h
arch/arc/Kconfig
arch/arc/include/asm/pgalloc.h
arch/arm/Kconfig
arch/arm/include/asm/dma-mapping.h
arch/arm/include/asm/io.h
arch/arm/include/asm/kvm_arm.h
arch/arm/include/asm/kvm_asm.h
arch/arm/include/asm/kvm_emulate.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/include/asm/pgalloc.h
arch/arm/include/asm/pgtable-3level.h
arch/arm/include/asm/xen/hypervisor.h
arch/arm/include/asm/xen/page-coherent.h [new file with mode: 0644]
arch/arm/include/asm/xen/page.h
arch/arm/include/uapi/asm/kvm.h
arch/arm/kvm/Kconfig
arch/arm/kvm/Makefile
arch/arm/kvm/arm.c
arch/arm/kvm/coproc.c
arch/arm/kvm/coproc_a15.c
arch/arm/kvm/coproc_a7.c [new file with mode: 0644]
arch/arm/kvm/emulate.c
arch/arm/kvm/guest.c
arch/arm/kvm/handle_exit.c
arch/arm/kvm/mmio.c
arch/arm/kvm/mmu.c
arch/arm/kvm/psci.c
arch/arm/kvm/reset.c
arch/arm/mach-tegra/apbio.c
arch/arm/mm/fault-armv.c
arch/arm/xen/Makefile
arch/arm/xen/mm.c [new file with mode: 0644]
arch/arm/xen/p2m.c [new file with mode: 0644]
arch/arm64/Kconfig
arch/arm64/include/asm/dma-mapping.h
arch/arm64/include/asm/io.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_emulate.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/pgalloc.h
arch/arm64/include/asm/pgtable-hwdef.h
arch/arm64/include/asm/xen/page-coherent.h [new file with mode: 0644]
arch/arm64/kvm/Kconfig
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/xen/Makefile
arch/avr32/include/asm/pgalloc.h
arch/blackfin/Kconfig
arch/cris/include/asm/pgalloc.h
arch/frv/mm/pgalloc.c
arch/hexagon/Kconfig
arch/hexagon/include/asm/pgalloc.h
arch/ia64/Kconfig
arch/ia64/include/asm/kvm_host.h
arch/ia64/include/asm/pgalloc.h
arch/ia64/include/asm/xen/page-coherent.h [new file with mode: 0644]
arch/ia64/kvm/kvm-ia64.c
arch/m32r/Kconfig
arch/m32r/include/asm/pgalloc.h
arch/m68k/include/asm/mcf_pgalloc.h
arch/m68k/include/asm/motorola_pgalloc.h
arch/m68k/include/asm/sun3_pgalloc.h
arch/metag/Kconfig
arch/metag/include/asm/pgalloc.h
arch/microblaze/include/asm/pgalloc.h
arch/mips/Kconfig
arch/mips/include/asm/kvm_host.h
arch/mips/include/asm/pgalloc.h
arch/mips/kvm/kvm_mips.c
arch/mn10300/Kconfig
arch/mn10300/include/asm/pgalloc.h
arch/mn10300/mm/pgtable.c
arch/openrisc/include/asm/pgalloc.h
arch/parisc/Kconfig
arch/parisc/include/asm/pgalloc.h
arch/powerpc/Kconfig
arch/powerpc/include/asm/disassemble.h
arch/powerpc/include/asm/exception-64s.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_32.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/paca.h
arch/powerpc/include/asm/pgalloc-64.h
arch/powerpc/include/asm/processor.h
arch/powerpc/include/asm/pte-book3e.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/uapi/asm/kvm.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/idle_power7.S
arch/powerpc/kernel/traps.c
arch/powerpc/kvm/44x.c
arch/powerpc/kvm/44x_emulate.c
arch/powerpc/kvm/44x_tlb.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s.h [new file with mode: 0644]
arch/powerpc/kvm/book3s_32_mmu.c
arch/powerpc/kvm/book3s_32_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu.c
arch/powerpc/kvm/book3s_64_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_exports.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_interrupts.S
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_interrupts.S
arch/powerpc/kvm/book3s_mmu_hpte.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_pr_papr.c
arch/powerpc/kvm/book3s_rmhandlers.S
arch/powerpc/kvm/book3s_rtas.c
arch/powerpc/kvm/book3s_segment.S
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/booke.h
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/e500.h
arch/powerpc/kvm/e500_emulate.c
arch/powerpc/kvm/e500_mmu.c
arch/powerpc/kvm/e500_mmu_host.c
arch/powerpc/kvm/e500mc.c
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/trace.h
arch/powerpc/kvm/trace_booke.h [new file with mode: 0644]
arch/powerpc/kvm/trace_pr.h [new file with mode: 0644]
arch/powerpc/mm/pgtable_32.c
arch/powerpc/mm/pgtable_64.c
arch/powerpc/platforms/powermac/low_i2c.c
arch/powerpc/platforms/pseries/suspend.c
arch/s390/Kconfig
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/mm/pgtable.c
arch/score/include/asm/pgalloc.h
arch/sh/Kconfig
arch/sh/include/asm/pgalloc.h
arch/sparc/Kconfig
arch/sparc/include/asm/mmu_64.h
arch/sparc/include/asm/page_64.h
arch/sparc/include/asm/pgtable_64.h
arch/sparc/include/asm/sparsemem.h
arch/sparc/include/asm/thread_info_64.h
arch/sparc/include/asm/tsb.h
arch/sparc/kernel/entry.h
arch/sparc/kernel/kgdb_64.c
arch/sparc/kernel/kprobes.c
arch/sparc/kernel/ktlb.S
arch/sparc/kernel/pci.c
arch/sparc/kernel/process_64.c
arch/sparc/kernel/ptrace_64.c
arch/sparc/kernel/rtrap_64.S
arch/sparc/kernel/signal_64.c
arch/sparc/kernel/smp_64.c
arch/sparc/kernel/sun4v_tlb_miss.S
arch/sparc/kernel/sys_sparc_64.c
arch/sparc/kernel/syscalls.S
arch/sparc/kernel/traps_64.c
arch/sparc/kernel/tsb.S
arch/sparc/kernel/unaligned_64.c
arch/sparc/kernel/vmlinux.lds.S
arch/sparc/lib/clear_page.S
arch/sparc/lib/copy_page.S
arch/sparc/mm/fault_64.c
arch/sparc/mm/gup.c
arch/sparc/mm/hugetlbpage.c
arch/sparc/mm/init_64.c
arch/sparc/mm/init_64.h
arch/sparc/mm/srmmu.c
arch/sparc/mm/tlb.c
arch/sparc/mm/tsb.c
arch/sparc/mm/ultra.S
arch/tile/Kconfig
arch/tile/mm/pgtable.c
arch/um/kernel/mem.c
arch/unicore32/include/asm/pgalloc.h
arch/x86/Kconfig
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/pgalloc.h
arch/x86/include/asm/pvclock.h
arch/x86/include/asm/xen/page-coherent.h [new file with mode: 0644]
arch/x86/include/uapi/asm/kvm.h
arch/x86/include/uapi/asm/msr-index.h
arch/x86/kernel/kvmclock.c
arch/x86/kernel/pvclock.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/mm/pgtable.c
arch/x86/xen/mmu.c
arch/x86/xen/p2m.c
arch/x86/xen/pci-swiotlb-xen.c
arch/x86/xen/setup.c
arch/x86/xen/smp.c
arch/x86/xen/spinlock.c
arch/x86/xen/time.c
arch/xtensa/include/asm/pgalloc.h
arch/xtensa/include/asm/pgtable.h
arch/xtensa/mm/mmu.c
block/blk-mq.c
block/blk-softirq.c
block/blk-sysfs.c
crypto/af_alg.c
crypto/tcrypt.c
crypto/testmgr.c
drivers/ata/libata-eh.c
drivers/base/power/main.c
drivers/block/amiflop.c
drivers/block/cciss.c
drivers/block/virtio_blk.c
drivers/char/hw_random/timeriomem-rng.c
drivers/char/hw_random/virtio-rng.c
drivers/char/virtio_console.c
drivers/crypto/tegra-aes.c
drivers/firewire/core-transaction.c
drivers/gpu/drm/drm_flip_work.c
drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c
drivers/hid/hid-wiimote.h
drivers/hwmon/jz4740-hwmon.c
drivers/i2c/busses/i2c-at91.c
drivers/i2c/busses/i2c-bcm2835.c
drivers/i2c/busses/i2c-davinci.c
drivers/i2c/busses/i2c-designware-core.c
drivers/i2c/busses/i2c-ismt.c
drivers/i2c/busses/i2c-mxs.c
drivers/i2c/busses/i2c-omap.c
drivers/i2c/busses/i2c-tegra.c
drivers/i2c/busses/i2c-wmt.c
drivers/iio/adc/ad_sigma_delta.c
drivers/iio/adc/nau7802.c
drivers/iio/industrialio-event.c
drivers/input/touchscreen/cyttsp_core.c
drivers/iommu/Kconfig
drivers/iommu/Makefile
drivers/iommu/arm-smmu.c
drivers/iommu/dmar.c
drivers/iommu/intel-iommu.c
drivers/iommu/intel_irq_remapping.c
drivers/iommu/iommu-traces.c [new file with mode: 0644]
drivers/iommu/iommu.c
drivers/iommu/tegra-gart.c
drivers/iommu/tegra-smmu.c
drivers/lguest/lguest_device.c
drivers/lguest/x86/core.c
drivers/md/dm-crypt.c
drivers/md/raid5.c
drivers/media/platform/blackfin/bfin_capture.c
drivers/media/radio/radio-wl1273.c
drivers/media/radio/si470x/radio-si470x-common.c
drivers/media/rc/iguanair.c
drivers/memstick/core/memstick.c
drivers/memstick/core/ms_block.c
drivers/memstick/core/ms_block.h
drivers/memstick/host/r592.c
drivers/misc/mic/card/mic_virtio.c
drivers/misc/mic/host/mic_boot.c
drivers/misc/ti-st/st_kim.c
drivers/mtd/nand/mxc_nand.c
drivers/mtd/nand/r852.c
drivers/mtd/onenand/omap2.c
drivers/net/caif/caif_virtio.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
drivers/net/ieee802154/at86rf230.c
drivers/net/ieee802154/mrf24j40.c
drivers/net/virtio_net.c
drivers/net/wireless/ath/ath10k/htc.c
drivers/net/wireless/ath/ath10k/mac.c
drivers/net/wireless/ath/carl9170/usb.c
drivers/net/wireless/ath/wil6210/main.c
drivers/net/wireless/brcm80211/brcmfmac/p2p.c
drivers/net/wireless/rt2x00/rt2800mmio.c
drivers/net/wireless/rt2x00/rt2800usb.c
drivers/net/wireless/zd1211rw/zd_usb.c
drivers/parport/parport_ip32.c
drivers/pci/pcie/aer/aerdrv_core.c
drivers/platform/x86/apple-gmux.c
drivers/power/ab8500_fg.c
drivers/power/jz4740-battery.c
drivers/remoteproc/remoteproc_virtio.c
drivers/rtc/rtc-hid-sensor-time.c
drivers/s390/kvm/kvm_virtio.c
drivers/s390/kvm/virtio_ccw.c
drivers/scsi/virtio_scsi.c
drivers/spi/spi-bcm2835.c
drivers/spi/spi-clps711x.c
drivers/spi/spi-davinci.c
drivers/spi/spi-fsl-espi.c
drivers/spi/spi-fsl-spi.c
drivers/spi/spi-mpc512x-psc.c
drivers/spi/spi-mxs.c
drivers/spi/spi-s3c64xx.c
drivers/spi/spi-sh-msiof.c
drivers/spi/spi-sirf.c
drivers/spi/spi-tegra114.c
drivers/spi/spi-tegra20-sflash.c
drivers/spi/spi-tegra20-slink.c
drivers/spi/spi-xilinx.c
drivers/spi/spi.c
drivers/staging/iio/adc/mxs-lradc.c
drivers/staging/media/solo6x10/solo6x10-p2m.c
drivers/staging/tidspbridge/core/sync.c
drivers/staging/tidspbridge/include/dspbridge/sync.h
drivers/staging/tidspbridge/rmgr/drv_interface.c
drivers/tty/hvc/hvc_xen.c
drivers/tty/metag_da.c
drivers/usb/c67x00/c67x00-sched.c
drivers/usb/gadget/f_fs.c
drivers/usb/serial/mos7720.c
drivers/video/exynos/exynos_mipi_dsi_common.c
drivers/video/omap2/displays-new/encoder-tpd12s015.c
drivers/virtio/virtio_balloon.c
drivers/virtio/virtio_mmio.c
drivers/virtio/virtio_pci.c
drivers/virtio/virtio_ring.c
drivers/w1/masters/w1-gpio.c
drivers/xen/Kconfig
drivers/xen/balloon.c
drivers/xen/evtchn.c
drivers/xen/grant-table.c
drivers/xen/pci.c
drivers/xen/platform-pci.c
drivers/xen/swiotlb-xen.c
fs/btrfs/Makefile
fs/btrfs/acl.c
fs/btrfs/async-thread.c
fs/btrfs/backref.c
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/compat.h [deleted file]
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/dev-replace.c
fs/btrfs/dir-item.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/export.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.h
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-cache.h
fs/btrfs/inode-item.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/print-tree.c
fs/btrfs/raid56.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/tests/btrfs-tests.c [new file with mode: 0644]
fs/btrfs/tests/btrfs-tests.h
fs/btrfs/tests/extent-buffer-tests.c [new file with mode: 0644]
fs/btrfs/tests/extent-io-tests.c [new file with mode: 0644]
fs/btrfs/tests/inode-tests.c [new file with mode: 0644]
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-defrag.c
fs/btrfs/tree-log.c
fs/btrfs/uuid-tree.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/ecryptfs/crypto.c
fs/hfsplus/xattr.c
fs/nfs/nfs4state.c
fs/ocfs2/dlmglue.c
fs/proc/consoles.c
fs/proc/meminfo.c
fs/proc/nommu.c
fs/proc/task_mmu.c
fs/proc/task_nommu.c
fs/seq_file.c
include/asm-generic/vmlinux.lds.h
include/linux/cmdline-parser.h
include/linux/completion.h
include/linux/export.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/interrupt.h
include/linux/iommu.h
include/linux/kfifo.h
include/linux/kvm_host.h
include/linux/llist.h
include/linux/lockref.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/module.h
include/linux/sched.h
include/linux/seq_file.h
include/linux/smp.h
include/linux/srcu.h
include/linux/swapops.h
include/linux/virtio.h
include/linux/virtio_config.h
include/linux/virtio_ring.h
include/trace/events/iommu.h [new file with mode: 0644]
include/trace/events/kvm.h
include/trace/events/swiotlb.h [new file with mode: 0644]
include/uapi/linux/kvm.h
include/uapi/linux/magic.h
include/xen/interface/physdev.h
include/xen/swiotlb-xen.h
include/xen/xen-ops.h
init/main.c
kernel/Kconfig.hz
kernel/bounds.c
kernel/fork.c
kernel/hung_task.c
kernel/module.c
kernel/smp.c
kernel/softirq.c
kernel/up.c
lib/Kconfig
lib/kfifo.c
lib/llist.c
lib/lockref.c
lib/swiotlb.c
lib/vsprintf.c
mm/Kconfig
mm/filemap.c
mm/huge_memory.c
mm/hugetlb.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/mmap.c
mm/oom_kill.c
mm/pgtable-generic.c
mm/rmap.c
net/9p/trans_virtio.c
net/ipv4/fib_trie.c
net/ipv4/ping.c
net/ipv4/tcp_ipv4.c
net/ipv4/udp.c
net/phonet/socket.c
net/sctp/objcnt.c
samples/kfifo/bytestream-example.c
samples/kfifo/dma-example.c
samples/kfifo/inttype-example.c
scripts/Makefile.modpost
scripts/mod/modpost.c
sound/core/memalloc.c
sound/firewire/dice.c
sound/soc/samsung/ac97.c
tools/virtio/virtio_test.c
tools/virtio/vringh_test.c
virt/kvm/Kconfig
virt/kvm/async_pf.c
virt/kvm/iommu.c
virt/kvm/kvm_main.c
virt/kvm/vfio.c [new file with mode: 0644]

diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX
new file mode 100644 (file)
index 0000000..641ec92
--- /dev/null
@@ -0,0 +1,24 @@
+00-INDEX
+       - this file.
+api.txt
+       - KVM userspace API.
+cpuid.txt
+       - KVM-specific cpuid leaves (x86).
+devices/
+       - KVM_CAP_DEVICE_CTRL userspace API.
+hypercalls.txt
+       - KVM hypercalls.
+locking.txt
+       - notes on KVM locks.
+mmu.txt
+       - the x86 kvm shadow mmu.
+msr.txt
+       - KVM-specific MSRs (x86).
+nested-vmx.txt
+       - notes on nested virtualization for Intel x86 processors.
+ppc-pv.txt
+       - the paravirtualization interface on PowerPC.
+review-checklist.txt
+       - review checklist for KVM patches.
+timekeeping.txt
+       - timekeeping virtualization for x86-based architectures.
index 858aecf..a30035d 100644 (file)
@@ -1122,9 +1122,9 @@ struct kvm_cpuid2 {
        struct kvm_cpuid_entry2 entries[0];
 };
 
-#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
-#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
-#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX                BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC           BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT         BIT(2)
 
 struct kvm_cpuid_entry2 {
        __u32 function;
@@ -1810,6 +1810,50 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_TLB3PS   | 32
   PPC   | KVM_REG_PPC_EPTCFG   | 32
   PPC   | KVM_REG_PPC_ICP_STATE | 64
+  PPC   | KVM_REG_PPC_TB_OFFSET        | 64
+  PPC   | KVM_REG_PPC_SPMC1    | 32
+  PPC   | KVM_REG_PPC_SPMC2    | 32
+  PPC   | KVM_REG_PPC_IAMR     | 64
+  PPC   | KVM_REG_PPC_TFHAR    | 64
+  PPC   | KVM_REG_PPC_TFIAR    | 64
+  PPC   | KVM_REG_PPC_TEXASR   | 64
+  PPC   | KVM_REG_PPC_FSCR     | 64
+  PPC   | KVM_REG_PPC_PSPB     | 32
+  PPC   | KVM_REG_PPC_EBBHR    | 64
+  PPC   | KVM_REG_PPC_EBBRR    | 64
+  PPC   | KVM_REG_PPC_BESCR    | 64
+  PPC   | KVM_REG_PPC_TAR      | 64
+  PPC   | KVM_REG_PPC_DPDES    | 64
+  PPC   | KVM_REG_PPC_DAWR     | 64
+  PPC   | KVM_REG_PPC_DAWRX    | 64
+  PPC   | KVM_REG_PPC_CIABR    | 64
+  PPC   | KVM_REG_PPC_IC       | 64
+  PPC   | KVM_REG_PPC_VTB      | 64
+  PPC   | KVM_REG_PPC_CSIGR    | 64
+  PPC   | KVM_REG_PPC_TACR     | 64
+  PPC   | KVM_REG_PPC_TCSCR    | 64
+  PPC   | KVM_REG_PPC_PID      | 64
+  PPC   | KVM_REG_PPC_ACOP     | 64
+  PPC   | KVM_REG_PPC_VRSAVE   | 32
+  PPC   | KVM_REG_PPC_LPCR     | 64
+  PPC   | KVM_REG_PPC_PPR      | 64
+  PPC   | KVM_REG_PPC_ARCH_COMPAT | 32
+  PPC   | KVM_REG_PPC_TM_GPR0  | 64
+          ...
+  PPC   | KVM_REG_PPC_TM_GPR31 | 64
+  PPC   | KVM_REG_PPC_TM_VSR0  | 128
+          ...
+  PPC   | KVM_REG_PPC_TM_VSR63 | 128
+  PPC   | KVM_REG_PPC_TM_CR    | 64
+  PPC   | KVM_REG_PPC_TM_LR    | 64
+  PPC   | KVM_REG_PPC_TM_CTR   | 64
+  PPC   | KVM_REG_PPC_TM_FPSCR | 64
+  PPC   | KVM_REG_PPC_TM_AMR   | 64
+  PPC   | KVM_REG_PPC_TM_PPR   | 64
+  PPC   | KVM_REG_PPC_TM_VRSAVE        | 64
+  PPC   | KVM_REG_PPC_TM_VSCR  | 32
+  PPC   | KVM_REG_PPC_TM_DSCR  | 64
+  PPC   | KVM_REG_PPC_TM_TAR   | 64
 
 ARM registers are mapped using the lower 32 bits.  The upper 16 bits of
 that are the register group type, or coprocessor number:
@@ -2304,7 +2348,31 @@ Possible features:
          Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only).
 
 
-4.83 KVM_GET_REG_LIST
+4.83 KVM_ARM_PREFERRED_TARGET
+
+Capability: basic
+Architectures: arm, arm64
+Type: vm ioctl
+Parameters: struct kvm_vcpu_init (out)
+Returns: 0 on success; -1 on error
+Errors:
+  ENODEV:    no preferred target available for the host
+
+This queries KVM for the preferred CPU target type that can be emulated
+by KVM on the underlying host.
+
+The ioctl returns a struct kvm_vcpu_init instance describing the
+preferred CPU target type and the recommended features for it.  The
+returned kvm_vcpu_init->features bitmap has a feature bit set if the
+preferred target recommends setting that feature, but doing so is not
+mandatory.
+
+The information returned by this ioctl can be used to prepare an instance
+of struct kvm_vcpu_init for the KVM_ARM_VCPU_INIT ioctl, which will result
+in a VCPU matching the underlying host.
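
A hedged userspace sketch of that flow (vm_fd and vcpu_fd are assumed to
come from the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence; error handling
is trimmed):

	struct kvm_vcpu_init init;

	/* Ask KVM which target this host emulates best. */
	if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init) < 0)
		return -1;	/* ENODEV: no preferred target available */

	/* Optionally adjust init.features[] before committing. */
	ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);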
+
+
+4.84 KVM_GET_REG_LIST
 
 Capability: basic
 Architectures: arm, arm64
@@ -2323,8 +2391,7 @@ struct kvm_reg_list {
 This ioctl returns the guest registers that are supported for the
 KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
 
-
-4.84 KVM_ARM_SET_DEVICE_ADDR
+4.85 KVM_ARM_SET_DEVICE_ADDR
 
 Capability: KVM_CAP_ARM_SET_DEVICE_ADDR
 Architectures: arm, arm64
@@ -2362,7 +2429,7 @@ must be called after calling KVM_CREATE_IRQCHIP, but before calling
 KVM_RUN on any of the VCPUs.  Calling this ioctl twice for any of the
 base addresses will return -EEXIST.
 
-4.85 KVM_PPC_RTAS_DEFINE_TOKEN
+4.86 KVM_PPC_RTAS_DEFINE_TOKEN
 
 Capability: KVM_CAP_PPC_RTAS
 Architectures: ppc
@@ -2661,6 +2728,77 @@ and usually define the validity of a groups of registers. (e.g. one bit
 };
 
 
+4.81 KVM_GET_EMULATED_CPUID
+
+Capability: KVM_CAP_EXT_EMUL_CPUID
+Architectures: x86
+Type: system ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+       __u32 nent;
+       __u32 flags;
+       struct kvm_cpuid_entry2 entries[0];
+};
+
+The member 'flags' is used for passing flags from userspace.
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX                BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC           BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT         BIT(2)
+
+struct kvm_cpuid_entry2 {
+       __u32 function;
+       __u32 index;
+       __u32 flags;
+       __u32 eax;
+       __u32 ebx;
+       __u32 ecx;
+       __u32 edx;
+       __u32 padding[3];
+};
+
+This ioctl returns the x86 cpuid features which are emulated by
+kvm.  Userspace can use the information returned by this ioctl to query
+which features are emulated by kvm instead of being present natively.
+
+Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2
+structure with the 'nent' field indicating the number of entries in
+the variable-size array 'entries'. If the number of entries is too low
+to describe the cpu capabilities, an error (E2BIG) is returned. If the
+number is too high, the 'nent' field is adjusted and an error (ENOMEM)
+is returned. If the number is just right, the 'nent' field is adjusted
+to the number of valid entries in the 'entries' array, which is then
+filled.
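
A minimal sketch of the nent negotiation described above (kvm_fd is an open
/dev/kvm descriptor; this is an illustration, not kernel code):

	struct kvm_cpuid2 *cpuid;
	int nent = 8;

	for (;;) {
		cpuid = calloc(1, sizeof(*cpuid) +
				  nent * sizeof(struct kvm_cpuid_entry2));
		cpuid->nent = nent;
		if (ioctl(kvm_fd, KVM_GET_EMULATED_CPUID, cpuid) == 0)
			break;			/* cpuid->nent holds the valid count */
		if (errno == E2BIG)
			nent *= 2;		/* too small: grow and retry */
		else if (errno == ENOMEM)
			nent = cpuid->nent;	/* too large: kernel adjusted nent */
		else
			break;			/* hard failure */
		free(cpuid);
	}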
+
+The entries returned are the set CPUID bits of the respective features
+which kvm emulates, as returned by the CPUID instruction, with unknown
+or unsupported feature bits cleared.
+
+Features like x2apic, for example, may not be present in the host cpu
+but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be
+emulated efficiently; such features are therefore not included here.
+
+The fields in each entry are defined as follows:
+
+  function: the eax value used to obtain the entry
+  index: the ecx value used to obtain the entry (for entries that are
+         affected by ecx)
+  flags: an OR of zero or more of the following:
+        KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
+           if the index field is valid
+        KVM_CPUID_FLAG_STATEFUL_FUNC:
+           if cpuid for this function returns different values for successive
+           invocations; there will be several entries with the same function,
+           all with this flag set
+        KVM_CPUID_FLAG_STATE_READ_NEXT:
+           for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
+           the first entry to be read by a cpu
+   eax, ebx, ecx, edx: the values returned by the cpuid instruction for
+         this function/index combination
+
+
 6. Capabilities that can be enabled
 -----------------------------------
 
index 22ff659..3c65feb 100644 (file)
@@ -43,6 +43,13 @@ KVM_FEATURE_CLOCKSOURCE2           ||     3 || kvmclock available at msrs
 KVM_FEATURE_ASYNC_PF               ||     4 || async pf can be enabled by
                                    ||       || writing to msr 0x4b564d02
 ------------------------------------------------------------------------------
+KVM_FEATURE_STEAL_TIME             ||     5 || steal time can be enabled by
+                                   ||       || writing to msr 0x4b564d03.
+------------------------------------------------------------------------------
+KVM_FEATURE_PV_EOI                 ||     6 || paravirtualized end of interrupt
+                                   ||       || handler can be enabled by writing
+                                   ||       || to msr 0x4b564d04.
+------------------------------------------------------------------------------
 KVM_FEATURE_PV_UNHALT              ||     7 || guest checks this feature bit
                                    ||       || before enabling paravirtualized
                                    ||       || spinlock support.
diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt
new file mode 100644 (file)
index 0000000..ef51740
--- /dev/null
@@ -0,0 +1,22 @@
+VFIO virtual device
+===================
+
+Device types supported:
+  KVM_DEV_TYPE_VFIO
+
+Only one VFIO instance may be created per VM.  The created device
+tracks the VFIO groups in use by the VM and the features of those
+groups important to the correctness and acceleration of the VM.  As
+groups are enabled and disabled for use by the VM, KVM should be
+updated about their presence.  When registered with KVM, a reference
+to the VFIO group is held by KVM.
+
+Groups:
+  KVM_DEV_VFIO_GROUP
+
+KVM_DEV_VFIO_GROUP attributes:
+  KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
+  KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking
+
+For each, kvm_device_attr.addr points to an int32_t file descriptor
+for the VFIO group.
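
A hedged sketch of the userspace sequence this implies, using the generic
KVM device API (vm_fd comes from KVM_CREATE_VM; the VFIO group path is a
hypothetical example; error handling omitted):

	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
	struct kvm_device_attr attr = { 0 };
	int32_t group_fd = open("/dev/vfio/26", O_RDWR); /* hypothetical group */

	ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);	/* cd.fd is the device fd */

	attr.group = KVM_DEV_VFIO_GROUP;
	attr.attr  = KVM_DEV_VFIO_GROUP_ADD;
	attr.addr  = (__u64)(unsigned long)&group_fd;
	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);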
index 41b7ac9..f886941 100644 (file)
@@ -132,10 +132,14 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
 ------------
 
 Name:          kvm_lock
-Type:          raw_spinlock
+Type:          spinlock_t
 Arch:          any
 Protects:      - vm_list
-               - hardware virtualization enable/disable
+
+Name:          kvm_count_lock
+Type:          raw_spinlock_t
+Arch:          any
+Protects:      - hardware virtualization enable/disable
 Comment:       'raw' because hardware enabling/disabling must be atomic
                with respect to migration.
 
@@ -151,3 +155,14 @@ Type:              spinlock_t
 Arch:          any
 Protects:      -shadow page/shadow tlb entry
 Comment:       it is a spinlock since it is used in mmu notifier.
+
+Name:          kvm->srcu
+Type:          srcu lock
+Arch:          any
+Protects:      - kvm->memslots
+               - kvm->buses
+Comment:       The srcu read lock must be held while accessing memslots (e.g.
+               when using gfn_to_* functions) and while accessing in-kernel
+               MMIO/PIO address->device structure mapping (kvm->buses).
+               The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu
+               if it is needed by multiple functions.
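
A minimal in-kernel sketch of the rule above, the pattern used around
gfn_to_* and kvm->buses accesses (kvm and gfn are assumed from context):

	int idx;
	unsigned long hva;

	idx = srcu_read_lock(&kvm->srcu);
	/* safe to use memslots here, e.g. via gfn_to_hva() */
	hva = gfn_to_hva(kvm, gfn);
	srcu_read_unlock(&kvm->srcu, idx);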
diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock
new file mode 100644 (file)
index 0000000..7521d36
--- /dev/null
@@ -0,0 +1,94 @@
+Split page table lock
+=====================
+
+Originally, the mm->page_table_lock spinlock protected all page tables of
+the mm_struct.  But this approach leads to poor page fault scalability in
+multi-threaded applications due to high contention on the lock.  To improve
+scalability, the split page table lock was introduced.
+
+With the split page table lock we have a separate per-table lock to
+serialize access to the table.  At the moment we use the split lock for PTE
+and PMD tables.  Access to higher-level tables is protected by
+mm->page_table_lock.
+
+There are helpers to lock/unlock a table and other accessor functions (a
+usage sketch follows the list):
+ - pte_offset_map_lock()
+       maps pte and takes PTE table lock, returns pointer to the taken
+       lock;
+ - pte_unmap_unlock()
+       unlocks and unmaps PTE table;
+ - pte_alloc_map_lock()
+       allocates PTE table if needed and take the lock, returns pointer
+       to taken lock or NULL if allocation failed;
+ - pte_lockptr()
+       returns pointer to PTE table lock;
+ - pmd_lock()
+       takes PMD table lock, returns pointer to taken lock;
+ - pmd_lockptr()
+       returns pointer to PMD table lock;
+
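A minimal usage sketch of the PTE helpers above (mm, pmd and addr are
assumed to come from the usual fault-handling context):

	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_none(*pte)) {
		/* the PTE may be examined or modified under the per-table lock */
	}
	pte_unmap_unlock(pte, ptl);
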
+Split page table lock for PTE tables is enabled at compile time if
+CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less than or equal to NR_CPUS.
+If the split lock is disabled, all tables are guarded by
+mm->page_table_lock.
+
+Split page table lock for PMD tables is enabled if it's enabled for PTE
+tables and the architecture supports it (see below).
+
+Hugetlb and split page table lock
+---------------------------------
+
+Hugetlb can support several page sizes.  We use the split lock only at the
+PMD level, not at the PUD level.
+
+Hugetlb-specific helpers:
+ - huge_pte_lock()
+       takes pmd split lock for PMD_SIZE page, mm->page_table_lock
+       otherwise;
+ - huge_pte_lockptr()
+       returns pointer to table lock;
+
+Support of split page table lock by an architecture
+---------------------------------------------------
+
+There's no need to specially enable the PTE split page table lock:
+everything required is done by pgtable_page_ctor() and pgtable_page_dtor(),
+which must be called on PTE table allocation / freeing.
+
+Make sure the architecture doesn't use the slab allocator for page table
+allocation: slab uses page->slab_cache and page->first_page for its pages.
+These fields share storage with page->ptl.
+
+PMD split lock only makes sense if you have more than two page table
+levels.
+
+Enabling the PMD split lock requires a pgtable_pmd_page_ctor() call on PMD
+table allocation and a pgtable_pmd_page_dtor() call on freeing.
+
+Allocation usually happens in pmd_alloc_one() and freeing in pmd_free(),
+but make sure you cover all PMD table allocation / freeing paths: e.g.,
+X86_PAE preallocates a few PMDs on pgd_alloc().
+
+With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
+
+NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- the
+failure must be handled properly.
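
The required handling is the pattern applied throughout the architecture
hunks later in this merge; roughly:

	struct page *pte;

	pte = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
	if (!pte)
		return NULL;
	if (!pgtable_page_ctor(pte)) {
		/* ctor failed (e.g. ptl allocation): undo the allocation */
		__free_page(pte);
		return NULL;
	}
	return pte;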
+
+page->ptl
+---------
+
+page->ptl is used to access the split page table lock, where 'page' is the
+struct page of the page containing the table.  It shares storage with
+page->private (and a few other fields in the union).
+
+To avoid increasing the size of struct page and to get the best
+performance, we use a trick:
+ - if spinlock_t fits into long, we use page->ptl as the spinlock itself,
+   so we can avoid the indirect access and save a cache line;
+ - if spinlock_t is bigger than long, we use page->ptl as a pointer to a
+   dynamically allocated spinlock_t.  This allows using the split lock with
+   DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC enabled, but costs one more cache
+   line for the indirect access.
+
+The spinlock_t is allocated in pgtable_page_ctor() for PTE tables and in
+pgtable_pmd_page_ctor() for PMD tables.
+
+Please never access page->ptl directly -- use the appropriate helper.
index f3ef1d1..583af4b 100644 (file)
@@ -4871,7 +4871,8 @@ KERNEL VIRTUAL MACHINE (KVM)
 M:     Gleb Natapov <gleb@redhat.com>
 M:     Paolo Bonzini <pbonzini@redhat.com>
 L:     kvm@vger.kernel.org
-W:     http://linux-kvm.org
+W:     http://www.linux-kvm.org
+T:     git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
 S:     Supported
 F:     Documentation/*/kvm*.txt
 F:     Documentation/virtual/kvm/
index ded747c..f1cf895 100644 (file)
@@ -207,9 +207,6 @@ config HAVE_DMA_ATTRS
 config HAVE_DMA_CONTIGUOUS
        bool
 
-config USE_GENERIC_SMP_HELPERS
-       bool
-
 config GENERIC_SMP_IDLE_THREAD
        bool
 
index 35a300d..8d2a483 100644 (file)
@@ -522,7 +522,6 @@ config ARCH_MAY_HAVE_PC_FDC
 config SMP
        bool "Symmetric multi-processing support"
        depends on ALPHA_SABLE || ALPHA_LYNX || ALPHA_RAWHIDE || ALPHA_DP264 || ALPHA_WILDFIRE || ALPHA_TITAN || ALPHA_GENERIC || ALPHA_SHARK || ALPHA_MARVEL
-       select USE_GENERIC_SMP_HELPERS
        ---help---
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, like most personal computers, say N. If
index bc2a0da..aab14a0 100644 (file)
@@ -72,7 +72,10 @@ pte_alloc_one(struct mm_struct *mm, unsigned long address)
        if (!pte)
                return NULL;
        page = virt_to_page(pte);
-       pgtable_page_ctor(page);
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
        return page;
 }
 
index 5ede546..2ee0c9b 100644 (file)
@@ -125,7 +125,6 @@ config ARC_PLAT_NEEDS_CPU_TO_DMA
 config SMP
        bool "Symmetric Multi-Processing (Incomplete)"
        default n
-       select USE_GENERIC_SMP_HELPERS
        help
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, like most personal computers, say N. If
index 36a9f20..81208bf 100644 (file)
@@ -105,11 +105,16 @@ static inline pgtable_t
 pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
        pgtable_t pte_pg;
+       struct page *page;
 
        pte_pg = __get_free_pages(GFP_KERNEL | __GFP_REPEAT, __get_order_pte());
-       if (pte_pg) {
-               memzero((void *)pte_pg, PTRS_PER_PTE * 4);
-               pgtable_page_ctor(virt_to_page(pte_pg));
+       if (!pte_pg)
+               return 0;
+       memzero((void *)pte_pg, PTRS_PER_PTE * 4);
+       page = virt_to_page(pte_pg);
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return 0;
        }
 
        return pte_pg;
index 603d661..e089e62 100644 (file)
@@ -1432,7 +1432,6 @@ config SMP
        depends on GENERIC_CLOCKEVENTS
        depends on HAVE_SMP
        depends on MMU || ARM_MPU
-       select USE_GENERIC_SMP_HELPERS
        help
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, like most personal computers, say N. If
@@ -1863,6 +1862,12 @@ config CC_STACKPROTECTOR
          neutralized via a kernel panic.
          This feature requires gcc version 4.2 or above.
 
+config SWIOTLB
+       def_bool y
+
+config IOMMU_HELPER
+       def_bool SWIOTLB
+
 config XEN_DOM0
        def_bool y
        depends on XEN
@@ -1873,6 +1878,7 @@ config XEN
        depends on CPU_V7 && !CPU_V6
        depends on !GENERIC_ATOMIC64
        select ARM_PSCI
+       select SWIOTLB_XEN
        help
          Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
 
index 863cd84..e701a4d 100644 (file)
 #include <asm-generic/dma-coherent.h>
 #include <asm/memory.h>
 
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+
 #define DMA_ERROR_CODE (~0)
 extern struct dma_map_ops arm_dma_ops;
 extern struct dma_map_ops arm_coherent_dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
        if (dev && dev->archdata.dma_ops)
                return dev->archdata.dma_ops;
        return &arm_dma_ops;
 }
 
+static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+{
+       if (xen_initial_domain())
+               return xen_dma_ops;
+       else
+               return __generic_dma_ops(dev);
+}
+
 static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
 {
        BUG_ON(!dev);
@@ -94,6 +105,39 @@ static inline unsigned long dma_max_pfn(struct device *dev)
 }
 #define dma_max_pfn(dev) dma_max_pfn(dev)
 
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+       unsigned int offset = paddr & ~PAGE_MASK;
+       return pfn_to_dma(dev, __phys_to_pfn(paddr)) + offset;
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr)
+{
+       unsigned int offset = dev_addr & ~PAGE_MASK;
+       return __pfn_to_phys(dma_to_pfn(dev, dev_addr)) + offset;
+}
+
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+       u64 limit, mask;
+
+       if (!dev->dma_mask)
+               return 0;
+
+       mask = *dev->dma_mask;
+
+       limit = (mask + 1) & ~mask;
+       if (limit && size > limit)
+               return 0;
+
+       if ((addr | (addr + size - 1)) & ~mask)
+               return 0;
+
+       return 1;
+}
+
+static inline void dma_mark_clean(void *addr, size_t size) { }
+
 /*
  * DMA errors are defined by all-bits-set in the DMA address.
  */
index d070741..3c597c2 100644 (file)
 #ifdef __KERNEL__
 
 #include <linux/types.h>
+#include <linux/blk_types.h>
 #include <asm/byteorder.h>
 #include <asm/memory.h>
 #include <asm-generic/pci_iomap.h>
+#include <xen/xen.h>
 
 /*
  * ISA I/O bus memory addresses are 1:1 with the physical address.
@@ -372,6 +374,13 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr);
 #define BIOVEC_MERGEABLE(vec1, vec2)   \
        ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
 
+struct bio_vec;
+extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+                                     const struct bio_vec *vec2);
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                              \
+       (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&                         \
+        (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+
 #ifdef CONFIG_MMU
 #define ARCH_HAS_VALID_PHYS_ADDR_RANGE
 extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
index 64e9696..1d3153c 100644 (file)
@@ -57,6 +57,7 @@
  * TSC:                Trap SMC
  * TSW:                Trap cache operations by set/way
  * TWI:                Trap WFI
+ * TWE:                Trap WFE
  * TIDCP:      Trap L2CTLR/L2ECTLR
  * BSU_IS:     Upgrade barriers to the inner shareable domain
  * FB:         Force broadcast of all maintenance operations
@@ -67,7 +68,7 @@
  */
 #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
                        HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
-                       HCR_SWIO | HCR_TIDCP)
+                       HCR_TWE | HCR_SWIO | HCR_TIDCP)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
 
 /* System Control Register (SCTLR) bits */
 #define TTBCR_IRGN1    (3 << 24)
 #define TTBCR_EPD1     (1 << 23)
 #define TTBCR_A1       (1 << 22)
-#define TTBCR_T1SZ     (3 << 16)
+#define TTBCR_T1SZ     (7 << 16)
 #define TTBCR_SH0      (3 << 12)
 #define TTBCR_ORGN0    (3 << 10)
 #define TTBCR_IRGN0    (3 << 8)
 #define TTBCR_EPD0     (1 << 7)
-#define TTBCR_T0SZ     3
+#define TTBCR_T0SZ     (7 << 0)
 #define HTCR_MASK      (TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)
 
 /* Hyp System Trap Register */
 #define HSR_EC_DABT    (0x24)
 #define HSR_EC_DABT_HYP        (0x25)
 
+#define HSR_WFI_IS_WFE         (1U << 0)
+
 #define HSR_HVC_IMM_MASK       ((1UL << 16) - 1)
 
 #define HSR_DABT_S1PTW         (1U << 7)
index a2f43dd..661da11 100644 (file)
@@ -39,7 +39,7 @@
 #define c6_IFAR                17      /* Instruction Fault Address Register */
 #define c7_PAR         18      /* Physical Address Register */
 #define c7_PAR_high    19      /* PAR top 32 bits */
-#define c9_L2CTLR      20      /* Cortex A15 L2 Control Register */
+#define c9_L2CTLR      20      /* Cortex A15/A7 L2 Control Register */
 #define c10_PRRR       21      /* Primary Region Remap Register */
 #define c10_NMRR       22      /* Normal Memory Remap Register */
 #define c12_VBAR       23      /* Vector Base Address Register */
index a464e8d..0fa90c9 100644 (file)
@@ -157,4 +157,55 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
        return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK;
 }
 
+static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cp15[c0_MPIDR];
+}
+
+static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
+{
+       *vcpu_cpsr(vcpu) |= PSR_E_BIT;
+}
+
+static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu)
+{
+       return !!(*vcpu_cpsr(vcpu) & PSR_E_BIT);
+}
+
+static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu,
+                                                   unsigned long data,
+                                                   unsigned int len)
+{
+       if (kvm_vcpu_is_be(vcpu)) {
+               switch (len) {
+               case 1:
+                       return data & 0xff;
+               case 2:
+                       return be16_to_cpu(data & 0xffff);
+               default:
+                       return be32_to_cpu(data);
+               }
+       }
+
+       return data;            /* Leave LE untouched */
+}
+
+static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
+                                                   unsigned long data,
+                                                   unsigned int len)
+{
+       if (kvm_vcpu_is_be(vcpu)) {
+               switch (len) {
+               case 1:
+                       return data & 0xff;
+               case 2:
+                       return cpu_to_be16(data & 0xffff);
+               default:
+                       return cpu_to_be32(data);
+               }
+       }
+
+       return data;            /* Leave LE untouched */
+}
+
 #endif /* __ARM_KVM_EMULATE_H__ */
index 7d22517..8a6f6db 100644 (file)
 
 #define KVM_VCPU_MAX_FEATURES 1
 
-/* We don't currently support large pages. */
-#define KVM_HPAGE_GFN_SHIFT(x) 0
-#define KVM_NR_PAGE_SIZES      1
-#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
-
 #include <kvm/arm_vgic.h>
 
 struct kvm_vcpu;
@@ -154,6 +149,7 @@ struct kvm_vcpu_stat {
 struct kvm_vcpu_init;
 int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
                        const struct kvm_vcpu_init *init);
+int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
 struct kvm_one_reg;
index 9b28c41..77de4a4 100644 (file)
@@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
+static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
+{
+       *pmd = new_pmd;
+       flush_pmd_entry(pmd);
+}
+
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
        *pte = new_pte;
@@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
        pte_val(*pte) |= L_PTE_S2_RDWR;
 }
 
+static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+{
+       pmd_val(*pmd) |= L_PMD_S2_RDWR;
+}
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+                                             unsigned long size)
 {
        /*
         * If we are going to insert an instruction page and the icache is
@@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
         * need any kind of flushing (DDI 0406C.b - Page B3-1392).
         */
        if (icache_is_pipt()) {
-               unsigned long hva = gfn_to_hva(kvm, gfn);
-               __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
+               __cpuc_coherent_user_range(hva, hva + size);
        } else if (!icache_is_vivt_asid_tagged()) {
                /* any kind of VIPT cache */
                __flush_icache_all();
index 943504f..78a7793 100644 (file)
@@ -102,12 +102,14 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr)
 #else
        pte = alloc_pages(PGALLOC_GFP, 0);
 #endif
-       if (pte) {
-               if (!PageHighMem(pte))
-                       clean_pte_table(page_address(pte));
-               pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       if (!PageHighMem(pte))
+               clean_pte_table(page_address(pte));
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
        }
-
        return pte;
 }
 
index 39c54cf..4f95039 100644 (file)
 #define L_PTE_S2_RDONLY                 (_AT(pteval_t, 1) << 6)   /* HAP[1]   */
 #define L_PTE_S2_RDWR           (_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define L_PMD_S2_RDWR           (_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+
 /*
  * Hyp-mode PL2 PTE definitions for LPAE.
  */
index d7ab99a..1317ee4 100644 (file)
@@ -16,4 +16,6 @@ static inline enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
        return PARAVIRT_LAZY_NONE;
 }
 
+extern struct dma_map_ops *xen_dma_ops;
+
 #endif /* _ASM_ARM_XEN_HYPERVISOR_H */
diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h
new file mode 100644 (file)
index 0000000..1109017
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef _ASM_ARM_XEN_PAGE_COHERENT_H
+#define _ASM_ARM_XEN_PAGE_COHERENT_H
+
+#include <asm/page.h>
+#include <linux/dma-attrs.h>
+#include <linux/dma-mapping.h>
+
+static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size,
+               dma_addr_t *dma_handle, gfp_t flags,
+               struct dma_attrs *attrs)
+{
+       return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs);
+}
+
+static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle,
+               struct dma_attrs *attrs)
+{
+       __generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs);
+}
+
+static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
+            unsigned long offset, size_t size, enum dma_data_direction dir,
+            struct dma_attrs *attrs)
+{
+       __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs);
+}
+
+static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
+               size_t size, enum dma_data_direction dir,
+               struct dma_attrs *attrs)
+{
+       if (__generic_dma_ops(hwdev)->unmap_page)
+               __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs);
+}
+
+static inline void xen_dma_sync_single_for_cpu(struct device *hwdev,
+               dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+       if (__generic_dma_ops(hwdev)->sync_single_for_cpu)
+               __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir);
+}
+
+static inline void xen_dma_sync_single_for_device(struct device *hwdev,
+               dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+       if (__generic_dma_ops(hwdev)->sync_single_for_device)
+               __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir);
+}
+#endif /* _ASM_ARM_XEN_PAGE_COHERENT_H */
index 359a7b5..75579a9 100644 (file)
@@ -6,12 +6,12 @@
 
 #include <linux/pfn.h>
 #include <linux/types.h>
+#include <linux/dma-mapping.h>
 
+#include <xen/xen.h>
 #include <xen/interface/grant_table.h>
 
-#define pfn_to_mfn(pfn)                        (pfn)
 #define phys_to_machine_mapping_valid(pfn) (1)
-#define mfn_to_pfn(mfn)                        (mfn)
 #define mfn_to_virt(m)                 (__va(mfn_to_pfn(m) << PAGE_SHIFT))
 
 #define pte_mfn            pte_pfn
@@ -32,6 +32,38 @@ typedef struct xpaddr {
 
 #define INVALID_P2M_ENTRY      (~0UL)
 
+unsigned long __pfn_to_mfn(unsigned long pfn);
+unsigned long __mfn_to_pfn(unsigned long mfn);
+extern struct rb_root phys_to_mach;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+       unsigned long mfn;
+
+       if (phys_to_mach.rb_node != NULL) {
+               mfn = __pfn_to_mfn(pfn);
+               if (mfn != INVALID_P2M_ENTRY)
+                       return mfn;
+       }
+
+       return pfn;
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+       unsigned long pfn;
+
+       if (phys_to_mach.rb_node != NULL) {
+               pfn = __mfn_to_pfn(mfn);
+               if (pfn != INVALID_P2M_ENTRY)
+                       return pfn;
+       }
+
+       return mfn;
+}
+
+#define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn)
+
 static inline xmaddr_t phys_to_machine(xpaddr_t phys)
 {
        unsigned offset = phys.paddr & ~PAGE_MASK;
@@ -76,11 +108,9 @@ static inline int m2p_remove_override(struct page *page, bool clear_pte)
        return 0;
 }
 
-static inline bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
-       BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-       return true;
-}
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn,
+               unsigned long nr_pages);
 
 static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
index c1ee007..c498b60 100644 (file)
@@ -63,7 +63,8 @@ struct kvm_regs {
 
 /* Supported Processor Types */
 #define KVM_ARM_TARGET_CORTEX_A15      0
-#define KVM_ARM_NUM_TARGETS            1
+#define KVM_ARM_TARGET_CORTEX_A7       1
+#define KVM_ARM_NUM_TARGETS            2
 
 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
 #define KVM_ARM_DEVICE_TYPE_SHIFT      0
index ebf5015..466bd29 100644 (file)
@@ -20,6 +20,7 @@ config KVM
        bool "Kernel-based Virtual Machine (KVM) support"
        select PREEMPT_NOTIFIERS
        select ANON_INODES
+       select HAVE_KVM_CPU_RELAX_INTERCEPT
        select KVM_MMIO
        select KVM_ARM_HOST
        depends on ARM_VIRT_EXT && ARM_LPAE
index d99bee4..789bca9 100644 (file)
@@ -19,6 +19,6 @@ kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
 
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
-obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o
+obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
 obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
 obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
index aea7ccb..2a700e0 100644 (file)
@@ -152,12 +152,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+                           unsigned long npages)
 {
        return 0;
 }
@@ -797,6 +798,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        return -EFAULT;
                return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
        }
+       case KVM_ARM_PREFERRED_TARGET: {
+               int err;
+               struct kvm_vcpu_init init;
+
+               err = kvm_vcpu_preferred_target(&init);
+               if (err)
+                       return err;
+
+               if (copy_to_user(argp, &init, sizeof(init)))
+                       return -EFAULT;
+
+               return 0;
+       }
        default:
                return -EINVAL;
        }
index db9cf69..78c0885 100644 (file)
@@ -71,6 +71,98 @@ int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run)
        return 1;
 }
 
+static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
+{
+       /*
+        * Compute guest MPIDR. We build a virtual cluster out of the
+        * vcpu_id, but we read the 'U' bit from the underlying
+        * hardware directly.
+        */
+       vcpu->arch.cp15[c0_MPIDR] = ((read_cpuid_mpidr() & MPIDR_SMP_BITMASK) |
+                                    ((vcpu->vcpu_id >> 2) << MPIDR_LEVEL_BITS) |
+                                    (vcpu->vcpu_id & 3));
+}
+
+/* TRM entries A7:4.3.31 A15:4.3.28 - RO WI */
+static bool access_actlr(struct kvm_vcpu *vcpu,
+                        const struct coproc_params *p,
+                        const struct coproc_reg *r)
+{
+       if (p->is_write)
+               return ignore_write(vcpu, p);
+
+       *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c1_ACTLR];
+       return true;
+}
+
+/* TRM entries A7:4.3.56, A15:4.3.60 - R/O. */
+static bool access_cbar(struct kvm_vcpu *vcpu,
+                       const struct coproc_params *p,
+                       const struct coproc_reg *r)
+{
+       if (p->is_write)
+               return write_to_read_only(vcpu, p);
+       return read_zero(vcpu, p);
+}
+
+/* TRM entries A7:4.3.49, A15:4.3.48 - R/O WI */
+static bool access_l2ctlr(struct kvm_vcpu *vcpu,
+                         const struct coproc_params *p,
+                         const struct coproc_reg *r)
+{
+       if (p->is_write)
+               return ignore_write(vcpu, p);
+
+       *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c9_L2CTLR];
+       return true;
+}
+
+static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
+{
+       u32 l2ctlr, ncores;
+
+       asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr));
+       l2ctlr &= ~(3 << 24);
+       ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1;
+       /* How many cores in the current cluster and the next ones */
+       ncores -= (vcpu->vcpu_id & ~3);
+       /* Cap it to the maximum number of cores in a single cluster */
+       ncores = min(ncores, 3U);
+       l2ctlr |= (ncores & 3) << 24;
+
+       vcpu->arch.cp15[c9_L2CTLR] = l2ctlr;
+}
+
+static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
+{
+       u32 actlr;
+
+       /* ACTLR contains SMP bit: make sure you create all cpus first! */
+       asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr));
+       /* Make the SMP bit consistent with the guest configuration */
+       if (atomic_read(&vcpu->kvm->online_vcpus) > 1)
+               actlr |= 1U << 6;
+       else
+               actlr &= ~(1U << 6);
+
+       vcpu->arch.cp15[c1_ACTLR] = actlr;
+}
+
+/*
+ * TRM entries: A7:4.3.50, A15:4.3.49
+ * R/O WI (even if NSACR.NS_L2ERR, a write of 1 is ignored).
+ */
+static bool access_l2ectlr(struct kvm_vcpu *vcpu,
+                          const struct coproc_params *p,
+                          const struct coproc_reg *r)
+{
+       if (p->is_write)
+               return ignore_write(vcpu, p);
+
+       *vcpu_reg(vcpu, p->Rt1) = 0;
+       return true;
+}
+
 /* See note at ARM ARM B1.14.4 */
 static bool access_dcsw(struct kvm_vcpu *vcpu,
                        const struct coproc_params *p,
@@ -153,10 +245,22 @@ static bool pm_fake(struct kvm_vcpu *vcpu,
  *            registers preceding 32-bit ones.
  */
 static const struct coproc_reg cp15_regs[] = {
+       /* MPIDR: we use VMPIDR for guest access. */
+       { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32,
+                       NULL, reset_mpidr, c0_MPIDR },
+
        /* CSSELR: swapped by interrupt.S. */
        { CRn( 0), CRm( 0), Op1( 2), Op2( 0), is32,
                        NULL, reset_unknown, c0_CSSELR },
 
+       /* ACTLR: trapped by HCR.TAC bit. */
+       { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32,
+                       access_actlr, reset_actlr, c1_ACTLR },
+
+       /* CPACR: swapped by interrupt.S. */
+       { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32,
+                       NULL, reset_val, c1_CPACR, 0x00000000 },
+
        /* TTBR0/TTBR1: swapped by interrupt.S. */
        { CRm64( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 },
        { CRm64( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 },
@@ -194,6 +298,13 @@ static const struct coproc_reg cp15_regs[] = {
        { CRn( 7), CRm( 6), Op1( 0), Op2( 2), is32, access_dcsw},
        { CRn( 7), CRm(10), Op1( 0), Op2( 2), is32, access_dcsw},
        { CRn( 7), CRm(14), Op1( 0), Op2( 2), is32, access_dcsw},
+       /*
+        * L2CTLR access (guest wants to know #CPUs).
+        */
+       { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32,
+                       access_l2ctlr, reset_l2ctlr, c9_L2CTLR },
+       { CRn( 9), CRm( 0), Op1( 1), Op2( 3), is32, access_l2ectlr},
+
        /*
         * Dummy performance monitor implementation.
         */
@@ -234,6 +345,9 @@ static const struct coproc_reg cp15_regs[] = {
        /* CNTKCTL: swapped by interrupt.S. */
        { CRn(14), CRm( 1), Op1( 0), Op2( 0), is32,
                        NULL, reset_val, c14_CNTKCTL, 0x00000000 },
+
+       /* The Configuration Base Address Register. */
+       { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar},
 };
 
 /* Target specific emulation tables */
@@ -241,6 +355,12 @@ static struct kvm_coproc_target_table *target_tables[KVM_ARM_NUM_TARGETS];
 
 void kvm_register_target_coproc_table(struct kvm_coproc_target_table *table)
 {
+       unsigned int i;
+
+       for (i = 1; i < table->num; i++)
+               BUG_ON(cmp_reg(&table->table[i-1],
+                              &table->table[i]) >= 0);
+
        target_tables[table->target] = table;
 }
 
index cf93472..bb0cac1 100644 (file)
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
 #include <linux/kvm_host.h>
-#include <asm/cputype.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_host.h>
-#include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
+#include <asm/kvm_emulate.h>
 #include <linux/init.h>
 
-static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
-{
-       /*
-        * Compute guest MPIDR:
-        * (Even if we present only one VCPU to the guest on an SMP
-        * host we don't set the U bit in the MPIDR, or vice versa, as
-        * revealing the underlying hardware properties is likely to
-        * be the best choice).
-        */
-       vcpu->arch.cp15[c0_MPIDR] = (read_cpuid_mpidr() & ~MPIDR_LEVEL_MASK)
-               | (vcpu->vcpu_id & MPIDR_LEVEL_MASK);
-}
-
 #include "coproc.h"
 
-/* A15 TRM 4.3.28: RO WI */
-static bool access_actlr(struct kvm_vcpu *vcpu,
-                        const struct coproc_params *p,
-                        const struct coproc_reg *r)
-{
-       if (p->is_write)
-               return ignore_write(vcpu, p);
-
-       *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c1_ACTLR];
-       return true;
-}
-
-/* A15 TRM 4.3.60: R/O. */
-static bool access_cbar(struct kvm_vcpu *vcpu,
-                       const struct coproc_params *p,
-                       const struct coproc_reg *r)
-{
-       if (p->is_write)
-               return write_to_read_only(vcpu, p);
-       return read_zero(vcpu, p);
-}
-
-/* A15 TRM 4.3.48: R/O WI. */
-static bool access_l2ctlr(struct kvm_vcpu *vcpu,
-                         const struct coproc_params *p,
-                         const struct coproc_reg *r)
-{
-       if (p->is_write)
-               return ignore_write(vcpu, p);
-
-       *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c9_L2CTLR];
-       return true;
-}
-
-static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
-{
-       u32 l2ctlr, ncores;
-
-       asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr));
-       l2ctlr &= ~(3 << 24);
-       ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1;
-       l2ctlr |= (ncores & 3) << 24;
-
-       vcpu->arch.cp15[c9_L2CTLR] = l2ctlr;
-}
-
-static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
-{
-       u32 actlr;
-
-       /* ACTLR contains SMP bit: make sure you create all cpus first! */
-       asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr));
-       /* Make the SMP bit consistent with the guest configuration */
-       if (atomic_read(&vcpu->kvm->online_vcpus) > 1)
-               actlr |= 1U << 6;
-       else
-               actlr &= ~(1U << 6);
-
-       vcpu->arch.cp15[c1_ACTLR] = actlr;
-}
-
-/* A15 TRM 4.3.49: R/O WI (even if NSACR.NS_L2ERR, a write of 1 is ignored). */
-static bool access_l2ectlr(struct kvm_vcpu *vcpu,
-                          const struct coproc_params *p,
-                          const struct coproc_reg *r)
-{
-       if (p->is_write)
-               return ignore_write(vcpu, p);
-
-       *vcpu_reg(vcpu, p->Rt1) = 0;
-       return true;
-}
-
 /*
  * A15-specific CP15 registers.
  * CRn denotes the primary register number, but is copied to the CRm in the
@@ -121,29 +32,9 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu,
  *            registers preceding 32-bit ones.
  */
 static const struct coproc_reg a15_regs[] = {
-       /* MPIDR: we use VMPIDR for guest access. */
-       { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32,
-                       NULL, reset_mpidr, c0_MPIDR },
-
        /* SCTLR: swapped by interrupt.S. */
        { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
                        NULL, reset_val, c1_SCTLR, 0x00C50078 },
-       /* ACTLR: trapped by HCR.TAC bit. */
-       { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32,
-                       access_actlr, reset_actlr, c1_ACTLR },
-       /* CPACR: swapped by interrupt.S. */
-       { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32,
-                       NULL, reset_val, c1_CPACR, 0x00000000 },
-
-       /*
-        * L2CTLR access (guest wants to know #CPUs).
-        */
-       { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32,
-                       access_l2ctlr, reset_l2ctlr, c9_L2CTLR },
-       { CRn( 9), CRm( 0), Op1( 1), Op2( 3), is32, access_l2ectlr},
-
-       /* The Configuration Base Address Register. */
-       { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar},
 };
 
 static struct kvm_coproc_target_table a15_target_table = {
@@ -154,12 +45,6 @@ static struct kvm_coproc_target_table a15_target_table = {
 
 static int __init coproc_a15_init(void)
 {
-       unsigned int i;
-
-       for (i = 1; i < ARRAY_SIZE(a15_regs); i++)
-               BUG_ON(cmp_reg(&a15_regs[i-1],
-                              &a15_regs[i]) >= 0);
-
        kvm_register_target_coproc_table(&a15_target_table);
        return 0;
 }
diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
new file mode 100644 (file)
index 0000000..1df7673
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Copyright (C) 2013 - ARM Ltd
+ *
+ * Authors: Rusty Russell <rusty@rustcorp.com.au>
+ *          Christoffer Dall <c.dall@virtualopensystems.com>
+ *          Jonathan Austin <jonathan.austin@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include <linux/kvm_host.h>
+#include <asm/kvm_coproc.h>
+#include <asm/kvm_emulate.h>
+#include <linux/init.h>
+
+#include "coproc.h"
+
+/*
+ * Cortex-A7 specific CP15 registers.
+ * CRn denotes the primary register number, but is copied to the CRm in the
+ * user space API for 64-bit register access in line with the terminology used
+ * in the ARM ARM.
+ * Important: Must be sorted ascending by CRn, CRm, Op1, Op2 and with 64-bit
+ *            registers preceding 32-bit ones.
+ */
+static const struct coproc_reg a7_regs[] = {
+       /* SCTLR: swapped by interrupt.S. */
+       { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
+                       NULL, reset_val, c1_SCTLR, 0x00C50878 },
+};
+
+static struct kvm_coproc_target_table a7_target_table = {
+       .target = KVM_ARM_TARGET_CORTEX_A7,
+       .table = a7_regs,
+       .num = ARRAY_SIZE(a7_regs),
+};
+
+static int __init coproc_a7_init(void)
+{
+       kvm_register_target_coproc_table(&a7_target_table);
+       return 0;
+}
+late_initcall(coproc_a7_init);
index bdede9e..d6c0052 100644 (file)
@@ -354,7 +354,7 @@ static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr)
        *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset;
 
        if (is_pabt) {
-               /* Set DFAR and DFSR */
+               /* Set IFAR and IFSR */
                vcpu->arch.cp15[c6_IFAR] = addr;
                is_lpae = (vcpu->arch.cp15[c2_TTBCR] >> 31);
                /* Always give debug fault for now - should give guest a clue */
index 152d036..20f8d97 100644 (file)
@@ -190,6 +190,8 @@ int __attribute_const__ kvm_target_cpu(void)
                return -EINVAL;
 
        switch (part_number) {
+       case ARM_CPU_PART_CORTEX_A7:
+               return KVM_ARM_TARGET_CORTEX_A7;
        case ARM_CPU_PART_CORTEX_A15:
                return KVM_ARM_TARGET_CORTEX_A15;
        default:
@@ -202,7 +204,7 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
 {
        unsigned int i;
 
-       /* We can only do a cortex A15 for now. */
+       /* We can only cope with guest==host and only on A15/A7 (for now). */
        if (init->target != kvm_target_cpu())
                return -EINVAL;
 
@@ -222,6 +224,26 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
        return kvm_reset_vcpu(vcpu);
 }
 
+int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
+{
+       int target = kvm_target_cpu();
+
+       if (target < 0)
+               return -ENODEV;
+
+       memset(init, 0, sizeof(*init));
+
+       /*
+        * For now, we don't return any features.
+        * In future, we might use features to return target
+        * specific features available for the preferred
+        * target type.
+        */
+       init->target = (__u32)target;
+
+       return 0;
+}
+
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        return -EINVAL;
index df4c82d..a920790 100644 (file)
@@ -73,23 +73,29 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
 }
 
 /**
- * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest
+ * kvm_handle_wfx - handle a WFI or WFE instruction trapped in guests
  * @vcpu:      the vcpu pointer
  * @run:       the kvm_run structure pointer
  *
- * Simply sets the wait_for_interrupts flag on the vcpu structure, which will
- * halt execution of world-switches and schedule other host processes until
- * there is an incoming IRQ or FIQ to the VM.
+ * WFE: Yield the CPU and come back to this vcpu when the scheduler
+ * decides to.
+ * WFI: Simply call kvm_vcpu_block(), which will halt execution of
+ * world-switches and schedule other host processes until there is an
+ * incoming IRQ or FIQ to the VM.
  */
-static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run)
+static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        trace_kvm_wfi(*vcpu_pc(vcpu));
-       kvm_vcpu_block(vcpu);
+       if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE)
+               kvm_vcpu_on_spin(vcpu);
+       else
+               kvm_vcpu_block(vcpu);
+
        return 1;
 }
 
 static exit_handle_fn arm_exit_handlers[] = {
-       [HSR_EC_WFI]            = kvm_handle_wfi,
+       [HSR_EC_WFI]            = kvm_handle_wfx,
        [HSR_EC_CP15_32]        = kvm_handle_cp15_32,
        [HSR_EC_CP15_64]        = kvm_handle_cp15_64,
        [HSR_EC_CP14_MR]        = kvm_handle_cp14_access,
index 0c25d94..4cb5a93 100644 (file)
 
 #include "trace.h"
 
+static void mmio_write_buf(char *buf, unsigned int len, unsigned long data)
+{
+       void *datap = NULL;
+       union {
+               u8      byte;
+               u16     hword;
+               u32     word;
+               u64     dword;
+       } tmp;
+
+       switch (len) {
+       case 1:
+               tmp.byte        = data;
+               datap           = &tmp.byte;
+               break;
+       case 2:
+               tmp.hword       = data;
+               datap           = &tmp.hword;
+               break;
+       case 4:
+               tmp.word        = data;
+               datap           = &tmp.word;
+               break;
+       case 8:
+               tmp.dword       = data;
+               datap           = &tmp.dword;
+               break;
+       }
+
+       memcpy(buf, datap, len);
+}
+
+static unsigned long mmio_read_buf(char *buf, unsigned int len)
+{
+       unsigned long data = 0;
+       union {
+               u16     hword;
+               u32     word;
+               u64     dword;
+       } tmp;
+
+       switch (len) {
+       case 1:
+               data = buf[0];
+               break;
+       case 2:
+               memcpy(&tmp.hword, buf, len);
+               data = tmp.hword;
+               break;
+       case 4:
+               memcpy(&tmp.word, buf, len);
+               data = tmp.word;
+               break;
+       case 8:
+               memcpy(&tmp.dword, buf, len);
+               data = tmp.dword;
+               break;
+       }
+
+       return data;
+}
+
 /**
  * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
  * @vcpu: The VCPU pointer
  */
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-       unsigned long *dest;
+       unsigned long data;
        unsigned int len;
        int mask;
 
        if (!run->mmio.is_write) {
-               dest = vcpu_reg(vcpu, vcpu->arch.mmio_decode.rt);
-               *dest = 0;
-
                len = run->mmio.len;
                if (len > sizeof(unsigned long))
                        return -EINVAL;
 
-               memcpy(dest, run->mmio.data, len);
-
-               trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
-                               *((u64 *)run->mmio.data));
+               data = mmio_read_buf(run->mmio.data, len);
 
                if (vcpu->arch.mmio_decode.sign_extend &&
                    len < sizeof(unsigned long)) {
                        mask = 1U << ((len * 8) - 1);
-                       *dest = (*dest ^ mask) - mask;
+                       data = (data ^ mask) - mask;
                }
+
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
+                              data);
+               data = vcpu_data_host_to_guest(vcpu, data, len);
+               *vcpu_reg(vcpu, vcpu->arch.mmio_decode.rt) = data;
        }
 
        return 0;
@@ -105,6 +166,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 phys_addr_t fault_ipa)
 {
        struct kvm_exit_mmio mmio;
+       unsigned long data;
        unsigned long rt;
        int ret;
 
@@ -125,13 +187,15 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
        }
 
        rt = vcpu->arch.mmio_decode.rt;
+       data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), mmio.len);
+
        trace_kvm_mmio((mmio.is_write) ? KVM_TRACE_MMIO_WRITE :
                                         KVM_TRACE_MMIO_READ_UNSATISFIED,
                        mmio.len, fault_ipa,
-                       (mmio.is_write) ? *vcpu_reg(vcpu, rt) : 0);
+                       (mmio.is_write) ? data : 0);
 
        if (mmio.is_write)
-               memcpy(mmio.data, vcpu_reg(vcpu, rt), mmio.len);
+               mmio_write_buf(mmio.data, mmio.len, data);
 
        if (vgic_handle_mmio(vcpu, run, &mmio))
                return 1;
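
The read path above sign-extends sub-word loads with the (data ^ mask) - mask idiom before tracing and the endianness fixup. A stand-alone demonstration of why the idiom works, with plain unsigned long arithmetic standing in for the vcpu register:

    #include <assert.h>

    static unsigned long sign_extend(unsigned long data, unsigned int len)
    {
            unsigned long mask = 1UL << (len * 8 - 1);  /* sign bit of the load */

            return (data ^ mask) - mask;    /* propagates the sign bit upward */
    }

    int main(void)
    {
            /* 0xff as a signed byte is -1: all upper bits become set. */
            assert(sign_extend(0xffUL, 1) == (unsigned long)-1);
            /* 0x7f has a clear sign bit and passes through unchanged. */
            assert(sign_extend(0x7fUL, 1) == 0x7fUL);
            return 0;
    }
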
index b0de86b..3719583 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
+#include <linux/hugetlb.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
+#define kvm_pmd_huge(_x)       (pmd_huge(_x) || pmd_trans_huge(_x))
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
        /*
@@ -93,19 +96,29 @@ static bool page_empty(void *ptr)
 
 static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-       pmd_t *pmd_table = pmd_offset(pud, 0);
-       pud_clear(pud);
-       kvm_tlb_flush_vmid_ipa(kvm, addr);
-       pmd_free(NULL, pmd_table);
+       if (pud_huge(*pud)) {
+               pud_clear(pud);
+               kvm_tlb_flush_vmid_ipa(kvm, addr);
+       } else {
+               pmd_t *pmd_table = pmd_offset(pud, 0);
+               pud_clear(pud);
+               kvm_tlb_flush_vmid_ipa(kvm, addr);
+               pmd_free(NULL, pmd_table);
+       }
        put_page(virt_to_page(pud));
 }
 
 static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
-       pte_t *pte_table = pte_offset_kernel(pmd, 0);
-       pmd_clear(pmd);
-       kvm_tlb_flush_vmid_ipa(kvm, addr);
-       pte_free_kernel(NULL, pte_table);
+       if (kvm_pmd_huge(*pmd)) {
+               pmd_clear(pmd);
+               kvm_tlb_flush_vmid_ipa(kvm, addr);
+       } else {
+               pte_t *pte_table = pte_offset_kernel(pmd, 0);
+               pmd_clear(pmd);
+               kvm_tlb_flush_vmid_ipa(kvm, addr);
+               pte_free_kernel(NULL, pte_table);
+       }
        put_page(virt_to_page(pmd));
 }
 
@@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
                        continue;
                }
 
+               if (pud_huge(*pud)) {
+                       /*
+                        * If we are dealing with a huge pud, just clear it and
+                        * move on.
+                        */
+                       clear_pud_entry(kvm, pud, addr);
+                       addr = pud_addr_end(addr, end);
+                       continue;
+               }
+
                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd)) {
                        addr = pmd_addr_end(addr, end);
                        continue;
                }
 
-               pte = pte_offset_kernel(pmd, addr);
-               clear_pte_entry(kvm, pte, addr);
-               next = addr + PAGE_SIZE;
+               if (!kvm_pmd_huge(*pmd)) {
+                       pte = pte_offset_kernel(pmd, addr);
+                       clear_pte_entry(kvm, pte, addr);
+                       next = addr + PAGE_SIZE;
+               }
 
-               /* If we emptied the pte, walk back up the ladder */
-               if (page_empty(pte)) {
+               /*
+                * If the pmd entry is to be cleared, walk back up the ladder
+                */
+               if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
                        clear_pmd_entry(kvm, pmd, addr);
                        next = pmd_addr_end(addr, end);
                        if (page_empty(pmd) && !page_empty(pud)) {
@@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
        kvm->arch.pgd = NULL;
 }
 
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-                         phys_addr_t addr, const pte_t *new_pte, bool iomap)
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                            phys_addr_t addr)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *pte, old_pte;
 
-       /* Create 2nd stage page table mapping - Level 1 */
        pgd = kvm->arch.pgd + pgd_index(addr);
        pud = pud_offset(pgd, addr);
        if (pud_none(*pud)) {
                if (!cache)
-                       return 0; /* ignore calls from kvm_set_spte_hva */
+                       return NULL;
                pmd = mmu_memory_cache_alloc(cache);
                pud_populate(NULL, pud, pmd);
                get_page(virt_to_page(pud));
        }
 
-       pmd = pmd_offset(pud, addr);
+       return pmd_offset(pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+                              *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+       pmd_t *pmd, old_pmd;
+
+       pmd = stage2_get_pmd(kvm, cache, addr);
+       VM_BUG_ON(!pmd);
+
+       /*
+        * Mapping in huge pages should only happen through a fault.  If a
+        * page is merged into a transparent huge page, the individual
+        * subpages of that huge page should be unmapped through MMU
+        * notifiers before we get here.
+        *
+        * Merging of CompoundPages is not supported; they should instead
+        * be split first, unmapped, merged, and mapped back in on demand.
+        */
+       VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
+
+       old_pmd = *pmd;
+       kvm_set_pmd(pmd, *new_pmd);
+       if (pmd_present(old_pmd))
+               kvm_tlb_flush_vmid_ipa(kvm, addr);
+       else
+               get_page(virt_to_page(pmd));
+       return 0;
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                         phys_addr_t addr, const pte_t *new_pte, bool iomap)
+{
+       pmd_t *pmd;
+       pte_t *pte, old_pte;
 
-       /* Create 2nd stage page table mapping - Level 2 */
+       /* Create stage-2 page table mapping - Level 1 */
+       pmd = stage2_get_pmd(kvm, cache, addr);
+       if (!pmd) {
+               /*
+                * Ignore calls from kvm_set_spte_hva for unallocated
+                * address ranges.
+                */
+               return 0;
+       }
+
+       /* Create stage-2 page mappings - Level 2 */
        if (pmd_none(*pmd)) {
                if (!cache)
                        return 0; /* ignore calls from kvm_set_spte_hva */
@@ -507,16 +576,60 @@ out:
        return ret;
 }
 
+static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
+{
+       pfn_t pfn = *pfnp;
+       gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+       if (PageTransCompound(pfn_to_page(pfn))) {
+               unsigned long mask;
+               /*
+                * The address we faulted on is backed by a transparent huge
+                * page.  However, because we map the compound huge page and
+                * not the individual tail page, we need to transfer the
+                * refcount to the head page.  We have to be careful that the
+                * THP doesn't start to split while we are adjusting the
+                * refcounts.
+                *
+                * We are sure this doesn't happen, because mmu_notifier_retry
+                * was successful and we are holding the mmu_lock, so if this
+                * THP is trying to split, it will be blocked in the mmu
+                * notifier before touching any of the pages, specifically
+                * before being able to call __split_huge_page_refcount().
+                *
+                * We can therefore safely transfer the refcount from PG_tail
+                * to PG_head and switch the pfn from a tail page to the head
+                * page accordingly.
+                */
+               mask = PTRS_PER_PMD - 1;
+               VM_BUG_ON((gfn & mask) != (pfn & mask));
+               if (pfn & mask) {
+                       *ipap &= PMD_MASK;
+                       kvm_release_pfn_clean(pfn);
+                       pfn &= ~mask;
+                       kvm_get_pfn(pfn);
+                       *pfnp = pfn;
+               }
+
+               return true;
+       }
+
+       return false;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-                         gfn_t gfn, struct kvm_memory_slot *memslot,
+                         struct kvm_memory_slot *memslot,
                          unsigned long fault_status)
 {
-       pte_t new_pte;
-       pfn_t pfn;
        int ret;
-       bool write_fault, writable;
+       bool write_fault, writable, hugetlb = false, force_pte = false;
        unsigned long mmu_seq;
+       gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+       unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+       struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+       struct vm_area_struct *vma;
+       pfn_t pfn;
 
        write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
        if (fault_status == FSC_PERM && !write_fault) {
@@ -524,6 +637,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                return -EFAULT;
        }
 
+       /* Let's check if we will get back a huge page backed by hugetlbfs */
+       down_read(&current->mm->mmap_sem);
+       vma = find_vma_intersection(current->mm, hva, hva + 1);
+       if (is_vm_hugetlb_page(vma)) {
+               hugetlb = true;
+               gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+       } else {
+               /*
+                * Pages belonging to VMAs not aligned to the PMD mapping
+                * granularity cannot be mapped using block descriptors even
+                * if the pages belong to a THP for the process, because the
+                * stage-2 block descriptor will cover more than a single THP
+                * and we lose atomicity for unmapping, updates, and splits
+                * of the THP or other pages in the stage-2 block range.
+                */
+               if (vma->vm_start & ~PMD_MASK)
+                       force_pte = true;
+       }
+       up_read(&current->mm->mmap_sem);
+
        /* We need minimum second+third level pages */
        ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
        if (ret)
@@ -541,26 +674,40 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         */
        smp_rmb();
 
-       pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+       pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
        if (is_error_pfn(pfn))
                return -EFAULT;
 
-       new_pte = pfn_pte(pfn, PAGE_S2);
-       coherent_icache_guest_page(vcpu->kvm, gfn);
-
-       spin_lock(&vcpu->kvm->mmu_lock);
-       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+       spin_lock(&kvm->mmu_lock);
+       if (mmu_notifier_retry(kvm, mmu_seq))
                goto out_unlock;
-       if (writable) {
-               kvm_set_s2pte_writable(&new_pte);
-               kvm_set_pfn_dirty(pfn);
+       if (!hugetlb && !force_pte)
+               hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+
+       if (hugetlb) {
+               pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
+               new_pmd = pmd_mkhuge(new_pmd);
+               if (writable) {
+                       kvm_set_s2pmd_writable(&new_pmd);
+                       kvm_set_pfn_dirty(pfn);
+               }
+               coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+               ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+       } else {
+               pte_t new_pte = pfn_pte(pfn, PAGE_S2);
+               if (writable) {
+                       kvm_set_s2pte_writable(&new_pte);
+                       kvm_set_pfn_dirty(pfn);
+               }
+               coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+               ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
        }
-       stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+
 
 out_unlock:
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       spin_unlock(&kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return 0;
+       return ret;
 }
 
 /**
@@ -629,7 +776,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
        memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
-       ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
+       ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
        if (ret == 0)
                ret = 1;
 out_unlock:
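
transparent_hugepage_adjust() above depends on the host pfn and the guest frame number being congruent modulo PTRS_PER_PMD before it rounds both down to the block boundary. A stand-alone model of that arithmetic; the 4K page / 512-entries-per-PMD constants are illustrative, not taken from the kernel headers:

    #include <assert.h>

    #define PTRS_PER_PMD    512UL
    #define PAGE_SHIFT      12
    #define PMD_MASK        (~((PTRS_PER_PMD << PAGE_SHIFT) - 1))

    int main(void)
    {
            unsigned long pfn  = 0x1234;            /* a tail page of a THP */
            unsigned long ipa  = 0x634000;          /* faulting IPA */
            unsigned long gfn  = ipa >> PAGE_SHIFT; /* 0x634 */
            unsigned long mask = PTRS_PER_PMD - 1;

            /* guest frame and host pfn must be misaligned identically */
            assert((gfn & mask) == (pfn & mask));

            ipa &= PMD_MASK;        /* back to the 2M block base */
            pfn &= ~mask;           /* tail pfn -> head pfn */

            assert(ipa == 0x600000 && pfn == 0x1200);
            return 0;
    }
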
index 86a693a..0881bf1 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/kvm_host.h>
 #include <linux/wait.h>
 
+#include <asm/cputype.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_psci.h>
 
@@ -34,22 +35,30 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
 static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 {
        struct kvm *kvm = source_vcpu->kvm;
-       struct kvm_vcpu *vcpu;
+       struct kvm_vcpu *vcpu = NULL, *tmp;
        wait_queue_head_t *wq;
        unsigned long cpu_id;
+       unsigned long mpidr;
        phys_addr_t target_pc;
+       int i;
 
        cpu_id = *vcpu_reg(source_vcpu, 1);
        if (vcpu_mode_is_32bit(source_vcpu))
                cpu_id &= ~((u32) 0);
 
-       if (cpu_id >= atomic_read(&kvm->online_vcpus))
+       kvm_for_each_vcpu(i, tmp, kvm) {
+               mpidr = kvm_vcpu_get_mpidr(tmp);
+               if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) {
+                       vcpu = tmp;
+                       break;
+               }
+       }
+
+       if (!vcpu)
                return KVM_PSCI_RET_INVAL;
 
        target_pc = *vcpu_reg(source_vcpu, 2);
 
-       vcpu = kvm_get_vcpu(kvm, cpu_id);
-
        wq = kvm_arch_vcpu_wq(vcpu);
        if (!waitqueue_active(wq))
                return KVM_PSCI_RET_INVAL;
@@ -62,6 +71,10 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
                vcpu_set_thumb(vcpu);
        }
 
+       /* Propagate caller endianness */
+       if (kvm_vcpu_is_be(source_vcpu))
+               kvm_vcpu_set_be(vcpu);
+
        *vcpu_pc(vcpu) = target_pc;
        vcpu->arch.pause = false;
        smp_mb();               /* Make sure the above is visible */
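
CPU_ON now resolves its target by MPIDR affinity rather than by indexing online_vcpus, so the lookup works even when vcpu_id and MPIDR diverge. A stand-alone sketch of the match; the 0x00ffffff value for MPIDR_HWID_BITMASK (Aff2..Aff0) is an assumption about the 32-bit ARM definition:

    #include <assert.h>

    #define MPIDR_HWID_BITMASK      0x00ffffffUL    /* assumed Aff2..Aff0 */

    static int mpidr_matches(unsigned long vcpu_mpidr, unsigned long cpu_id)
    {
            /* Compare affinity fields only; ignore U/MT and reserved bits. */
            return (vcpu_mpidr & MPIDR_HWID_BITMASK) ==
                   (cpu_id & MPIDR_HWID_BITMASK);
    }

    int main(void)
    {
            /* Same affinity, different non-affinity bits: still a match. */
            assert(mpidr_matches(0x80000001UL, 0x00000001UL));
            assert(!mpidr_matches(0x80000001UL, 0x00000002UL));
            return 0;
    }
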
index c02ba4a..f558c07 100644 (file)
 #include <kvm/arm_arch_timer.h>
 
 /******************************************************************************
- * Cortex-A15 Reset Values
+ * Cortex-A15 and Cortex-A7 Reset Values
  */
 
-static const int a15_max_cpu_idx = 3;
-
-static struct kvm_regs a15_regs_reset = {
+static struct kvm_regs cortexa_regs_reset = {
        .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
 };
 
-static const struct kvm_irq_level a15_vtimer_irq = {
+static const struct kvm_irq_level cortexa_vtimer_irq = {
        { .irq = 27 },
        .level = 1,
 };
@@ -62,12 +60,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        const struct kvm_irq_level *cpu_vtimer_irq;
 
        switch (vcpu->arch.target) {
+       case KVM_ARM_TARGET_CORTEX_A7:
        case KVM_ARM_TARGET_CORTEX_A15:
-               if (vcpu->vcpu_id > a15_max_cpu_idx)
-                       return -EINVAL;
-               reset_regs = &a15_regs_reset;
+               reset_regs = &cortexa_regs_reset;
                vcpu->arch.midr = read_cpuid_id();
-               cpu_vtimer_irq = &a15_vtimer_irq;
+               cpu_vtimer_irq = &cortexa_vtimer_irq;
                break;
        default:
                return -ENODEV;
index d7aa52e..bc47197 100644 (file)
@@ -114,7 +114,7 @@ static int do_dma_transfer(unsigned long apb_add,
        dma_desc->callback = apb_dma_complete;
        dma_desc->callback_param = NULL;
 
-       INIT_COMPLETION(tegra_apb_wait);
+       reinit_completion(&tegra_apb_wait);
 
        dmaengine_submit(dma_desc);
        dma_async_issue_pending(tegra_apb_dma_chan);
index 2a5907b..ff379ac 100644 (file)
@@ -65,7 +65,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address,
        return ret;
 }
 
-#if USE_SPLIT_PTLOCKS
+#if USE_SPLIT_PTE_PTLOCKS
 /*
  * If we are using split PTE locks, then we need to take the page
  * lock here.  Otherwise we are using shared mm->page_table_lock
@@ -84,10 +84,10 @@ static inline void do_pte_unlock(spinlock_t *ptl)
 {
        spin_unlock(ptl);
 }
-#else /* !USE_SPLIT_PTLOCKS */
+#else /* !USE_SPLIT_PTE_PTLOCKS */
 static inline void do_pte_lock(spinlock_t *ptl) {}
 static inline void do_pte_unlock(spinlock_t *ptl) {}
-#endif /* USE_SPLIT_PTLOCKS */
+#endif /* USE_SPLIT_PTE_PTLOCKS */
 
 static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
        unsigned long pfn)
index 4384103..1296952 100644 (file)
@@ -1 +1 @@
-obj-y          := enlighten.o hypercall.o grant-table.o
+obj-y          := enlighten.o hypercall.o grant-table.o p2m.o mm.o
diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
new file mode 100644 (file)
index 0000000..b0e77de
--- /dev/null
@@ -0,0 +1,65 @@
+#include <linux/bootmem.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+#include <linux/swiotlb.h>
+
+#include <xen/xen.h>
+#include <xen/interface/memory.h>
+#include <xen/swiotlb-xen.h>
+
+#include <asm/cacheflush.h>
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/interface.h>
+
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+                                unsigned int address_bits,
+                                dma_addr_t *dma_handle)
+{
+       if (!xen_initial_domain())
+               return -EINVAL;
+
+       /* we assume that dom0 is mapped 1:1 for now */
+       *dma_handle = pstart;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
+
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
+{
+       return;
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+struct dma_map_ops *xen_dma_ops;
+EXPORT_SYMBOL_GPL(xen_dma_ops);
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+       .mapping_error = xen_swiotlb_dma_mapping_error,
+       .alloc = xen_swiotlb_alloc_coherent,
+       .free = xen_swiotlb_free_coherent,
+       .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
+       .sync_single_for_device = xen_swiotlb_sync_single_for_device,
+       .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
+       .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
+       .map_sg = xen_swiotlb_map_sg_attrs,
+       .unmap_sg = xen_swiotlb_unmap_sg_attrs,
+       .map_page = xen_swiotlb_map_page,
+       .unmap_page = xen_swiotlb_unmap_page,
+       .dma_supported = xen_swiotlb_dma_supported,
+       .set_dma_mask = xen_swiotlb_set_dma_mask,
+};
+
+int __init xen_mm_init(void)
+{
+       if (!xen_initial_domain())
+               return 0;
+       xen_swiotlb_init(1, false);
+       xen_dma_ops = &xen_swiotlb_dma_ops;
+       return 0;
+}
+arch_initcall(xen_mm_init);
diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c
new file mode 100644 (file)
index 0000000..23732cd
--- /dev/null
@@ -0,0 +1,208 @@
+#include <linux/bootmem.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/rwlock.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+#include <linux/swiotlb.h>
+
+#include <xen/xen.h>
+#include <xen/interface/memory.h>
+#include <xen/swiotlb-xen.h>
+
+#include <asm/cacheflush.h>
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/interface.h>
+
+struct xen_p2m_entry {
+       unsigned long pfn;
+       unsigned long mfn;
+       unsigned long nr_pages;
+       struct rb_node rbnode_mach;
+       struct rb_node rbnode_phys;
+};
+
+rwlock_t p2m_lock;
+struct rb_root phys_to_mach = RB_ROOT;
+static struct rb_root mach_to_phys = RB_ROOT;
+
+static int xen_add_phys_to_mach_entry(struct xen_p2m_entry *new)
+{
+       struct rb_node **link = &phys_to_mach.rb_node;
+       struct rb_node *parent = NULL;
+       struct xen_p2m_entry *entry;
+       int rc = 0;
+
+       while (*link) {
+               parent = *link;
+               entry = rb_entry(parent, struct xen_p2m_entry, rbnode_phys);
+
+               if (new->mfn == entry->mfn)
+                       goto err_out;
+               if (new->pfn == entry->pfn)
+                       goto err_out;
+
+               if (new->pfn < entry->pfn)
+                       link = &(*link)->rb_left;
+               else
+                       link = &(*link)->rb_right;
+       }
+       rb_link_node(&new->rbnode_phys, parent, link);
+       rb_insert_color(&new->rbnode_phys, &phys_to_mach);
+       goto out;
+
+err_out:
+       rc = -EINVAL;
+       pr_warn("%s: cannot add pfn=%pa -> mfn=%pa: pfn=%pa -> mfn=%pa already exists\n",
+                       __func__, &new->pfn, &new->mfn, &entry->pfn, &entry->mfn);
+out:
+       return rc;
+}
+
+unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+       struct rb_node *n = phys_to_mach.rb_node;
+       struct xen_p2m_entry *entry;
+       unsigned long irqflags;
+
+       read_lock_irqsave(&p2m_lock, irqflags);
+       while (n) {
+               entry = rb_entry(n, struct xen_p2m_entry, rbnode_phys);
+               if (entry->pfn <= pfn &&
+                               entry->pfn + entry->nr_pages > pfn) {
+                       read_unlock_irqrestore(&p2m_lock, irqflags);
+                       return entry->mfn + (pfn - entry->pfn);
+               }
+               if (pfn < entry->pfn)
+                       n = n->rb_left;
+               else
+                       n = n->rb_right;
+       }
+       read_unlock_irqrestore(&p2m_lock, irqflags);
+
+       return INVALID_P2M_ENTRY;
+}
+EXPORT_SYMBOL_GPL(__pfn_to_mfn);
+
+static int xen_add_mach_to_phys_entry(struct xen_p2m_entry *new)
+{
+       struct rb_node **link = &mach_to_phys.rb_node;
+       struct rb_node *parent = NULL;
+       struct xen_p2m_entry *entry;
+       int rc = 0;
+
+       while (*link) {
+               parent = *link;
+               entry = rb_entry(parent, struct xen_p2m_entry, rbnode_mach);
+
+               if (new->mfn == entry->mfn)
+                       goto err_out;
+               if (new->pfn == entry->pfn)
+                       goto err_out;
+
+               if (new->mfn < entry->mfn)
+                       link = &(*link)->rb_left;
+               else
+                       link = &(*link)->rb_right;
+       }
+       rb_link_node(&new->rbnode_mach, parent, link);
+       rb_insert_color(&new->rbnode_mach, &mach_to_phys);
+       goto out;
+
+err_out:
+       rc = -EINVAL;
+       pr_warn("%s: cannot add pfn=%pa -> mfn=%pa: pfn=%pa -> mfn=%pa already exists\n",
+                       __func__, &new->pfn, &new->mfn, &entry->pfn, &entry->mfn);
+out:
+       return rc;
+}
+
+unsigned long __mfn_to_pfn(unsigned long mfn)
+{
+       struct rb_node *n = mach_to_phys.rb_node;
+       struct xen_p2m_entry *entry;
+       unsigned long irqflags;
+
+       read_lock_irqsave(&p2m_lock, irqflags);
+       while (n) {
+               entry = rb_entry(n, struct xen_p2m_entry, rbnode_mach);
+               if (entry->mfn <= mfn &&
+                               entry->mfn + entry->nr_pages > mfn) {
+                       read_unlock_irqrestore(&p2m_lock, irqflags);
+                       return entry->pfn + (mfn - entry->mfn);
+               }
+               if (mfn < entry->mfn)
+                       n = n->rb_left;
+               else
+                       n = n->rb_right;
+       }
+       read_unlock_irqrestore(&p2m_lock, irqflags);
+
+       return INVALID_P2M_ENTRY;
+}
+EXPORT_SYMBOL_GPL(__mfn_to_pfn);
+
+bool __set_phys_to_machine_multi(unsigned long pfn,
+               unsigned long mfn, unsigned long nr_pages)
+{
+       int rc;
+       unsigned long irqflags;
+       struct xen_p2m_entry *p2m_entry;
+       struct rb_node *n = phys_to_mach.rb_node;
+
+       if (mfn == INVALID_P2M_ENTRY) {
+               write_lock_irqsave(&p2m_lock, irqflags);
+               while (n) {
+                       p2m_entry = rb_entry(n, struct xen_p2m_entry, rbnode_phys);
+                       if (p2m_entry->pfn <= pfn &&
+                                       p2m_entry->pfn + p2m_entry->nr_pages > pfn) {
+                               rb_erase(&p2m_entry->rbnode_mach, &mach_to_phys);
+                               rb_erase(&p2m_entry->rbnode_phys, &phys_to_mach);
+                               write_unlock_irqrestore(&p2m_lock, irqflags);
+                               kfree(p2m_entry);
+                               return true;
+                       }
+                       if (pfn < p2m_entry->pfn)
+                               n = n->rb_left;
+                       else
+                               n = n->rb_right;
+               }
+               write_unlock_irqrestore(&p2m_lock, irqflags);
+               return true;
+       }
+
+       p2m_entry = kzalloc(sizeof(struct xen_p2m_entry), GFP_NOWAIT);
+       if (!p2m_entry) {
+               pr_warn("cannot allocate xen_p2m_entry\n");
+               return false;
+       }
+       p2m_entry->pfn = pfn;
+       p2m_entry->nr_pages = nr_pages;
+       p2m_entry->mfn = mfn;
+
+       write_lock_irqsave(&p2m_lock, irqflags);
+       if (((rc = xen_add_phys_to_mach_entry(p2m_entry)) < 0) ||
+               ((rc = xen_add_mach_to_phys_entry(p2m_entry)) < 0)) {
+               write_unlock_irqrestore(&p2m_lock, irqflags);
+               return false;
+       }
+       write_unlock_irqrestore(&p2m_lock, irqflags);
+       return true;
+}
+EXPORT_SYMBOL_GPL(__set_phys_to_machine_multi);
+
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+       return __set_phys_to_machine_multi(pfn, mfn, 1);
+}
+EXPORT_SYMBOL_GPL(__set_phys_to_machine);
+
+int p2m_init(void)
+{
+       rwlock_init(&p2m_lock);
+       return 0;
+}
+arch_initcall(p2m_init);
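
Both rb-trees store range entries rather than single frames, so a hit in __pfn_to_mfn()/__mfn_to_pfn() above is a containment test plus an offset. A stand-alone model of that translation rule, with the rbtree plumbing omitted and stand-in names:

    #include <assert.h>

    struct range { unsigned long pfn, mfn, nr_pages; };

    static unsigned long lookup(const struct range *e, unsigned long pfn)
    {
            /* an entry covers [pfn, pfn + nr_pages); offset is preserved */
            if (e->pfn <= pfn && pfn < e->pfn + e->nr_pages)
                    return e->mfn + (pfn - e->pfn);
            return ~0UL;    /* stand-in for INVALID_P2M_ENTRY */
    }

    int main(void)
    {
            struct range e = { .pfn = 0x1000, .mfn = 0x8000, .nr_pages = 16 };

            assert(lookup(&e, 0x1005) == 0x8005);   /* inside: offset kept */
            assert(lookup(&e, 0x1010) == ~0UL);     /* one past the range */
            return 0;
    }
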
index bb0bf1b..88c8b6c 100644 (file)
@@ -143,7 +143,6 @@ config CPU_BIG_ENDIAN
 
 config SMP
        bool "Symmetric Multi-Processing"
-       select USE_GENERIC_SMP_HELPERS
        help
          This enables support for systems with more than one CPU.  If
          you say N here, the kernel will run on single and
@@ -221,6 +220,7 @@ config XEN_DOM0
 config XEN
        bool "Xen guest support on ARM64 (EXPERIMENTAL)"
        depends on ARM64 && OF
+       select SWIOTLB_XEN
        help
          Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
 
index 8d18100..fd0c0c0 100644 (file)
 
 #include <asm-generic/dma-coherent.h>
 
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+
 #define ARCH_HAS_DMA_GET_REQUIRED_MASK
 
+#define DMA_ERROR_CODE (~(dma_addr_t)0)
 extern struct dma_map_ops *dma_ops;
 
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
        if (unlikely(!dev) || !dev->archdata.dma_ops)
                return dma_ops;
@@ -35,6 +39,14 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
                return dev->archdata.dma_ops;
 }
 
+static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+{
+       if (xen_initial_domain())
+               return xen_dma_ops;
+       else
+               return __generic_dma_ops(dev);
+}
+
 #include <asm-generic/dma-mapping-common.h>
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
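
The effect of the hook above is that the per-device ops table is bypassed wholesale when running as the initial domain, where xen_dma_ops was installed by xen_mm_init() in arch/arm/xen/mm.c. A stand-alone model of the dispatch with stub types:

    #include <stdio.h>

    struct ops { const char *name; };

    static struct ops generic_ops = { "generic" };
    static struct ops xen_ops     = { "swiotlb-xen" };

    static int is_initial_domain;   /* set when booted as dom0 */

    static struct ops *get_ops(struct ops *per_device)
    {
            if (is_initial_domain)
                    return &xen_ops;        /* dom0: always the Xen ops */
            return per_device ? per_device : &generic_ops;
    }

    int main(void)
    {
            printf("%s\n", get_ops(NULL)->name);    /* "generic" */
            is_initial_domain = 1;
            printf("%s\n", get_ops(NULL)->name);    /* "swiotlb-xen" */
            return 0;
    }
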
index b56e5b5..4cc813e 100644 (file)
 #ifdef __KERNEL__
 
 #include <linux/types.h>
+#include <linux/blk_types.h>
 
 #include <asm/byteorder.h>
 #include <asm/barrier.h>
 #include <asm/pgtable.h>
 
+#include <xen/xen.h>
+
 /*
  * Generic IO read/write.  These perform native-endian accesses.
  */
@@ -263,5 +266,12 @@ extern int devmem_is_allowed(unsigned long pfn);
  */
 #define xlate_dev_kmem_ptr(p)  p
 
+struct bio_vec;
+extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+                                     const struct bio_vec *vec2);
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                              \
+       (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&                         \
+        (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_IO_H */
index a5f28e2..c98ef47 100644 (file)
@@ -63,6 +63,7 @@
  * TAC:                Trap ACTLR
  * TSC:                Trap SMC
  * TSW:                Trap cache operations by set/way
+ * TWE:                Trap WFE
  * TWI:                Trap WFI
  * TIDCP:      Trap L2CTLR/L2ECTLR
  * BSU_IS:     Upgrade barriers to the inner shareable domain
@@ -72,8 +73,9 @@
  * FMO:                Override CPSR.F and enable signaling with VF
  * SWIO:       Turn set/way invalidates into set/way clean+invalidate
  */
-#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
-                        HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
+#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
+                        HCR_BSU_IS | HCR_FB | HCR_TAC | \
+                        HCR_AMO | HCR_IMO | HCR_FMO | \
                         HCR_SWIO | HCR_TIDCP | HCR_RW)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
 
 
 #define ESR_EL2_EC_xABT_xFSR_EXTABT    0x10
 
+#define ESR_EL2_EC_WFI_ISS_WFE (1 << 0)
+
 #endif /* __ARM64_KVM_ARM_H__ */
index eec0738..dd8ecfc 100644 (file)
@@ -177,4 +177,65 @@ static inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
        return kvm_vcpu_get_hsr(vcpu) & ESR_EL2_FSC_TYPE;
 }
 
+static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu)
+{
+       return vcpu_sys_reg(vcpu, MPIDR_EL1);
+}
+
+static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
+{
+       if (vcpu_mode_is_32bit(vcpu))
+               *vcpu_cpsr(vcpu) |= COMPAT_PSR_E_BIT;
+       else
+               vcpu_sys_reg(vcpu, SCTLR_EL1) |= (1 << 25);
+}
+
+static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu)
+{
+       if (vcpu_mode_is_32bit(vcpu))
+               return !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_E_BIT);
+
+       return !!(vcpu_sys_reg(vcpu, SCTLR_EL1) & (1 << 25));
+}
+
+static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu,
+                                                   unsigned long data,
+                                                   unsigned int len)
+{
+       if (kvm_vcpu_is_be(vcpu)) {
+               switch (len) {
+               case 1:
+                       return data & 0xff;
+               case 2:
+                       return be16_to_cpu(data & 0xffff);
+               case 4:
+                       return be32_to_cpu(data & 0xffffffff);
+               default:
+                       return be64_to_cpu(data);
+               }
+       }
+
+       return data;            /* Leave LE untouched */
+}
+
+static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
+                                                   unsigned long data,
+                                                   unsigned int len)
+{
+       if (kvm_vcpu_is_be(vcpu)) {
+               switch (len) {
+               case 1:
+                       return data & 0xff;
+               case 2:
+                       return cpu_to_be16(data & 0xffff);
+               case 4:
+                       return cpu_to_be32(data & 0xffffffff);
+               default:
+                       return cpu_to_be64(data);
+               }
+       }
+
+       return data;            /* Leave LE untouched */
+}
+
 #endif /* __ARM64_KVM_EMULATE_H__ */
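
The helpers above byte-swap MMIO data whenever the vcpu is in a big-endian state, and the swap is its own inverse, so the store and load paths mirror each other. A stand-alone illustration, with a hand-rolled 16-bit swap standing in for be16_to_cpu()/cpu_to_be16():

    #include <assert.h>
    #include <stdint.h>

    static uint16_t swap16(uint16_t v)
    {
            return (uint16_t)((v << 8) | (v >> 8));
    }

    int main(void)
    {
            /*
             * On a little-endian host, a BE guest's 2-byte store of 0x1234
             * shows up as 0x3412 in the host register; converting twice
             * (guest->host on store, host->guest on load) is the identity.
             */
            uint16_t guest_val = 0x1234;

            assert(swap16(guest_val) == 0x3412);
            assert(swap16(swap16(guest_val)) == guest_val);
            return 0;
    }
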
index 0859a4d..5d85a02 100644 (file)
 
 #define KVM_VCPU_MAX_FEATURES 2
 
-/* We don't currently support large pages. */
-#define KVM_HPAGE_GFN_SHIFT(x) 0
-#define KVM_NR_PAGE_SIZES      1
-#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
-
 struct kvm_vcpu;
 int kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
@@ -151,6 +146,7 @@ struct kvm_vcpu_stat {
 struct kvm_vcpu_init;
 int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
                        const struct kvm_vcpu_init *init);
+int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
 struct kvm_one_reg;
index efe609c..680f74e 100644 (file)
@@ -91,6 +91,7 @@ int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
 #define        kvm_set_pte(ptep, pte)          set_pte(ptep, pte)
+#define        kvm_set_pmd(pmdp, pmd)          set_pmd(pmdp, pmd)
 
 static inline bool kvm_is_write_fault(unsigned long esr)
 {
@@ -116,13 +117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
        pte_val(*pte) |= PTE_S2_RDWR;
 }
 
+static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+{
+       pmd_val(*pmd) |= PMD_S2_RDWR;
+}
+
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+                                             unsigned long size)
 {
        if (!icache_is_aliasing()) {            /* PIPT */
-               unsigned long hva = gfn_to_hva(kvm, gfn);
-               flush_icache_range(hva, hva + PAGE_SIZE);
+               flush_icache_range(hva, hva + size);
        } else if (!icache_is_aivivt()) {       /* non ASID-tagged VIVT */
                /* any kind of VIPT cache */
                __flush_icache_all();
index f214069..9bea6e7 100644 (file)
@@ -63,9 +63,12 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr)
        struct page *pte;
 
        pte = alloc_pages(PGALLOC_GFP, 0);
-       if (pte)
-               pgtable_page_ctor(pte);
-
+       if (!pte)
+               return NULL;
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
 
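
pgtable_page_ctor() can now fail, so the allocation paths must unwind instead of ignoring its return value; the same correction recurs in the avr32, cris, frv, hexagon, ia64, m32r, m68k and metag pgalloc diffs below. A stand-alone model of the corrected flow, with stub allocator and ctor:

    #include <assert.h>
    #include <stdlib.h>

    struct page { int ctor_done; };

    static struct page *alloc_one(void)  { return calloc(1, sizeof(struct page)); }
    static void free_one(struct page *p) { free(p); }
    static int ctor_may_fail(struct page *p) { p->ctor_done = 1; return 1; }

    static struct page *pte_alloc_one_model(void)
    {
            struct page *pte = alloc_one();

            if (!pte)
                    return NULL;            /* allocation failed */
            if (!ctor_may_fail(pte)) {      /* a failing ctor ... */
                    free_one(pte);          /* ... must undo the alloc */
                    return NULL;
            }
            return pte;
    }

    int main(void)
    {
            struct page *pte = pte_alloc_one_model();

            assert(!pte || pte->ctor_done);
            free_one(pte);
            return 0;
    }
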
index d57e668..755f861 100644 (file)
@@ -85,6 +85,8 @@
 #define PTE_S2_RDONLY          (_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
 #define PTE_S2_RDWR            (_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 
+#define PMD_S2_RDWR            (_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+
 /*
  * Memory Attribute override for Stage-2 (MemAttr[3:0])
  */
diff --git a/arch/arm64/include/asm/xen/page-coherent.h b/arch/arm64/include/asm/xen/page-coherent.h
new file mode 100644 (file)
index 0000000..2820f1a
--- /dev/null
@@ -0,0 +1,47 @@
+#ifndef _ASM_ARM64_XEN_PAGE_COHERENT_H
+#define _ASM_ARM64_XEN_PAGE_COHERENT_H
+
+#include <asm/page.h>
+#include <linux/dma-attrs.h>
+#include <linux/dma-mapping.h>
+
+static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size,
+               dma_addr_t *dma_handle, gfp_t flags,
+               struct dma_attrs *attrs)
+{
+       return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs);
+}
+
+static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle,
+               struct dma_attrs *attrs)
+{
+       __generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs);
+}
+
+static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
+            unsigned long offset, size_t size, enum dma_data_direction dir,
+            struct dma_attrs *attrs)
+{
+       __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs);
+}
+
+static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
+               size_t size, enum dma_data_direction dir,
+               struct dma_attrs *attrs)
+{
+       __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs);
+}
+
+static inline void xen_dma_sync_single_for_cpu(struct device *hwdev,
+               dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+       __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir);
+}
+
+static inline void xen_dma_sync_single_for_device(struct device *hwdev,
+               dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+       __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir);
+}
+#endif /* _ASM_ARM64_XEN_PAGE_COHERENT_H */
index 21e9082..4480ab3 100644 (file)
@@ -21,6 +21,7 @@ config KVM
        select MMU_NOTIFIER
        select PREEMPT_NOTIFIERS
        select ANON_INODES
+       select HAVE_KVM_CPU_RELAX_INTERCEPT
        select KVM_MMIO
        select KVM_ARM_HOST
        select KVM_ARM_VGIC
index 2c3ff67..3f0731e 100644 (file)
@@ -248,6 +248,26 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
        return kvm_reset_vcpu(vcpu);
 }
 
+int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
+{
+       int target = kvm_target_cpu();
+
+       if (target < 0)
+               return -ENODEV;
+
+       memset(init, 0, sizeof(*init));
+
+       /*
+        * For now, we don't return any features.
+        * In future, we might use features to return target
+        * specific features available for the preferred
+        * target type.
+        */
+       init->target = (__u32)target;
+
+       return 0;
+}
+
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        return -EINVAL;
index 9beaca0..8da5606 100644 (file)
@@ -47,21 +47,29 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 }
 
 /**
- * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest
+ * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event
+ *                 instruction executed by a guest
+ *
  * @vcpu:      the vcpu pointer
  *
- * Simply call kvm_vcpu_block(), which will halt execution of
+ * WFE: Yield the CPU and come back to this vcpu when the scheduler
+ * decides to.
+ * WFI: Simply call kvm_vcpu_block(), which will halt execution of
  * world-switches and schedule other host processes until there is an
  * incoming IRQ or FIQ to the VM.
  */
-static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run)
+static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-       kvm_vcpu_block(vcpu);
+       if (kvm_vcpu_get_hsr(vcpu) & ESR_EL2_EC_WFI_ISS_WFE)
+               kvm_vcpu_on_spin(vcpu);
+       else
+               kvm_vcpu_block(vcpu);
+
        return 1;
 }
 
 static exit_handle_fn arm_exit_handlers[] = {
-       [ESR_EL2_EC_WFI]        = kvm_handle_wfi,
+       [ESR_EL2_EC_WFI]        = kvm_handle_wfx,
        [ESR_EL2_EC_CP15_32]    = kvm_handle_cp15_32,
        [ESR_EL2_EC_CP15_64]    = kvm_handle_cp15_64,
        [ESR_EL2_EC_CP14_MR]    = kvm_handle_cp14_access,
index be24040..74a8d87 100644 (file)
@@ -1,2 +1,2 @@
-xen-arm-y      += $(addprefix ../../arm/xen/, enlighten.o grant-table.o)
+xen-arm-y      += $(addprefix ../../arm/xen/, enlighten.o grant-table.o p2m.o mm.o)
 obj-y          := xen-arm.o hypercall.o
index bc7e8ae..1aba19d 100644 (file)
@@ -68,7 +68,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
                return NULL;
 
        page = virt_to_page(pg);
-       pgtable_page_ctor(page);
+       if (!pgtable_page_ctor(page)) {
+               quicklist_free(QUICK_PT, NULL, pg);
+               return NULL;
+       }
 
        return page;
 }
index e887b57..9ceccef 100644 (file)
@@ -34,7 +34,6 @@ config BLACKFIN
        select ARCH_WANT_IPC_PARSE_VERSION
        select GENERIC_ATOMIC64
        select GENERIC_IRQ_PROBE
-       select USE_GENERIC_SMP_HELPERS if SMP
        select HAVE_NMI_WATCHDOG if NMI_WATCHDOG
        select GENERIC_SMP_IDLE_THREAD
        select ARCH_USES_GETTIMEOFFSET if !GENERIC_CLOCKEVENTS
index 6da975d..235ece4 100644 (file)
@@ -32,7 +32,12 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addres
 {
        struct page *pte;
        pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-       pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
 
index f6084bc..41907d2 100644 (file)
@@ -37,11 +37,15 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 #else
        page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
 #endif
-       if (page) {
-               clear_highpage(page);
-               pgtable_page_ctor(page);
-               flush_dcache_page(page);
+       if (!page)
+               return NULL;
+
+       clear_highpage(page);
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
        }
+       flush_dcache_page(page);
        return page;
 }
 
index 99041b0..09df260 100644 (file)
@@ -4,7 +4,6 @@ comment "Linux Kernel Configuration for Hexagon"
 config HEXAGON
        def_bool y
        select HAVE_OPROFILE
-       select USE_GENERIC_SMP_HELPERS if SMP
        # Other pending projects/to-do items.
        # select HAVE_REGS_AND_STACK_ACCESS_API
        # select HAVE_HW_BREAKPOINT if PERF_EVENTS
index 679bf6d..4c9d382 100644 (file)
@@ -65,10 +65,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
        struct page *pte;
 
        pte = alloc_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
-
-       if (pte)
-               pgtable_page_ctor(pte);
-
+       if (!pte)
+               return NULL;
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
 
index 7740ab1..dfe85e9 100644 (file)
@@ -343,7 +343,6 @@ config FORCE_MAX_ZONEORDER
 
 config SMP
        bool "Symmetric multi-processing support"
-       select USE_GENERIC_SMP_HELPERS
        help
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, say N.  If you have a system with more
index 989dd3f..db95f57 100644 (file)
@@ -234,10 +234,6 @@ struct kvm_vm_data {
 #define KVM_REQ_PTC_G          32
 #define KVM_REQ_RESUME         33
 
-#define KVM_HPAGE_GFN_SHIFT(x) 0
-#define KVM_NR_PAGE_SIZES      1
-#define KVM_PAGES_PER_HPAGE(x) 1
-
 struct kvm;
 struct kvm_vcpu;
 
@@ -480,7 +476,7 @@ struct kvm_arch {
 
        struct list_head assigned_dev_head;
        struct iommu_domain *iommu_domain;
-       int iommu_flags;
+       bool iommu_noncoherent;
 
        unsigned long irq_sources_bitmap;
        unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
index 96a8d92..5767cdf 100644 (file)
@@ -91,7 +91,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr)
        if (!pg)
                return NULL;
        page = virt_to_page(pg);
-       pgtable_page_ctor(page);
+       if (!pgtable_page_ctor(page)) {
+               quicklist_free(0, NULL, pg);
+               return NULL;
+       }
        return page;
 }
 
diff --git a/arch/ia64/include/asm/xen/page-coherent.h b/arch/ia64/include/asm/xen/page-coherent.h
new file mode 100644 (file)
index 0000000..96e42f9
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef _ASM_IA64_XEN_PAGE_COHERENT_H
+#define _ASM_IA64_XEN_PAGE_COHERENT_H
+
+#include <asm/page.h>
+#include <linux/dma-attrs.h>
+#include <linux/dma-mapping.h>
+
+static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size,
+               dma_addr_t *dma_handle, gfp_t flags,
+               struct dma_attrs *attrs)
+{
+       void *vstart = (void*)__get_free_pages(flags, get_order(size));
+       *dma_handle = virt_to_phys(vstart);
+       return vstart;
+}
+
+static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle,
+               struct dma_attrs *attrs)
+{
+       free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
+            unsigned long offset, size_t size, enum dma_data_direction dir,
+            struct dma_attrs *attrs) { }
+
+static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
+               size_t size, enum dma_data_direction dir,
+               struct dma_attrs *attrs) { }
+
+static inline void xen_dma_sync_single_for_cpu(struct device *hwdev,
+               dma_addr_t handle, size_t size, enum dma_data_direction dir) { }
+
+static inline void xen_dma_sync_single_for_device(struct device *hwdev,
+               dma_addr_t handle, size_t size, enum dma_data_direction dir) { }
+
+#endif /* _ASM_IA64_XEN_PAGE_COHERENT_H */
index bdfd878..985bf80 100644 (file)
@@ -1550,12 +1550,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+                           unsigned long npages)
 {
        return 0;
 }
index 75661fb..09ef94a 100644 (file)
@@ -275,7 +275,6 @@ source "kernel/Kconfig.preempt"
 
 config SMP
        bool "Symmetric multi-processing support"
-       select USE_GENERIC_SMP_HELPERS
        ---help---
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, like most personal computers, say N. If
index 0fc7361..2d55a06 100644 (file)
@@ -43,7 +43,12 @@ static __inline__ pgtable_t pte_alloc_one(struct mm_struct *mm,
 {
        struct page *pte = alloc_page(GFP_KERNEL|__GFP_ZERO);
 
-       pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
 
index 313f3dd..f9924fb 100644 (file)
@@ -56,6 +56,10 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 
        if (!page)
                return NULL;
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
 
        pte = kmap(page);
        if (pte) {
index 2f02f26..24bcba4 100644 (file)
@@ -29,18 +29,22 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 
 static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-       struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+       struct page *page;
        pte_t *pte;
 
+       page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
        if(!page)
                return NULL;
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
 
        pte = kmap(page);
        __flush_page_to_ram(pte);
        flush_tlb_kernel_page(pte);
        nocache_page(pte);
        kunmap(page);
-       pgtable_page_ctor(page);
        return page;
 }
 
index 48d80d5..f868506 100644 (file)
@@ -59,7 +59,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
                return NULL;
 
        clear_highpage(page);
-       pgtable_page_ctor(page);
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
        return page;
 
 }
index 36368eb..e56abd2 100644 (file)
@@ -111,7 +111,6 @@ config METAG_META21
 config SMP
        bool "Symmetric multi-processing support"
        depends on METAG_META21 && METAG_META21_MMU
-       select USE_GENERIC_SMP_HELPERS
        help
          This enables support for systems with more than one thread running
          Linux. If you have a system with only one thread running Linux,
index 275d928..3104df0 100644 (file)
@@ -52,8 +52,12 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
 {
        struct page *pte;
        pte = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0);
-       if (pte)
-               pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
 
index ebd3579..7fdf7fa 100644 (file)
@@ -122,8 +122,13 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 #endif
 
        ptepage = alloc_pages(flags, 0);
-       if (ptepage)
-               clear_highpage(ptepage);
+       if (!ptepage)
+               return NULL;
+       clear_highpage(ptepage);
+       if (!pgtable_page_ctor(ptepage)) {
+               __free_page(ptepage);
+               return NULL;
+       }
        return ptepage;
 }
 
@@ -158,8 +163,9 @@ extern inline void pte_free_slow(struct page *ptepage)
        __free_page(ptepage);
 }
 
-extern inline void pte_free(struct mm_struct *mm, struct page *ptepage)
+static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
 {
+       pgtable_page_dtor(ptepage);
        __free_page(ptepage);
 }
 
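Editor's note: the second hunk above shows the other half of the contract. A page that went through pgtable_page_ctor() must go through pgtable_page_dtor() before it is freed, otherwise the state the constructor set up (the split PTL in particular) leaks. The teardown paired with the allocation sketch earlier:

    /* Sketch only: the free path matching example_pte_alloc(). */
    static void example_pte_free(struct page *pte)
    {
            pgtable_page_dtor(pte);
            __free_page(pte);
    }
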
index 17cc7ff..867d7db 100644 (file)
@@ -2125,7 +2125,6 @@ source "mm/Kconfig"
 config SMP
        bool "Multi-Processing support"
        depends on SYS_SUPPORTS_SMP
-       select USE_GENERIC_SMP_HELPERS
        help
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, like most personal computers, say N. If
index 4d6fa0b..3296696 100644 (file)
 
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
-/* Don't support huge pages */
-#define KVM_HPAGE_GFN_SHIFT(x) 0
-
-/* We don't currently support large pages. */
-#define KVM_NR_PAGE_SIZES      1
-#define KVM_PAGES_PER_HPAGE(x) 1
-
 
 
 /* Special address that contains the comm page, used for reducing # of traps */
index 881d18b..b336037 100644 (file)
@@ -80,9 +80,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
        struct page *pte;
 
        pte = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER);
-       if (pte) {
-               clear_highpage(pte);
-               pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       clear_highpage(pte);
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
        }
        return pte;
 }
index a7b0445..73b3482 100644 (file)
@@ -198,12 +198,13 @@ kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
        return -ENOIOCTLCMD;
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+                           unsigned long npages)
 {
        return 0;
 }
index 6aaa160..8bde923 100644 (file)
@@ -181,7 +181,6 @@ endmenu
 config SMP
        bool "Symmetric multi-processing support"
        default y
-       select USE_GENERIC_SMP_HELPERS
        depends on MN10300_PROC_MN2WS0038 || MN10300_PROC_MN2WS0050
        ---help---
          This enables support for systems with more than one CPU. If you have
index 146bacf..0f25d5f 100644 (file)
@@ -46,6 +46,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 
 static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
+       pgtable_page_dtor(pte);
        __free_page(pte);
 }
 
index bd9ada6..e77a7c7 100644 (file)
@@ -78,8 +78,13 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 #else
        pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
 #endif
-       if (pte)
-               clear_highpage(pte);
+       if (!pte)
+               return NULL;
+       clear_highpage(pte);
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
 
index 05c39ec..21484e5 100644 (file)
@@ -78,8 +78,13 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 {
        struct page *pte;
        pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
-       if (pte)
-               clear_page(page_address(pte));
+       if (!pte)
+               return NULL;
+       clear_page(page_address(pte));
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
 
@@ -90,6 +95,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 
 static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
+       pgtable_page_dtor(pte);
        __free_page(pte);
 }
 
index 7dcde53..c03567a 100644 (file)
@@ -226,7 +226,6 @@ endchoice
 
 config SMP
        bool "Symmetric multi-processing support"
-       select USE_GENERIC_SMP_HELPERS
        ---help---
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, like most personal computers, say N. If
index fc987a1..f213f5b 100644 (file)
@@ -121,8 +121,12 @@ static inline pgtable_t
 pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
        struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-       if (page)
-               pgtable_page_ctor(page);
+       if (!page)
+               return NULL;
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
        return page;
 }
 
index 2f898d6..4740b0a 100644 (file)
@@ -106,7 +106,6 @@ config PPC
        select HAVE_MEMBLOCK_NODE_MAP
        select HAVE_DMA_ATTRS
        select HAVE_DMA_API_DEBUG
-       select USE_GENERIC_SMP_HELPERS if SMP
        select HAVE_OPROFILE
        select HAVE_DEBUG_KMEMLEAK
        select GENERIC_ATOMIC64 if PPC32
index 9b198d1..856f8de 100644 (file)
@@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst)
        return inst & 0xffff;
 }
 
+static inline unsigned int get_oc(u32 inst)
+{
+       return (inst >> 11) & 0x7fff;
+}
 #endif /* __ASM_PPC_DISASSEMBLE_H__ */
index cca12f0..894662a 100644 (file)
@@ -198,12 +198,27 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
        cmpwi   r10,0;                                                  \
        bne     do_kvm_##n
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+/*
+ * If hv is possible, interrupts come into the hv version
+ * of the kvmppc_interrupt code, which then jumps to the PR handler,
+ * kvmppc_interrupt_pr, if the guest is a PR guest.
+ */
+#define kvmppc_interrupt kvmppc_interrupt_hv
+#else
+#define kvmppc_interrupt kvmppc_interrupt_pr
+#endif
+
 #define __KVM_HANDLER(area, h, n)                                      \
 do_kvm_##n:                                                            \
        BEGIN_FTR_SECTION_NESTED(947)                                   \
        ld      r10,area+EX_CFAR(r13);                                  \
        std     r10,HSTATE_CFAR(r13);                                   \
        END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947);          \
+       BEGIN_FTR_SECTION_NESTED(948)                                   \
+       ld      r10,area+EX_PPR(r13);                                   \
+       std     r10,HSTATE_PPR(r13);                                    \
+       END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);    \
        ld      r10,area+EX_R10(r13);                                   \
        stw     r9,HSTATE_SCRATCH1(r13);                                \
        ld      r9,area+EX_R9(r13);                                     \
@@ -217,6 +232,10 @@ do_kvm_##n:                                                                \
        ld      r10,area+EX_R10(r13);                                   \
        beq     89f;                                                    \
        stw     r9,HSTATE_SCRATCH1(r13);                        \
+       BEGIN_FTR_SECTION_NESTED(948)                                   \
+       ld      r9,area+EX_PPR(r13);                                    \
+       std     r9,HSTATE_PPR(r13);                                     \
+       END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);    \
        ld      r9,area+EX_R9(r13);                                     \
        std     r12,HSTATE_SCRATCH0(r13);                       \
        li      r12,n;                                                  \
@@ -236,7 +255,7 @@ do_kvm_##n:                                                         \
 #define KVM_HANDLER_SKIP(area, h, n)
 #endif
 
-#ifdef CONFIG_KVM_BOOK3S_PR
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 #define KVMTEST_PR(n)                  __KVMTEST(n)
 #define KVM_HANDLER_PR(area, h, n)     __KVM_HANDLER(area, h, n)
 #define KVM_HANDLER_PR_SKIP(area, h, n)        __KVM_HANDLER_SKIP(area, h, n)
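
Editor's note: the kvmppc_interrupt aliasing above lets a single low-level vector serve both flavours of Book3S KVM. When HV support is built in, the vector lands in the HV entry code, which forwards to the PR handler for PR guests. A conceptual C rendering (the real dispatch is assembly; guest_mode and the handler names here are illustrative):

    void kvmppc_interrupt_hv_sketch(void)
    {
            if (guest_mode == KVM_GUEST_MODE_GUEST_HV)
                    handle_hv_guest_exit();         /* an HV guest exited */
            else
                    kvmppc_interrupt_pr_sketch();   /* PR guest on an HV-capable kernel */
    }
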
index 851bac7..1bd92fd 100644 (file)
 #define BOOK3S_HFLAG_SLB                       0x2
 #define BOOK3S_HFLAG_PAIRED_SINGLE             0x4
 #define BOOK3S_HFLAG_NATIVE_PS                 0x8
+#define BOOK3S_HFLAG_MULTI_PGSIZE              0x10
+#define BOOK3S_HFLAG_NEW_TLBIE                 0x20
 
 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
 #define KVM_GUEST_MODE_NONE    0
 #define KVM_GUEST_MODE_GUEST   1
 #define KVM_GUEST_MODE_SKIP    2
+#define KVM_GUEST_MODE_GUEST_HV        3
+#define KVM_GUEST_MODE_HOST_HV 4
 
 #define KVM_INST_FETCH_FAILED  -1
 
index fa19e2f..4a594b7 100644 (file)
@@ -58,16 +58,18 @@ struct hpte_cache {
        struct hlist_node list_pte_long;
        struct hlist_node list_vpte;
        struct hlist_node list_vpte_long;
+#ifdef CONFIG_PPC_BOOK3S_64
+       struct hlist_node list_vpte_64k;
+#endif
        struct rcu_head rcu_head;
        u64 host_vpn;
        u64 pfn;
        ulong slot;
        struct kvmppc_pte pte;
+       int pagesize;
 };
 
 struct kvmppc_vcpu_book3s {
-       struct kvm_vcpu vcpu;
-       struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
        struct kvmppc_sid_map sid_map[SID_MAP_NUM];
        struct {
                u64 esid;
@@ -99,6 +101,9 @@ struct kvmppc_vcpu_book3s {
        struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
        struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
        struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+#ifdef CONFIG_PPC_BOOK3S_64
+       struct hlist_head hpte_hash_vpte_64k[HPTEG_HASH_NUM_VPTE_64K];
+#endif
        int hpte_cache_count;
        spinlock_t mmu_lock;
 };
@@ -107,8 +112,9 @@ struct kvmppc_vcpu_book3s {
 #define CONTEXT_GUEST          1
 #define CONTEXT_GUEST_END      2
 
-#define VSID_REAL      0x0fffffffffc00000ULL
-#define VSID_BAT       0x0fffffffffb00000ULL
+#define VSID_REAL      0x07ffffffffc00000ULL
+#define VSID_BAT       0x07ffffffffb00000ULL
+#define VSID_64K       0x0800000000000000ULL
 #define VSID_1T                0x1000000000000000ULL
 #define VSID_REAL_DR   0x2000000000000000ULL
 #define VSID_REAL_IR   0x4000000000000000ULL
@@ -118,11 +124,12 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask)
 extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
 extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
 extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
-extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
-extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
+extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte,
+                              bool iswrite);
+extern void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong seg_size);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -134,6 +141,7 @@ extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
 
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte);
 extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
@@ -151,7 +159,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
                           bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
-extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing,
+                       bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
                        unsigned long *rmap, long pte_index, int realmode);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
@@ -172,6 +181,8 @@ extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
                        unsigned long *hpret);
 extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
                        struct kvm_memory_slot *memslot, unsigned long *map);
+extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr,
+                       unsigned long mask);
 
 extern void kvmppc_entry_trampoline(void);
 extern void kvmppc_hv_entry_trampoline(void);
@@ -184,11 +195,9 @@ extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd);
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 {
-       return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
+       return vcpu->arch.book3s;
 }
 
-extern void kvm_return_point(void);
-
 /* Also add subarch specific defines */
 
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
@@ -198,203 +207,6 @@ extern void kvm_return_point(void);
 #include <asm/kvm_book3s_64.h>
 #endif
 
-#ifdef CONFIG_KVM_BOOK3S_PR
-
-static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
-{
-       return to_book3s(vcpu)->hior;
-}
-
-static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
-                       unsigned long pending_now, unsigned long old_pending)
-{
-       if (pending_now)
-               vcpu->arch.shared->int_pending = 1;
-       else if (old_pending)
-               vcpu->arch.shared->int_pending = 0;
-}
-
-static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
-{
-       if ( num < 14 ) {
-               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-               svcpu->gpr[num] = val;
-               svcpu_put(svcpu);
-               to_book3s(vcpu)->shadow_vcpu->gpr[num] = val;
-       } else
-               vcpu->arch.gpr[num] = val;
-}
-
-static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
-{
-       if ( num < 14 ) {
-               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-               ulong r = svcpu->gpr[num];
-               svcpu_put(svcpu);
-               return r;
-       } else
-               return vcpu->arch.gpr[num];
-}
-
-static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       svcpu->cr = val;
-       svcpu_put(svcpu);
-       to_book3s(vcpu)->shadow_vcpu->cr = val;
-}
-
-static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       u32 r;
-       r = svcpu->cr;
-       svcpu_put(svcpu);
-       return r;
-}
-
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       svcpu->xer = val;
-       to_book3s(vcpu)->shadow_vcpu->xer = val;
-       svcpu_put(svcpu);
-}
-
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       u32 r;
-       r = svcpu->xer;
-       svcpu_put(svcpu);
-       return r;
-}
-
-static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       svcpu->ctr = val;
-       svcpu_put(svcpu);
-}
-
-static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       ulong r;
-       r = svcpu->ctr;
-       svcpu_put(svcpu);
-       return r;
-}
-
-static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       svcpu->lr = val;
-       svcpu_put(svcpu);
-}
-
-static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       ulong r;
-       r = svcpu->lr;
-       svcpu_put(svcpu);
-       return r;
-}
-
-static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       svcpu->pc = val;
-       svcpu_put(svcpu);
-}
-
-static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       ulong r;
-       r = svcpu->pc;
-       svcpu_put(svcpu);
-       return r;
-}
-
-static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
-{
-       ulong pc = kvmppc_get_pc(vcpu);
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       u32 r;
-
-       /* Load the instruction manually if it failed to do so in the
-        * exit path */
-       if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
-               kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
-
-       r = svcpu->last_inst;
-       svcpu_put(svcpu);
-       return r;
-}
-
-/*
- * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
- * Because the sc instruction sets SRR0 to point to the following
- * instruction, we have to fetch from pc - 4.
- */
-static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
-{
-       ulong pc = kvmppc_get_pc(vcpu) - 4;
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       u32 r;
-
-       /* Load the instruction manually if it failed to do so in the
-        * exit path */
-       if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
-               kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
-
-       r = svcpu->last_inst;
-       svcpu_put(svcpu);
-       return r;
-}
-
-static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
-{
-       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-       ulong r;
-       r = svcpu->fault_dar;
-       svcpu_put(svcpu);
-       return r;
-}
-
-static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
-{
-       ulong crit_raw = vcpu->arch.shared->critical;
-       ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
-       bool crit;
-
-       /* Truncate crit indicators in 32 bit mode */
-       if (!(vcpu->arch.shared->msr & MSR_SF)) {
-               crit_raw &= 0xffffffff;
-               crit_r1 &= 0xffffffff;
-       }
-
-       /* Critical section when crit == r1 */
-       crit = (crit_raw == crit_r1);
-       /* ... and we're in supervisor mode */
-       crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
-
-       return crit;
-}
-#else /* CONFIG_KVM_BOOK3S_PR */
-
-static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
-{
-       return 0;
-}
-
-static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
-                       unsigned long pending_now, unsigned long old_pending)
-{
-}
-
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
        vcpu->arch.gpr[num] = val;
@@ -489,12 +301,6 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
        return vcpu->arch.fault_dar;
 }
 
-static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
-{
-       return false;
-}
-#endif
-
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
index ce0ef6c..c720e0b 100644 (file)
@@ -22,7 +22,7 @@
 
 static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
-       return to_book3s(vcpu)->shadow_vcpu;
+       return vcpu->arch.shadow_vcpu;
 }
 
 static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
index 86d638a..bf0fa8b 100644 (file)
@@ -20,7 +20,7 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
-#ifdef CONFIG_KVM_BOOK3S_PR
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
        preempt_disable();
@@ -35,7 +35,7 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 
 #define SPAPR_TCE_SHIFT                12
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #define KVM_DEFAULT_HPT_ORDER  24      /* 16MB HPT by default */
 extern unsigned long kvm_rma_pages;
 #endif
@@ -278,7 +278,7 @@ static inline int is_vrma_hpte(unsigned long hpte_v)
                (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
 }
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 /*
  * Note modification of an HPTE; set the HPTE modified bit
  * if anyone is interested.
@@ -289,6 +289,6 @@ static inline void note_hpte_modification(struct kvm *kvm,
        if (atomic_read(&kvm->arch.hpte_mod_interest))
                rev->guest_rpte |= HPTE_GR_MODIFIED;
 }
-#endif /* CONFIG_KVM_BOOK3S_64_HV */
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
index 9039d3c..0bd9348 100644 (file)
@@ -83,7 +83,7 @@ struct kvmppc_host_state {
        u8 restore_hid5;
        u8 napping;
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        u8 hwthread_req;
        u8 hwthread_state;
        u8 host_ipi;
@@ -101,6 +101,7 @@ struct kvmppc_host_state {
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
        u64 cfar;
+       u64 ppr;
 #endif
 };
 
@@ -108,14 +109,14 @@ struct kvmppc_book3s_shadow_vcpu {
        ulong gpr[14];
        u32 cr;
        u32 xer;
-
-       u32 fault_dsisr;
-       u32 last_inst;
        ulong ctr;
        ulong lr;
        ulong pc;
+
        ulong shadow_srr1;
        ulong fault_dar;
+       u32 fault_dsisr;
+       u32 last_inst;
 
 #ifdef CONFIG_PPC_BOOK3S_32
        u32     sr[16];                 /* Guest SRs */
index d3c1eb3..dd8f615 100644 (file)
 /* LPIDs we support with this build -- runtime limit may be lower */
 #define KVMPPC_NR_LPIDS                        64
 
-#define KVMPPC_INST_EHPRIV     0x7c00021c
+#define KVMPPC_INST_EHPRIV             0x7c00021c
+#define EHPRIV_OC_SHIFT                        11
+/* "ehpriv 1" : ehpriv with OC = 1 is used for debug emulation */
+#define EHPRIV_OC_DEBUG                        1
+#define KVMPPC_INST_EHPRIV_DEBUG       (KVMPPC_INST_EHPRIV | \
+                                        (EHPRIV_OC_DEBUG << EHPRIV_OC_SHIFT))
 
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
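
Editor's note: together with the get_oc() helper added to disassemble.h earlier in this section, these defines let the exit path tell a plain ehpriv apart from its debug variant. A hedged sketch of the classification ('inst' and 'run' are placeholders for the trapped opcode and the exit descriptor):

    u32 inst = KVMPPC_INST_EHPRIV_DEBUG;        /* example: the debug variant */

    if (get_oc(inst) == EHPRIV_OC_DEBUG)
            run->exit_reason = KVM_EXIT_DEBUG;  /* "ehpriv 1" requests a debug exit */
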
index 3328353..237d1d2 100644 (file)
@@ -63,20 +63,17 @@ extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 
 #endif
 
-/* We don't currently support large pages. */
-#define KVM_HPAGE_GFN_SHIFT(x) 0
-#define KVM_NR_PAGE_SIZES      1
-#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
-
 #define HPTEG_CACHE_NUM                        (1 << 15)
 #define HPTEG_HASH_BITS_PTE            13
 #define HPTEG_HASH_BITS_PTE_LONG       12
 #define HPTEG_HASH_BITS_VPTE           13
 #define HPTEG_HASH_BITS_VPTE_LONG      5
+#define HPTEG_HASH_BITS_VPTE_64K       11
 #define HPTEG_HASH_NUM_PTE             (1 << HPTEG_HASH_BITS_PTE)
 #define HPTEG_HASH_NUM_PTE_LONG                (1 << HPTEG_HASH_BITS_PTE_LONG)
 #define HPTEG_HASH_NUM_VPTE            (1 << HPTEG_HASH_BITS_VPTE)
 #define HPTEG_HASH_NUM_VPTE_LONG       (1 << HPTEG_HASH_BITS_VPTE_LONG)
+#define HPTEG_HASH_NUM_VPTE_64K                (1 << HPTEG_HASH_BITS_VPTE_64K)
 
 /* Physical Address Mask - allowed range of real mode RAM access */
 #define KVM_PAM                        0x0fffffffffffffffULL
@@ -89,6 +86,9 @@ struct lppaca;
 struct slb_shadow;
 struct dtl_entry;
 
+struct kvmppc_vcpu_book3s;
+struct kvmppc_book3s_shadow_vcpu;
+
 struct kvm_vm_stat {
        u32 remote_tlb_flush;
 };
@@ -224,15 +224,15 @@ struct revmap_entry {
 #define KVMPPC_GOT_PAGE                0x80
 
 struct kvm_arch_memory_slot {
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        unsigned long *rmap;
        unsigned long *slot_phys;
-#endif /* CONFIG_KVM_BOOK3S_64_HV */
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 };
 
 struct kvm_arch {
        unsigned int lpid;
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        unsigned long hpt_virt;
        struct revmap_entry *revmap;
        unsigned int host_lpid;
@@ -256,7 +256,10 @@ struct kvm_arch {
        cpumask_t need_tlb_flush;
        struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
        int hpt_cma_alloc;
-#endif /* CONFIG_KVM_BOOK3S_64_HV */
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+       struct mutex hpt_mutex;
+#endif
 #ifdef CONFIG_PPC_BOOK3S_64
        struct list_head spapr_tce_tables;
        struct list_head rtas_tokens;
@@ -267,6 +270,7 @@ struct kvm_arch {
 #ifdef CONFIG_KVM_XICS
        struct kvmppc_xics *xics;
 #endif
+       struct kvmppc_ops *kvm_ops;
 };
 
 /*
@@ -294,6 +298,10 @@ struct kvmppc_vcore {
        u64 stolen_tb;
        u64 preempt_tb;
        struct kvm_vcpu *runner;
+       u64 tb_offset;          /* guest timebase - host timebase */
+       ulong lpcr;
+       u32 arch_compat;
+       ulong pcr;
 };
 
 #define VCORE_ENTRY_COUNT(vc)  ((vc)->entry_exit_count & 0xff)
@@ -328,6 +336,7 @@ struct kvmppc_pte {
        bool may_read           : 1;
        bool may_write          : 1;
        bool may_execute        : 1;
+       u8 page_size;           /* MMU_PAGE_xxx */
 };
 
 struct kvmppc_mmu {
@@ -340,7 +349,8 @@ struct kvmppc_mmu {
        /* book3s */
        void (*mtsrin)(struct kvm_vcpu *vcpu, u32 srnum, ulong value);
        u32  (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum);
-       int  (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data);
+       int  (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr,
+                     struct kvmppc_pte *pte, bool data, bool iswrite);
        void (*reset_msr)(struct kvm_vcpu *vcpu);
        void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large);
        int  (*esid_to_vsid)(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid);
@@ -360,6 +370,7 @@ struct kvmppc_slb {
        bool large      : 1;    /* PTEs are 16MB */
        bool tb         : 1;    /* 1TB segment */
        bool class      : 1;
+       u8 base_page_size;      /* MMU_PAGE_xxx */
 };
 
 # ifdef CONFIG_PPC_FSL_BOOK3E
@@ -377,17 +388,6 @@ struct kvmppc_slb {
 #define KVMPPC_EPR_USER                1 /* exit to userspace to fill EPR */
 #define KVMPPC_EPR_KERNEL      2 /* in-kernel irqchip */
 
-struct kvmppc_booke_debug_reg {
-       u32 dbcr0;
-       u32 dbcr1;
-       u32 dbcr2;
-#ifdef CONFIG_KVM_E500MC
-       u32 dbcr4;
-#endif
-       u64 iac[KVMPPC_BOOKE_MAX_IAC];
-       u64 dac[KVMPPC_BOOKE_MAX_DAC];
-};
-
 #define KVMPPC_IRQ_DEFAULT     0
 #define KVMPPC_IRQ_MPIC                1
 #define KVMPPC_IRQ_XICS                2
@@ -402,6 +402,10 @@ struct kvm_vcpu_arch {
        int slb_max;            /* 1 + index of last valid entry in slb[] */
        int slb_nr;             /* total number of entries in SLB */
        struct kvmppc_mmu mmu;
+       struct kvmppc_vcpu_book3s *book3s;
+#endif
+#ifdef CONFIG_PPC_BOOK3S_32
+       struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
 #endif
 
        ulong gpr[32];
@@ -463,6 +467,8 @@ struct kvm_vcpu_arch {
        u32 ctrl;
        ulong dabr;
        ulong cfar;
+       ulong ppr;
+       ulong shadow_srr1;
 #endif
        u32 vrsave; /* also USPRG0 */
        u32 mmucr;
@@ -498,6 +504,8 @@ struct kvm_vcpu_arch {
 
        u64 mmcr[3];
        u32 pmc[8];
+       u64 siar;
+       u64 sdar;
 
 #ifdef CONFIG_KVM_EXIT_TIMING
        struct mutex exit_timing_lock;
@@ -531,7 +539,10 @@ struct kvm_vcpu_arch {
        u32 eptcfg;
        u32 epr;
        u32 crit_save;
-       struct kvmppc_booke_debug_reg dbg_reg;
+       /* guest debug registers */
+       struct debug_reg dbg_reg;
+       /* hardware visible debug registers when in guest state */
+       struct debug_reg shadow_dbg_reg;
 #endif
        gpa_t paddr_accessed;
        gva_t vaddr_accessed;
@@ -582,7 +593,7 @@ struct kvm_vcpu_arch {
        struct kvmppc_icp *icp; /* XICS presentation controller */
 #endif
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        struct kvm_vcpu_arch_shared shregs;
 
        unsigned long pgfault_addr;
index b15554a..c8317fb 100644 (file)
@@ -106,13 +106,6 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
-
-extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                                  unsigned int op, int *advance);
-extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn,
-                                    ulong val);
-extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn,
-                                    ulong *val);
 extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_booke_init(void);
@@ -135,17 +128,17 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                             unsigned long ioba, unsigned long tce);
-extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
-                               struct kvm_allocate_rma *rma);
 extern struct kvm_rma_info *kvm_alloc_rma(void);
 extern void kvm_release_rma(struct kvm_rma_info *ri);
 extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
 extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
-extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+extern void kvmppc_core_free_memslot(struct kvm *kvm,
+                                    struct kvm_memory_slot *free,
                                     struct kvm_memory_slot *dont);
-extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+extern int kvmppc_core_create_memslot(struct kvm *kvm,
+                                     struct kvm_memory_slot *slot,
                                      unsigned long npages);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
@@ -177,6 +170,72 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
 extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
 extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
 
+union kvmppc_one_reg {
+       u32     wval;
+       u64     dval;
+       vector128 vval;
+       u64     vsxval[2];
+       struct {
+               u64     addr;
+               u64     length;
+       }       vpaval;
+};
+
+struct kvmppc_ops {
+       struct module *owner;
+       int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+       int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+       int (*get_one_reg)(struct kvm_vcpu *vcpu, u64 id,
+                          union kvmppc_one_reg *val);
+       int (*set_one_reg)(struct kvm_vcpu *vcpu, u64 id,
+                          union kvmppc_one_reg *val);
+       void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+       void (*vcpu_put)(struct kvm_vcpu *vcpu);
+       void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr);
+       int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+       struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id);
+       void (*vcpu_free)(struct kvm_vcpu *vcpu);
+       int (*check_requests)(struct kvm_vcpu *vcpu);
+       int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log);
+       void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot);
+       int (*prepare_memory_region)(struct kvm *kvm,
+                                    struct kvm_memory_slot *memslot,
+                                    struct kvm_userspace_memory_region *mem);
+       void (*commit_memory_region)(struct kvm *kvm,
+                                    struct kvm_userspace_memory_region *mem,
+                                    const struct kvm_memory_slot *old);
+       int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
+       int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
+                          unsigned long end);
+       int (*age_hva)(struct kvm *kvm, unsigned long hva);
+       int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
+       void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
+       void (*mmu_destroy)(struct kvm_vcpu *vcpu);
+       void (*free_memslot)(struct kvm_memory_slot *free,
+                            struct kvm_memory_slot *dont);
+       int (*create_memslot)(struct kvm_memory_slot *slot,
+                             unsigned long npages);
+       int (*init_vm)(struct kvm *kvm);
+       void (*destroy_vm)(struct kvm *kvm);
+       int (*get_smmu_info)(struct kvm *kvm, struct kvm_ppc_smmu_info *info);
+       int (*emulate_op)(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                         unsigned int inst, int *advance);
+       int (*emulate_mtspr)(struct kvm_vcpu *vcpu, int sprn, ulong spr_val);
+       int (*emulate_mfspr)(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val);
+       void (*fast_vcpu_kick)(struct kvm_vcpu *vcpu);
+       long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl,
+                             unsigned long arg);
+
+};
+
+extern struct kvmppc_ops *kvmppc_hv_ops;
+extern struct kvmppc_ops *kvmppc_pr_ops;
+
+static inline bool is_kvmppc_hv_enabled(struct kvm *kvm)
+{
+       return kvm->arch.kvm_ops == kvmppc_hv_ops;
+}
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
@@ -210,17 +269,6 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
        return r;
 }
 
-union kvmppc_one_reg {
-       u32     wval;
-       u64     dval;
-       vector128 vval;
-       u64     vsxval[2];
-       struct {
-               u64     addr;
-               u64     length;
-       }       vpaval;
-};
-
 #define one_reg_size(id)       \
        (1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
 
@@ -245,10 +293,10 @@ union kvmppc_one_reg {
        __v;                                    \
 })
 
-void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
-void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
@@ -260,7 +308,7 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
 struct openpic;
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 extern void kvm_cma_reserve(void) __init;
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
@@ -269,10 +317,10 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 
 static inline u32 kvmppc_get_xics_latch(void)
 {
-       u32 xirr = get_paca()->kvm_hstate.saved_xirr;
+       u32 xirr;
 
+       xirr = get_paca()->kvm_hstate.saved_xirr;
        get_paca()->kvm_hstate.saved_xirr = 0;
-
        return xirr;
 }
 
@@ -281,7 +329,10 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
        paca[cpu].kvm_hstate.host_ipi = host_ipi;
 }
 
-extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
+static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+       vcpu->kvm->arch.kvm_ops->fast_vcpu_kick(vcpu);
+}
 
 #else
 static inline void __init kvm_cma_reserve(void)
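
Editor's note: the kvmppc_ops table above is the centrepiece of this series. HV and PR KVM become separately selectable backends, and each VM carries a pointer to the ops it was created with, so common code dispatches per VM rather than per build; is_kvmppc_hv_enabled() is then just a pointer comparison, and helpers like kvmppc_fast_vcpu_kick() become one-line indirections. A minimal sketch of that dispatch style (the function name is illustrative, not part of the API):

    static int example_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
    {
            /* per-VM backend, chosen at VM creation time */
            return vcpu->kvm->arch.kvm_ops->vcpu_run(run, vcpu);
    }
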
index a5954ce..b6ea9e0 100644 (file)
@@ -166,7 +166,7 @@ struct paca_struct {
        struct dtl_entry *dtl_curr;     /* pointer corresponding to dtl_ridx */
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
-#ifdef CONFIG_KVM_BOOK3S_PR
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
        /* We use this to store guest state in */
        struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
 #endif
index f65e27b..16cb92d 100644 (file)
@@ -91,7 +91,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
        if (!pte)
                return NULL;
        page = virt_to_page(pte);
-       pgtable_page_ctor(page);
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
        return page;
 }
 
index 7794b2b..fc14a38 100644 (file)
@@ -208,6 +208,7 @@ struct debug_reg {
 
 struct thread_struct {
        unsigned long   ksp;            /* Kernel stack pointer */
+
 #ifdef CONFIG_PPC64
        unsigned long   ksp_vsid;
 #endif
@@ -221,6 +222,7 @@ struct thread_struct {
        void            *pgdir;         /* root of page-table tree */
        unsigned long   ksp_limit;      /* if ksp <= ksp_limit stack overflow */
 #endif
+       /* Debug Registers */
        struct debug_reg debug;
        struct thread_fp_state  fp_state;
        struct thread_fp_state  *fp_save_area;
index 0156702..576ad88 100644 (file)
@@ -40,7 +40,7 @@
 #define _PAGE_U1       0x010000
 #define _PAGE_U0       0x020000
 #define _PAGE_ACCESSED 0x040000
-#define _PAGE_LENDIAN  0x080000
+#define _PAGE_ENDIAN   0x080000
 #define _PAGE_GUARDED  0x100000
 #define _PAGE_COHERENT 0x200000 /* M: enforce memory coherence */
 #define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */
index 126f6e9..5c45787 100644 (file)
 #define SPRN_TBRU      0x10D   /* Time Base Read Upper Register (user, R/O) */
 #define SPRN_TBWL      0x11C   /* Time Base Lower Register (super, R/W) */
 #define SPRN_TBWU      0x11D   /* Time Base Upper Register (super, R/W) */
+#define SPRN_TBU40     0x11E   /* Timebase upper 40 bits (hyper, R/W) */
 #define SPRN_SPURR     0x134   /* Scaled PURR */
 #define SPRN_HSPRG0    0x130   /* Hypervisor Scratch 0 */
 #define SPRN_HSPRG1    0x131   /* Hypervisor Scratch 1 */
 #define   LPCR_ISL     (1ul << (63-2))
 #define   LPCR_VC_SH   (63-2)
 #define   LPCR_DPFD_SH (63-11)
+#define   LPCR_DPFD    (7ul << LPCR_DPFD_SH)
 #define   LPCR_VRMASD  (0x1ful << (63-16))
 #define   LPCR_VRMA_L  (1ul << (63-12))
 #define   LPCR_VRMA_LP0        (1ul << (63-15))
 #define     LPCR_PECE2 0x00001000      /* machine check etc can cause exit */
 #define   LPCR_MER     0x00000800      /* Mediated External Exception */
 #define   LPCR_MER_SH  11
+#define   LPCR_TC      0x00000200      /* Translation control */
 #define   LPCR_LPES    0x0000000c
 #define   LPCR_LPES0   0x00000008      /* LPAR Env selector 0 */
 #define   LPCR_LPES1   0x00000004      /* LPAR Env selector 1 */
 #define   LPID_RSVD    0x3ff           /* Reserved LPID for partn switching */
 #define        SPRN_HMER       0x150   /* Hardware m? error recovery */
 #define        SPRN_HMEER      0x151   /* Hardware m? enable error recovery */
+#define SPRN_PCR       0x152   /* Processor compatibility register */
+#define   PCR_VEC_DIS  (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */
+#define   PCR_VSX_DIS  (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */
+#define   PCR_ARCH_205 0x2             /* Architecture 2.05 */
 #define        SPRN_HEIR       0x153   /* Hypervisor Emulated Instruction Register */
 #define SPRN_TLBINDEXR 0x154   /* P7 TLB control register */
 #define SPRN_TLBVPNR   0x155   /* P7 TLB control register */
 #define         HID4_RMLS2_SH   (63 - 2)       /* Real mode limit bottom 2 bits */
 #define         HID4_LPID5_SH   (63 - 6)       /* partition ID bottom 4 bits */
 #define         HID4_RMOR_SH    (63 - 22)      /* real mode offset (16 bits) */
+#define  HID4_RMOR      (0xFFFFul << HID4_RMOR_SH)
 #define  HID4_LPES1     (1 << (63-57)) /* LPAR env. sel. bit 1 */
 #define  HID4_RMLS0_SH  (63 - 58)      /* Real mode limit top bit */
 #define         HID4_LPID1_SH   0              /* partition ID top 2 bits */
 #define PVR_BE         0x0070
 #define PVR_PA6T       0x0090
 
+/* "Logical" PVR values defined in PAPR, representing architecture levels */
+#define PVR_ARCH_204   0x0f000001
+#define PVR_ARCH_205   0x0f000002
+#define PVR_ARCH_206   0x0f000003
+#define PVR_ARCH_206p  0x0f100003
+#define PVR_ARCH_207   0x0f000004
+
 /* Macros for setting and retrieving special purpose registers */
 #ifndef __ASSEMBLY__
 #define mfmsr()                ({unsigned long rval; \
index 0fb1a6e..6836ec7 100644 (file)
@@ -27,6 +27,7 @@
 #define __KVM_HAVE_PPC_SMT
 #define __KVM_HAVE_IRQCHIP
 #define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_GUEST_DEBUG
 
 struct kvm_regs {
        __u64 pc;
@@ -269,7 +270,24 @@ struct kvm_fpu {
        __u64 fpr[32];
 };
 
+/*
+ * Defines for h/w breakpoint, watchpoint (read, write or both) and
+ * software breakpoint.
+ * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status"
+ * for KVM_DEBUG_EXIT.
+ */
+#define KVMPPC_DEBUG_NONE              0x0
+#define KVMPPC_DEBUG_BREAKPOINT                (1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE       (1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ                (1UL << 3)
 struct kvm_debug_exit_arch {
+       __u64 address;
+       /*
+        * exiting to userspace because of h/w breakpoint, watchpoint
+        * (read, write or both) and software breakpoint.
+        */
+       __u32 status;
+       __u32 reserved;
 };
 
 /* for KVM_SET_GUEST_DEBUG */
@@ -281,10 +299,6 @@ struct kvm_guest_debug_arch {
                 * Type denotes h/w breakpoint, read watchpoint, write
                 * watchpoint or watchpoint (both read and write).
                 */
-#define KVMPPC_DEBUG_NONE              0x0
-#define KVMPPC_DEBUG_BREAKPOINT                (1UL << 1)
-#define KVMPPC_DEBUG_WATCH_WRITE       (1UL << 2)
-#define KVMPPC_DEBUG_WATCH_READ                (1UL << 3)
                __u32 type;
                __u32 reserved;
        } bp[16];
@@ -429,6 +443,11 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_MMCR0      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
 #define KVM_REG_PPC_MMCR1      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
 #define KVM_REG_PPC_MMCRA      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
+#define KVM_REG_PPC_MMCR2      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
+#define KVM_REG_PPC_MMCRS      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x14)
+#define KVM_REG_PPC_SIAR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x15)
+#define KVM_REG_PPC_SDAR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x16)
+#define KVM_REG_PPC_SIER       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x17)
 
 #define KVM_REG_PPC_PMC1       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18)
 #define KVM_REG_PPC_PMC2       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19)
@@ -499,6 +518,65 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_TLB3PS     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a)
 #define KVM_REG_PPC_EPTCFG     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b)
 
+/* Timebase offset */
+#define KVM_REG_PPC_TB_OFFSET  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9c)
+
+/* POWER8 registers */
+#define KVM_REG_PPC_SPMC1      (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9d)
+#define KVM_REG_PPC_SPMC2      (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9e)
+#define KVM_REG_PPC_IAMR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9f)
+#define KVM_REG_PPC_TFHAR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa0)
+#define KVM_REG_PPC_TFIAR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa1)
+#define KVM_REG_PPC_TEXASR     (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa2)
+#define KVM_REG_PPC_FSCR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa3)
+#define KVM_REG_PPC_PSPB       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xa4)
+#define KVM_REG_PPC_EBBHR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa5)
+#define KVM_REG_PPC_EBBRR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa6)
+#define KVM_REG_PPC_BESCR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7)
+#define KVM_REG_PPC_TAR                (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8)
+#define KVM_REG_PPC_DPDES      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9)
+#define KVM_REG_PPC_DAWR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
+#define KVM_REG_PPC_DAWRX      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
+#define KVM_REG_PPC_CIABR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac)
+#define KVM_REG_PPC_IC         (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad)
+#define KVM_REG_PPC_VTB                (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae)
+#define KVM_REG_PPC_CSIGR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaf)
+#define KVM_REG_PPC_TACR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb0)
+#define KVM_REG_PPC_TCSCR      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb1)
+#define KVM_REG_PPC_PID                (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb2)
+#define KVM_REG_PPC_ACOP       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb3)
+
+#define KVM_REG_PPC_VRSAVE     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb4)
+#define KVM_REG_PPC_LPCR       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb5)
+#define KVM_REG_PPC_PPR                (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb6)
+
+/* Architecture compatibility level */
+#define KVM_REG_PPC_ARCH_COMPAT        (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb7)
+
+/* Transactional Memory checkpointed state:
+ * This is all GPRs, all VSX regs and a subset of SPRs
+ */
+#define KVM_REG_PPC_TM         (KVM_REG_PPC | 0x80000000)
+/* TM GPRs */
+#define KVM_REG_PPC_TM_GPR0    (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0)
+#define KVM_REG_PPC_TM_GPR(n)  (KVM_REG_PPC_TM_GPR0 + (n))
+#define KVM_REG_PPC_TM_GPR31   (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x1f)
+/* TM VSX */
+#define KVM_REG_PPC_TM_VSR0    (KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x20)
+#define KVM_REG_PPC_TM_VSR(n)  (KVM_REG_PPC_TM_VSR0 + (n))
+#define KVM_REG_PPC_TM_VSR63   (KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x5f)
+/* TM SPRS */
+#define KVM_REG_PPC_TM_CR      (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x60)
+#define KVM_REG_PPC_TM_LR      (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x61)
+#define KVM_REG_PPC_TM_CTR     (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x62)
+#define KVM_REG_PPC_TM_FPSCR   (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x63)
+#define KVM_REG_PPC_TM_AMR     (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x64)
+#define KVM_REG_PPC_TM_PPR     (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x65)
+#define KVM_REG_PPC_TM_VRSAVE  (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x66)
+#define KVM_REG_PPC_TM_VSCR    (KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67)
+#define KVM_REG_PPC_TM_DSCR    (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68)
+#define KVM_REG_PPC_TM_TAR     (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69)
+
 /* PPC64 eXternal Interrupt Controller Specification */
 #define KVM_DEV_XICS_GRP_SOURCES       1       /* 64-bit source attributes */
 
index e60a369..2ea5cc0 100644 (file)
@@ -439,7 +439,7 @@ int main(void)
        DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
        DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
        DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
        DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
        DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
@@ -470,7 +470,7 @@ int main(void)
        DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
 
        /* book3s */
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
        DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
        DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
@@ -502,6 +502,8 @@ int main(void)
        DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
        DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
        DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+       DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar));
+       DEFINE(VCPU_SDAR, offsetof(struct kvm_vcpu, arch.sdar));
        DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
        DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
        DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
@@ -511,18 +513,22 @@ int main(void)
        DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
        DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
        DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
+       DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
+       DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
        DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
        DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
        DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
        DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
-       DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
-                          offsetof(struct kvmppc_vcpu_book3s, vcpu));
+       DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset));
+       DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
+       DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
        DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
        DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
        DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
 
 #ifdef CONFIG_PPC_BOOK3S_64
-#ifdef CONFIG_KVM_BOOK3S_PR
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+       DEFINE(PACA_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
 # define SVCPU_FIELD(x, f)     DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
 #else
 # define SVCPU_FIELD(x, f)
@@ -574,7 +580,7 @@ int main(void)
        HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
        HSTATE_FIELD(HSTATE_NAPPING, napping);
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req);
        HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state);
        HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
@@ -590,10 +596,11 @@ int main(void)
        HSTATE_FIELD(HSTATE_DABR, dabr);
        HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
        DEFINE(IPI_PRIORITY, IPI_PRIORITY);
-#endif /* CONFIG_KVM_BOOK3S_64_HV */
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
        HSTATE_FIELD(HSTATE_CFAR, cfar);
+       HSTATE_FIELD(HSTATE_PPR, ppr);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
index 3a9ed6a..9f905e4 100644 (file)
@@ -126,7 +126,7 @@ BEGIN_FTR_SECTION
        bgt     cr1,.
        GET_PACA(r13)
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        li      r0,KVM_HWTHREAD_IN_KERNEL
        stb     r0,HSTATE_HWTHREAD_STATE(r13)
        /* Order setting hwthread_state vs. testing hwthread_req */
@@ -425,7 +425,7 @@ data_access_check_stab:
        mfspr   r9,SPRN_DSISR
        srdi    r10,r10,60
        rlwimi  r10,r9,16,0x20
-#ifdef CONFIG_KVM_BOOK3S_PR
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
        lbz     r9,HSTATE_IN_GUEST(r13)
        rlwimi  r10,r9,8,0x300
 #endif
@@ -650,6 +650,32 @@ slb_miss_user_pseries:
        b       .                               /* prevent spec. execution */
 #endif /* __DISABLED__ */
 
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+kvmppc_skip_interrupt:
+       /*
+        * Here all GPRs are unchanged from when the interrupt happened
+        * except for r13, which is saved in SPRG_SCRATCH0.
+        */
+       mfspr   r13, SPRN_SRR0
+       addi    r13, r13, 4
+       mtspr   SPRN_SRR0, r13
+       GET_SCRATCH0(r13)
+       rfid
+       b       .
+
+kvmppc_skip_Hinterrupt:
+       /*
+        * Here all GPRs are unchanged from when the interrupt happened
+        * except for r13, which is saved in SPRG_SCRATCH0.
+        */
+       mfspr   r13, SPRN_HSRR0
+       addi    r13, r13, 4
+       mtspr   SPRN_HSRR0, r13
+       GET_SCRATCH0(r13)
+       hrfid
+       b       .
+#endif
+
 /*
  * Code from here down to __end_handlers is invoked from the
  * exception prologs above.  Because the prologs assemble the
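
Editor's note: the skip handlers added above implement instruction skipping on the KVM test path: SRR0/HSRR0 is advanced by 4 so that rfid/hrfid returns past the instruction that trapped, and r13, the only clobbered GPR, is restored from the scratch SPRG. In C-like pseudocode (illustrative only):

    /*
     *   srr0 += 4;                    step over the trapping instruction
     *   r13   = scratch0;             restore the one register we borrowed
     *   return_from_interrupt();      rfid (or hrfid for the H variant)
     */
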
index e11863f..847e40e 100644 (file)
@@ -84,7 +84,7 @@ _GLOBAL(power7_nap)
        std     r9,_MSR(r1)
        std     r1,PACAR1(r13)
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        /* Tell KVM we're napping */
        li      r4,KVM_HWTHREAD_IN_NAP
        stb     r4,HSTATE_HWTHREAD_STATE(r13)
index 62c3dd8..907a472 100644 (file)
@@ -1529,7 +1529,7 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
         * back on or not.
         */
        if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0,
-           current->thread.debug.dbcr1))
+                              current->thread.debug.dbcr1))
                regs->msr |= MSR_DE;
        else
                /* Make sure the IDM flag is off */
index 2f5c6b6..93221e8 100644 (file)
 #include "44x_tlb.h"
 #include "booke.h"
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void kvmppc_core_vcpu_load_44x(struct kvm_vcpu *vcpu, int cpu)
 {
        kvmppc_booke_vcpu_load(vcpu, cpu);
        kvmppc_44x_tlb_load(vcpu);
 }
 
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_put_44x(struct kvm_vcpu *vcpu)
 {
        kvmppc_44x_tlb_put(vcpu);
        kvmppc_booke_vcpu_put(vcpu);
@@ -114,29 +114,32 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_get_sregs_44x(struct kvm_vcpu *vcpu,
+                                     struct kvm_sregs *sregs)
 {
-       kvmppc_get_sregs_ivor(vcpu, sregs);
+       return kvmppc_get_sregs_ivor(vcpu, sregs);
 }
 
-int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_set_sregs_44x(struct kvm_vcpu *vcpu,
+                                    struct kvm_sregs *sregs)
 {
        return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
-int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
-                       union kvmppc_one_reg *val)
+static int kvmppc_get_one_reg_44x(struct kvm_vcpu *vcpu, u64 id,
+                                 union kvmppc_one_reg *val)
 {
        return -EINVAL;
 }
 
-int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
-                      union kvmppc_one_reg *val)
+static int kvmppc_set_one_reg_44x(struct kvm_vcpu *vcpu, u64 id,
+                                 union kvmppc_one_reg *val)
 {
        return -EINVAL;
 }
 
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+static struct kvm_vcpu *kvmppc_core_vcpu_create_44x(struct kvm *kvm,
+                                                   unsigned int id)
 {
        struct kvmppc_vcpu_44x *vcpu_44x;
        struct kvm_vcpu *vcpu;
@@ -167,7 +170,7 @@ out:
        return ERR_PTR(err);
 }
 
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_free_44x(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 
@@ -176,28 +179,53 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
        kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
 
-int kvmppc_core_init_vm(struct kvm *kvm)
+static int kvmppc_core_init_vm_44x(struct kvm *kvm)
 {
        return 0;
 }
 
-void kvmppc_core_destroy_vm(struct kvm *kvm)
+static void kvmppc_core_destroy_vm_44x(struct kvm *kvm)
 {
 }
 
+static struct kvmppc_ops kvm_ops_44x = {
+       .get_sregs = kvmppc_core_get_sregs_44x,
+       .set_sregs = kvmppc_core_set_sregs_44x,
+       .get_one_reg = kvmppc_get_one_reg_44x,
+       .set_one_reg = kvmppc_set_one_reg_44x,
+       .vcpu_load   = kvmppc_core_vcpu_load_44x,
+       .vcpu_put    = kvmppc_core_vcpu_put_44x,
+       .vcpu_create = kvmppc_core_vcpu_create_44x,
+       .vcpu_free   = kvmppc_core_vcpu_free_44x,
+       .mmu_destroy  = kvmppc_mmu_destroy_44x,
+       .init_vm = kvmppc_core_init_vm_44x,
+       .destroy_vm = kvmppc_core_destroy_vm_44x,
+       .emulate_op = kvmppc_core_emulate_op_44x,
+       .emulate_mtspr = kvmppc_core_emulate_mtspr_44x,
+       .emulate_mfspr = kvmppc_core_emulate_mfspr_44x,
+};
+
 static int __init kvmppc_44x_init(void)
 {
        int r;
 
        r = kvmppc_booke_init();
        if (r)
-               return r;
+               goto err_out;
+
+       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
+       if (r)
+               goto err_out;
+       kvm_ops_44x.owner = THIS_MODULE;
+       kvmppc_pr_ops = &kvm_ops_44x;
 
-       return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
+err_out:
+       return r;
 }
 
 static void __exit kvmppc_44x_exit(void)
 {
+       kvmppc_pr_ops = NULL;
        kvmppc_booke_exit();
 }
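
A note on the pattern here: each backend now keeps its entry points static, collects
them into a kvmppc_ops table, and publishes that table at module init
(kvmppc_pr_ops = &kvm_ops_44x above), so the generic code dispatches through a
pointer instead of linking against globally named functions.  A minimal
stand-alone sketch of that flow; the struct and names below are simplified
stand-ins for illustration, not the kernel's own:

        #include <stdio.h>

        /* Simplified stand-in for struct kvmppc_ops. */
        struct vm_ops {
                void (*vcpu_load)(int cpu);
                int  (*init_vm)(void);
        };

        static void vcpu_load_44x(int cpu) { printf("44x load on cpu %d\n", cpu); }
        static int  init_vm_44x(void)      { return 0; }

        static struct vm_ops ops_44x = {
                .vcpu_load = vcpu_load_44x,
                .init_vm   = init_vm_44x,
        };

        /* Generic code holds only a pointer, as kvm->arch.kvm_ops does. */
        static struct vm_ops *registered_ops;

        int main(void)
        {
                registered_ops = &ops_44x;    /* module init registers its table */
                registered_ops->init_vm();    /* generic code dispatches blindly */
                registered_ops->vcpu_load(1);
                return 0;
        }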
 
index 35ec0a8..92c9ab4 100644 (file)
@@ -91,8 +91,8 @@ static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn)
        return EMULATE_DONE;
 }
 
-int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                           unsigned int inst, int *advance)
+int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                              unsigned int inst, int *advance)
 {
        int emulated = EMULATE_DONE;
        int dcrn = get_dcrn(inst);
@@ -152,7 +152,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return emulated;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
        int emulated = EMULATE_DONE;
 
@@ -172,7 +172,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
        return emulated;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
        int emulated = EMULATE_DONE;
 
index ed03854..0deef10 100644 (file)
@@ -268,7 +268,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
        trace_kvm_stlb_inval(stlb_index);
 }
 
-void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
        int i;
index e593ff2..141b202 100644 (file)
@@ -35,17 +35,20 @@ config KVM_BOOK3S_64_HANDLER
        bool
        select KVM_BOOK3S_HANDLER
 
-config KVM_BOOK3S_PR
+config KVM_BOOK3S_PR_POSSIBLE
        bool
        select KVM_MMIO
        select MMU_NOTIFIER
 
+config KVM_BOOK3S_HV_POSSIBLE
+       bool
+
 config KVM_BOOK3S_32
        tristate "KVM support for PowerPC book3s_32 processors"
        depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT
        select KVM
        select KVM_BOOK3S_32_HANDLER
-       select KVM_BOOK3S_PR
+       select KVM_BOOK3S_PR_POSSIBLE
        ---help---
          Support running unmodified book3s_32 guest kernels
          in virtual machines on book3s_32 host processors.
@@ -60,6 +63,7 @@ config KVM_BOOK3S_64
        depends on PPC_BOOK3S_64
        select KVM_BOOK3S_64_HANDLER
        select KVM
+       select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE
        ---help---
          Support running unmodified book3s_64 and book3s_32 guest kernels
          in virtual machines on book3s_64 host processors.
@@ -70,8 +74,9 @@ config KVM_BOOK3S_64
          If unsure, say N.
 
 config KVM_BOOK3S_64_HV
-       bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+       tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
        depends on KVM_BOOK3S_64
+       select KVM_BOOK3S_HV_POSSIBLE
        select MMU_NOTIFIER
        select CMA
        ---help---
@@ -90,9 +95,20 @@ config KVM_BOOK3S_64_HV
          If unsure, say N.
 
 config KVM_BOOK3S_64_PR
-       def_bool y
-       depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
-       select KVM_BOOK3S_PR
+       tristate "KVM support without using hypervisor mode in host"
+       depends on KVM_BOOK3S_64
+       select KVM_BOOK3S_PR_POSSIBLE
+       ---help---
+         Support running guest kernels in virtual machines on processors
+         without using hypervisor mode in the host, by running the
+         guest in user mode (problem state) and emulating all
+         privileged instructions and registers.
+
+         This is not as fast as using hypervisor mode, but works on
+         machines where hypervisor mode is not available or not usable,
+         and can emulate processors that are different from the host
+         processor, including emulating 32-bit processors on a 64-bit
+         host.
 
 config KVM_BOOKE_HV
        bool
index 6646c95..ce569b6 100644 (file)
@@ -53,41 +53,51 @@ kvm-e500mc-objs := \
        e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
 
-kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
-       $(KVM)/coalesced_mmio.o \
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \
+       book3s_64_vio_hv.o
+
+kvm-pr-y := \
        fpu.o \
        book3s_paired_singles.o \
        book3s_pr.o \
        book3s_pr_papr.o \
-       book3s_64_vio_hv.o \
        book3s_emulate.o \
        book3s_interrupts.o \
        book3s_mmu_hpte.o \
        book3s_64_mmu_host.o \
        book3s_64_mmu.o \
        book3s_32_mmu.o
-kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
+
+ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+kvm-book3s_64-module-objs := \
+       $(KVM)/coalesced_mmio.o
+
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
        book3s_rmhandlers.o
+endif
 
-kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+kvm-hv-y += \
        book3s_hv.o \
        book3s_hv_interrupts.o \
        book3s_64_mmu_hv.o
+
 kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
        book3s_hv_rm_xics.o
-kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+
+ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
        book3s_hv_rmhandlers.o \
        book3s_hv_rm_mmu.o \
-       book3s_64_vio_hv.o \
        book3s_hv_ras.o \
        book3s_hv_builtin.o \
        book3s_hv_cma.o \
        $(kvm-book3s_64-builtin-xics-objs-y)
+endif
 
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
        book3s_xics.o
 
-kvm-book3s_64-module-objs := \
+kvm-book3s_64-module-objs += \
        $(KVM)/kvm_main.o \
        $(KVM)/eventfd.o \
        powerpc.o \
@@ -123,4 +133,7 @@ obj-$(CONFIG_KVM_E500MC) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
 
+obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o
+obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o
+
 obj-y += $(kvm-book3s_64-builtin-objs-y)
index 700df6f..8912608 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+#include "book3s.h"
 #include "trace.h"
 
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -69,6 +70,50 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 {
 }
 
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+       if (!is_kvmppc_hv_enabled(vcpu->kvm))
+               return to_book3s(vcpu)->hior;
+       return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+                       unsigned long pending_now, unsigned long old_pending)
+{
+       if (is_kvmppc_hv_enabled(vcpu->kvm))
+               return;
+       if (pending_now)
+               vcpu->arch.shared->int_pending = 1;
+       else if (old_pending)
+               vcpu->arch.shared->int_pending = 0;
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+       ulong crit_raw;
+       ulong crit_r1;
+       bool crit;
+
+       if (is_kvmppc_hv_enabled(vcpu->kvm))
+               return false;
+
+       crit_raw = vcpu->arch.shared->critical;
+       crit_r1 = kvmppc_get_gpr(vcpu, 1);
+
+       /* Truncate crit indicators in 32 bit mode */
+       if (!(vcpu->arch.shared->msr & MSR_SF)) {
+               crit_raw &= 0xffffffff;
+               crit_r1 &= 0xffffffff;
+       }
+
+       /* Critical section when crit == r1 */
+       crit = (crit_raw == crit_r1);
+       /* ... and we're in supervisor mode */
+       crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+
+       return crit;
+}
+
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
        vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
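
To make the truncation in kvmppc_critical_section() above concrete, here is a
stand-alone sketch (the supervisor-mode test is omitted): with MSR_SF clear,
only the low 32 bits of the critical marker and r1 are compared, so
0x100001000 and 0x1000 count as a match.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        /* Models the compare in kvmppc_critical_section(). */
        static bool in_critical(uint64_t crit_raw, uint64_t r1, bool sf_64bit)
        {
                if (!sf_64bit) {        /* truncate, as the MSR_SF check does */
                        crit_raw &= 0xffffffffULL;
                        r1       &= 0xffffffffULL;
                }
                return crit_raw == r1;
        }

        int main(void)
        {
                printf("%d\n", in_critical(0x100001000ULL, 0x1000, false)); /* 1 */
                printf("%d\n", in_critical(0x100001000ULL, 0x1000, true));  /* 0 */
                return 0;
        }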
@@ -126,28 +171,32 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
        printk(KERN_INFO "Queueing interrupt %x\n", vec);
 #endif
 }
-
+EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio);
 
 void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
 {
        /* might as well deliver this straight away */
        kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);
 }
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_program);
 
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
 {
        kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
 }
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec);
 
 int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
 {
        return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
 }
+EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec);
 
 void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
 {
        kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
 }
+EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
 
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
@@ -285,8 +334,10 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter);
 
-pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing,
+                       bool *writable)
 {
        ulong mp_pa = vcpu->arch.magic_page_pa;
 
@@ -302,20 +353,23 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 
                pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
                get_page(pfn_to_page(pfn));
+               if (writable)
+                       *writable = true;
                return pfn;
        }
 
-       return gfn_to_pfn(vcpu->kvm, gfn);
+       return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable);
 }
+EXPORT_SYMBOL_GPL(kvmppc_gfn_to_pfn);
 
 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
-                        struct kvmppc_pte *pte)
+                       bool iswrite, struct kvmppc_pte *pte)
 {
        int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR));
        int r;
 
        if (relocated) {
-               r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data);
+               r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite);
        } else {
                pte->eaddr = eaddr;
                pte->raddr = eaddr & KVM_PAM;
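
The new writing/writable pair on kvmppc_gfn_to_pfn() lets callers ask for write
access only when the guest access really is a write; if the host page turns out
to be read-only (a copy-on-write page, say), the shadow mapping is created
read-only and a later guest write simply faults again.  A reduced model of that
decision follows; the names are illustrative, not the kernel's:

        #include <stdbool.h>
        #include <stdio.h>

        /* How a mapper combines the guest PTE's permission with host state. */
        static const char *shadow_prot(bool guest_may_write, bool host_writable)
        {
                /* A guest write to a read-only mapping faults again, at
                 * which point the host page can be copied and remapped. */
                if (guest_may_write && host_writable)
                        return "read-write";
                return "read-only";
        }

        int main(void)
        {
                /* Host page is COW: map read-only despite guest permission. */
                printf("%s\n", shadow_prot(true, false));  /* read-only  */
                printf("%s\n", shadow_prot(true, true));   /* read-write */
                return 0;
        }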
@@ -361,7 +415,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
 
        vcpu->stat.st++;
 
-       if (kvmppc_xlate(vcpu, *eaddr, data, &pte))
+       if (kvmppc_xlate(vcpu, *eaddr, data, true, &pte))
                return -ENOENT;
 
        *eaddr = pte.raddr;
@@ -374,6 +428,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
 
        return EMULATE_DONE;
 }
+EXPORT_SYMBOL_GPL(kvmppc_st);
 
 int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
                      bool data)
@@ -383,7 +438,7 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
 
        vcpu->stat.ld++;
 
-       if (kvmppc_xlate(vcpu, *eaddr, data, &pte))
+       if (kvmppc_xlate(vcpu, *eaddr, data, false, &pte))
                goto nopte;
 
        *eaddr = pte.raddr;
@@ -404,6 +459,7 @@ nopte:
 mmio:
        return EMULATE_DO_MMIO;
 }
+EXPORT_SYMBOL_GPL(kvmppc_ld);
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
@@ -419,6 +475,18 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 }
 
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs)
+{
+       return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs)
+{
+       return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+}
+
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        int i;
@@ -495,8 +563,7 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
        if (size > sizeof(val))
                return -EINVAL;
 
-       r = kvmppc_get_one_reg(vcpu, reg->id, &val);
-
+       r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val);
        if (r == -EINVAL) {
                r = 0;
                switch (reg->id) {
@@ -528,6 +595,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                        }
                        val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
                        break;
+               case KVM_REG_PPC_VRSAVE:
+                       val = get_reg_val(reg->id, vcpu->arch.vrsave);
+                       break;
 #endif /* CONFIG_ALTIVEC */
                case KVM_REG_PPC_DEBUG_INST: {
                        u32 opcode = INS_TW;
@@ -572,8 +642,7 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
        if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
                return -EFAULT;
 
-       r = kvmppc_set_one_reg(vcpu, reg->id, &val);
-
+       r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val);
        if (r == -EINVAL) {
                r = 0;
                switch (reg->id) {
@@ -605,6 +674,13 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                        }
                        vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
                        break;
+               case KVM_REG_PPC_VRSAVE:
+                       if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+                               r = -ENXIO;
+                               break;
+                       }
+                       vcpu->arch.vrsave = set_reg_val(reg->id, val);
+                       break;
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_KVM_XICS
                case KVM_REG_PPC_ICP_STATE:
@@ -625,6 +701,27 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
        return r;
 }
 
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+       vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr);
+}
+EXPORT_SYMBOL_GPL(kvmppc_set_msr);
+
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu);
+}
+
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
                                   struct kvm_translation *tr)
 {
@@ -644,3 +741,141 @@ void kvmppc_decrementer_func(unsigned long data)
        kvmppc_core_queue_dec(vcpu);
        kvm_vcpu_kick(vcpu);
 }
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+       return kvm->arch.kvm_ops->vcpu_create(kvm, id);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+       vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
+}
+
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
+{
+       return vcpu->kvm->arch.kvm_ops->check_requests(vcpu);
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+       return kvm->arch.kvm_ops->get_dirty_log(kvm, log);
+}
+
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
+                             struct kvm_memory_slot *dont)
+{
+       kvm->arch.kvm_ops->free_memslot(free, dont);
+}
+
+int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+                              unsigned long npages)
+{
+       return kvm->arch.kvm_ops->create_memslot(slot, npages);
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+       kvm->arch.kvm_ops->flush_memslot(kvm, memslot);
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+                               struct kvm_memory_slot *memslot,
+                               struct kvm_userspace_memory_region *mem)
+{
+       return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem);
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_memory_slot *old)
+{
+       kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old);
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+       return kvm->arch.kvm_ops->unmap_hva(kvm, hva);
+}
+EXPORT_SYMBOL_GPL(kvm_unmap_hva);
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+       return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       return kvm->arch.kvm_ops->age_hva(kvm, hva);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+       kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+       vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+#ifdef CONFIG_PPC64
+       INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+       INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
+#endif
+
+       return kvm->arch.kvm_ops->init_vm(kvm);
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+       kvm->arch.kvm_ops->destroy_vm(kvm);
+
+#ifdef CONFIG_PPC64
+       kvmppc_rtas_tokens_free(kvm);
+       WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
+#endif
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+       /*
+        * We always return 0 for book3s. We check
+        * for compatibility while loading the HV
+        * or PR module.
+        */
+       return 0;
+}
+
+static int kvmppc_book3s_init(void)
+{
+       int r;
+
+       r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+       if (r)
+               return r;
+#ifdef CONFIG_KVM_BOOK3S_32
+       r = kvmppc_book3s_init_pr();
+#endif
+       return r;
+}
+
+static void kvmppc_book3s_exit(void)
+{
+#ifdef CONFIG_KVM_BOOK3S_32
+       kvmppc_book3s_exit_pr();
+#endif
+       kvm_exit();
+}
+
+module_init(kvmppc_book3s_init);
+module_exit(kvmppc_book3s_exit);
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
new file mode 100644 (file)
index 0000000..4bf956c
--- /dev/null
@@ -0,0 +1,34 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ */
+
+#ifndef __POWERPC_KVM_BOOK3S_H__
+#define __POWERPC_KVM_BOOK3S_H__
+
+extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
+                                        struct kvm_memory_slot *memslot);
+extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
+                                 unsigned long end);
+extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva);
+extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva);
+extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte);
+
+extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                    unsigned int inst, int *advance);
+extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu,
+                                       int sprn, ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu,
+                                       int sprn, ulong *spr_val);
+extern int kvmppc_book3s_init_pr(void);
+extern void kvmppc_book3s_exit_pr(void);
+
+#endif
index c8cefdd..76a64ce 100644 (file)
@@ -84,7 +84,8 @@ static inline bool sr_nx(u32 sr_raw)
 }
 
 static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
-                                         struct kvmppc_pte *pte, bool data);
+                                         struct kvmppc_pte *pte, bool data,
+                                         bool iswrite);
 static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
                                             u64 *vsid);
 
@@ -99,7 +100,7 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
        u64 vsid;
        struct kvmppc_pte pte;
 
-       if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data))
+       if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false))
                return pte.vpage;
 
        kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
@@ -111,10 +112,11 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)
        kvmppc_set_msr(vcpu, 0);
 }
 
-static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s,
+static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu,
                                      u32 sre, gva_t eaddr,
                                      bool primary)
 {
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        u32 page, hash, pteg, htabmask;
        hva_t r;
 
@@ -132,7 +134,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
                kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg,
                sr_vsid(sre));
 
-       r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
+       r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT);
        if (kvm_is_error_hva(r))
                return r;
        return r | (pteg & ~PAGE_MASK);
@@ -145,7 +147,8 @@ static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary)
 }
 
 static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
-                                         struct kvmppc_pte *pte, bool data)
+                                         struct kvmppc_pte *pte, bool data,
+                                         bool iswrite)
 {
        struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        struct kvmppc_bat *bat;
@@ -186,8 +189,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
                                printk(KERN_INFO "BAT is not readable!\n");
                                continue;
                        }
-                       if (!pte->may_write) {
-                               /* let's treat r/o BATs as not-readable for now */
+                       if (iswrite && !pte->may_write) {
                                dprintk_pte("BAT is read-only!\n");
                                continue;
                        }
@@ -201,9 +203,8 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
                                     struct kvmppc_pte *pte, bool data,
-                                    bool primary)
+                                    bool iswrite, bool primary)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        u32 sre;
        hva_t ptegp;
        u32 pteg[16];
@@ -218,7 +219,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
 
        pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
 
-       ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu_book3s, sre, eaddr, primary);
+       ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary);
        if (kvm_is_error_hva(ptegp)) {
                printk(KERN_INFO "KVM: Invalid PTEG!\n");
                goto no_page_found;
@@ -258,9 +259,6 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
                                        break;
                        }
 
-                       if ( !pte->may_read )
-                               continue;
-
                        dprintk_pte("MMU: Found PTE -> %x %x - %x\n",
                                    pteg[i], pteg[i+1], pp);
                        found = 1;
@@ -271,19 +269,23 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
        /* Update PTE C and A bits, so the guest's swapper knows we used the
           page */
        if (found) {
-               u32 oldpte = pteg[i+1];
-
-               if (pte->may_read)
-                       pteg[i+1] |= PTEG_FLAG_ACCESSED;
-               if (pte->may_write)
-                       pteg[i+1] |= PTEG_FLAG_DIRTY;
-               else
-                       dprintk_pte("KVM: Mapping read-only page!\n");
-
-               /* Write back into the PTEG */
-               if (pteg[i+1] != oldpte)
-                       copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
-
+               u32 pte_r = pteg[i+1];
+               char __user *addr = (char __user *) &pteg[i+1];
+
+               /*
+                * Use single-byte writes to update the HPTE, to
+                * conform to what real hardware does.
+                */
+               if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) {
+                       pte_r |= PTEG_FLAG_ACCESSED;
+                       put_user(pte_r >> 8, addr + 2);
+               }
+               if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) {
+                       pte_r |= PTEG_FLAG_DIRTY;
+                       put_user(pte_r, addr + 3);
+               }
+               if (!pte->may_read || (iswrite && !pte->may_write))
+                       return -EPERM;
                return 0;
        }
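
The single-byte stores above lean on the big-endian layout of the 32-bit PTE
word: assuming PTEG_FLAG_ACCESSED is 0x100 and PTEG_FLAG_DIRTY is 0x80 (the
values in kvm_book3s_32.h), the accessed bit lives entirely in byte 2 and the
dirty bit in byte 3, so each can be set without rewriting the whole word.  A
user-space check of the byte arithmetic:

        #include <stdint.h>
        #include <stdio.h>

        #define PTEG_FLAG_ACCESSED 0x00000100u  /* assumed; see kvm_book3s_32.h */
        #define PTEG_FLAG_DIRTY    0x00000080u

        int main(void)
        {
                uint32_t pte_r = 0x12340003;    /* sample second PTE word */
                uint8_t mem[4];                 /* its big-endian memory image */

                mem[0] = pte_r >> 24; mem[1] = pte_r >> 16;
                mem[2] = pte_r >> 8;  mem[3] = pte_r;

                /* Mirrors put_user(pte_r >> 8, addr + 2): one byte, ACCESSED. */
                pte_r |= PTEG_FLAG_ACCESSED;
                mem[2] = pte_r >> 8;

                /* Mirrors put_user(pte_r, addr + 3): one byte, DIRTY. */
                pte_r |= PTEG_FLAG_DIRTY;
                mem[3] = pte_r;

                uint32_t back = (uint32_t)mem[0] << 24 | (uint32_t)mem[1] << 16 |
                                (uint32_t)mem[2] << 8  | mem[3];
                printf("0x%08x\n", back);       /* 0x12340183 */
                return 0;
        }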
 
@@ -302,12 +304,14 @@ no_page_found:
 }
 
 static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                                     struct kvmppc_pte *pte, bool data)
+                                     struct kvmppc_pte *pte, bool data,
+                                     bool iswrite)
 {
        int r;
        ulong mp_ea = vcpu->arch.magic_page_ea;
 
        pte->eaddr = eaddr;
+       pte->page_size = MMU_PAGE_4K;
 
        /* Magic page override */
        if (unlikely(mp_ea) &&
@@ -323,11 +327,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                return 0;
        }
 
-       r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data);
+       r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite);
        if (r < 0)
-              r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true);
+               r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte,
+                                                  data, iswrite, true);
        if (r < 0)
-              r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, false);
+               r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte,
+                                                  data, iswrite, false);
 
        return r;
 }
@@ -347,7 +353,12 @@ static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
 
 static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large)
 {
-       kvmppc_mmu_pte_flush(vcpu, ea, 0x0FFFF000);
+       int i;
+       struct kvm_vcpu *v;
+
+       /* flush this VA on all cpus */
+       kvm_for_each_vcpu(i, v, vcpu->kvm)
+               kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000);
 }
 
 static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
index 00e619b..3a0abd2 100644 (file)
@@ -138,7 +138,8 @@ static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr,
 
 extern char etext[];
 
-int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
+                       bool iswrite)
 {
        pfn_t hpaddr;
        u64 vpn;
@@ -152,9 +153,11 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
        bool evict = false;
        struct hpte_cache *pte;
        int r = 0;
+       bool writable;
 
        /* Get host physical address for gpa */
-       hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
+       hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT,
+                                  iswrite, &writable);
        if (is_error_noslot_pfn(hpaddr)) {
                printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
                                 orig_pte->eaddr);
@@ -204,7 +207,7 @@ next_pteg:
                (primary ? 0 : PTE_SEC);
        pteg1 = hpaddr | PTE_M | PTE_R | PTE_C;
 
-       if (orig_pte->may_write) {
+       if (orig_pte->may_write && writable) {
                pteg1 |= PP_RWRW;
                mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
        } else {
@@ -259,6 +262,11 @@ out:
        return r;
 }
 
+void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+       kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL);
+}
+
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 {
        struct kvmppc_sid_map *map;
@@ -341,7 +349,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
        svcpu_put(svcpu);
 }
 
-void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu)
 {
        int i;
 
index 7e345e0..83da1f8 100644 (file)
@@ -107,9 +107,20 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
        return kvmppc_slb_calc_vpn(slb, eaddr);
 }
 
+static int mmu_pagesize(int mmu_pg)
+{
+       switch (mmu_pg) {
+       case MMU_PAGE_64K:
+               return 16;
+       case MMU_PAGE_16M:
+               return 24;
+       }
+       return 12;
+}
+
 static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
 {
-       return slbe->large ? 24 : 12;
+       return mmu_pagesize(slbe->base_page_size);
 }
 
 static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)
@@ -119,11 +130,11 @@ static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)
        return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p);
 }
 
-static hva_t kvmppc_mmu_book3s_64_get_pteg(
-                               struct kvmppc_vcpu_book3s *vcpu_book3s,
+static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu,
                                struct kvmppc_slb *slbe, gva_t eaddr,
                                bool second)
 {
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        u64 hash, pteg, htabsize;
        u32 ssize;
        hva_t r;
@@ -148,10 +159,10 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg(
 
        /* When running a PAPR guest, SDR1 contains an HVA instead
            of a GPA */
-       if (vcpu_book3s->vcpu.arch.papr_enabled)
+       if (vcpu->arch.papr_enabled)
                r = pteg;
        else
-               r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
+               r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT);
 
        if (kvm_is_error_hva(r))
                return r;
@@ -166,18 +177,38 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr)
        avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
        avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p);
 
-       if (p < 24)
-               avpn >>= ((80 - p) - 56) - 8;
+       if (p < 16)
+               avpn >>= ((80 - p) - 56) - 8;   /* 16 - p */
        else
-               avpn <<= 8;
+               avpn <<= p - 16;
 
        return avpn;
 }
 
+/*
+ * Return the page size encoded in the second word of an HPTE, or
+ * -1 for an invalid encoding for the base page size indicated by
+ * the SLB entry.  This doesn't handle mixed page-size segments yet.
+ */
+static int decode_pagesize(struct kvmppc_slb *slbe, u64 r)
+{
+       switch (slbe->base_page_size) {
+       case MMU_PAGE_64K:
+               if ((r & 0xf000) == 0x1000)
+                       return MMU_PAGE_64K;
+               break;
+       case MMU_PAGE_16M:
+               if ((r & 0xff000) == 0)
+                       return MMU_PAGE_16M;
+               break;
+       }
+       return -1;
+}
+
 static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                               struct kvmppc_pte *gpte, bool data)
+                                     struct kvmppc_pte *gpte, bool data,
+                                     bool iswrite)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        struct kvmppc_slb *slbe;
        hva_t ptegp;
        u64 pteg[16];
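
To make the encodings checked by decode_pagesize() concrete: for a 64K base
page the HPTE's second doubleword must read 0x1000 under the 0xf000 mask, and
for 16M everything under 0xff000 must be clear.  A self-contained restatement
with plain integers:

        #include <stdint.h>
        #include <stdio.h>

        enum { PG_4K, PG_64K, PG_16M };

        /* Same checks as decode_pagesize(), with the SLB reduced to an enum. */
        static int decode_pagesize(int base, uint64_t r)
        {
                switch (base) {
                case PG_64K:
                        if ((r & 0xf000) == 0x1000)
                                return PG_64K;
                        break;
                case PG_16M:
                        if ((r & 0xff000) == 0)
                                return PG_16M;
                        break;
                }
                return -1;
        }

        int main(void)
        {
                printf("%d\n", decode_pagesize(PG_64K, 0x1001)); /* 1: valid 64K  */
                printf("%d\n", decode_pagesize(PG_64K, 0x2000)); /* -1: bad field */
                printf("%d\n", decode_pagesize(PG_16M, 0x0));    /* 2: valid 16M  */
                return 0;
        }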
@@ -189,6 +220,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
        u8 pp, key = 0;
        bool found = false;
        bool second = false;
+       int pgsize;
        ulong mp_ea = vcpu->arch.magic_page_ea;
 
        /* Magic page override */
@@ -202,6 +234,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                gpte->may_execute = true;
                gpte->may_read = true;
                gpte->may_write = true;
+               gpte->page_size = MMU_PAGE_4K;
 
                return 0;
        }
@@ -222,8 +255,12 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
        v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID |
                HPTE_V_SECONDARY;
 
+       pgsize = slbe->large ? MMU_PAGE_16M : MMU_PAGE_4K;
+
+       mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+
 do_second:
-       ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
+       ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second);
        if (kvm_is_error_hva(ptegp))
                goto no_page_found;
 
@@ -240,6 +277,13 @@ do_second:
        for (i=0; i<16; i+=2) {
                /* Check all relevant fields of 1st dword */
                if ((pteg[i] & v_mask) == v_val) {
+                       /* If large page bit is set, check pgsize encoding */
+                       if (slbe->large &&
+                           (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) {
+                               pgsize = decode_pagesize(slbe, pteg[i+1]);
+                               if (pgsize < 0)
+                                       continue;
+                       }
                        found = true;
                        break;
                }
@@ -256,13 +300,15 @@ do_second:
        v = pteg[i];
        r = pteg[i+1];
        pp = (r & HPTE_R_PP) | key;
-       eaddr_mask = 0xFFF;
+       if (r & HPTE_R_PP0)
+               pp |= 8;
 
        gpte->eaddr = eaddr;
        gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
-       if (slbe->large)
-               eaddr_mask = 0xFFFFFF;
+
+       eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1;
        gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask);
+       gpte->page_size = pgsize;
        gpte->may_execute = ((r & HPTE_R_N) ? false : true);
        gpte->may_read = false;
        gpte->may_write = false;
@@ -277,6 +323,7 @@ do_second:
        case 3:
        case 5:
        case 7:
+       case 10:
                gpte->may_read = true;
                break;
        }
@@ -287,30 +334,37 @@ do_second:
 
        /* Update PTE R and C bits, so the guest's swapper knows we used the
         * page */
-       if (gpte->may_read) {
-               /* Set the accessed flag */
+       if (gpte->may_read && !(r & HPTE_R_R)) {
+               /*
+                * Set the accessed flag.
+                * We have to write this back with a single byte write
+                * because another vcpu may be accessing this on
+                * non-PAPR platforms such as mac99, and this is
+                * what real hardware does.
+                */
+               char __user *addr = (char __user *) &pteg[i+1];
                r |= HPTE_R_R;
+               put_user(r >> 8, addr + 6);
        }
-       if (data && gpte->may_write) {
-               /* Set the dirty flag -- XXX even if not writing */
+       if (iswrite && gpte->may_write && !(r & HPTE_R_C)) {
+               /* Set the dirty flag */
+               /* Use a single byte write */
+               char __user *addr = (char __user *) &pteg[i+1];
                r |= HPTE_R_C;
+               put_user(r, addr + 7);
        }
 
-       /* Write back into the PTEG */
-       if (pteg[i+1] != r) {
-               pteg[i+1] = r;
-               copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
-       }
+       mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
 
-       if (!gpte->may_read)
+       if (!gpte->may_read || (iswrite && !gpte->may_write))
                return -EPERM;
        return 0;
 
 no_page_found:
+       mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
        return -ENOENT;
 
 no_seg_found:
-
        dprintk("KVM MMU: Trigger segment fault\n");
        return -EINVAL;
 }
@@ -345,6 +399,21 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
        slbe->nx    = (rs & SLB_VSID_N) ? 1 : 0;
        slbe->class = (rs & SLB_VSID_C) ? 1 : 0;
 
+       slbe->base_page_size = MMU_PAGE_4K;
+       if (slbe->large) {
+               if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) {
+                       switch (rs & SLB_VSID_LP) {
+                       case SLB_VSID_LP_00:
+                               slbe->base_page_size = MMU_PAGE_16M;
+                               break;
+                       case SLB_VSID_LP_01:
+                               slbe->base_page_size = MMU_PAGE_64K;
+                               break;
+                       }
+               } else
+                       slbe->base_page_size = MMU_PAGE_16M;
+       }
+
        slbe->orige = rb & (ESID_MASK | SLB_ESID_V);
        slbe->origv = rs;
 
@@ -460,14 +529,45 @@ static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va,
                                       bool large)
 {
        u64 mask = 0xFFFFFFFFFULL;
+       long i;
+       struct kvm_vcpu *v;
 
        dprintk("KVM MMU: tlbie(0x%lx)\n", va);
 
-       if (large)
-               mask = 0xFFFFFF000ULL;
-       kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask);
+       /*
+        * The tlbie instruction changed behaviour starting with
+        * POWER6.  On POWER6 and later the large-page flag is no
+        * longer in the instruction itself; it is encoded in the RB
+        * value, along with bits indicating page and segment sizes.
+        */
+       if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) {
+               /* POWER6 or later */
+               if (va & 1) {           /* L bit */
+                       if ((va & 0xf000) == 0x1000)
+                               mask = 0xFFFFFFFF0ULL;  /* 64k page */
+                       else
+                               mask = 0xFFFFFF000ULL;  /* 16M page */
+               }
+       } else {
+               /* older processors, e.g. PPC970 */
+               if (large)
+                       mask = 0xFFFFFF000ULL;
+       }
+       /* flush this VA on all vcpus */
+       kvm_for_each_vcpu(i, v, vcpu->kvm)
+               kvmppc_mmu_pte_vflush(v, va >> 12, mask);
 }
 
+#ifdef CONFIG_PPC_64K_PAGES
+static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid)
+{
+       ulong mp_ea = vcpu->arch.magic_page_ea;
+
+       return mp_ea && !(vcpu->arch.shared->msr & MSR_PR) &&
+               (mp_ea >> SID_SHIFT) == esid;
+}
+#endif
+
 static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
                                             u64 *vsid)
 {
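
The masks in kvmppc_mmu_book3s_64_tlbie() above apply to vpage values
(va >> 12), so they express the flush span in 4K units: 0xFFFFFFFF0 ignores the
low 4 bits (16 x 4K = 64K) and 0xFFFFFF000 ignores 12 bits (4096 x 4K = 16M).
A quick sanity check of that arithmetic:

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                const uint64_t all = 0xFFFFFFFFFULL;   /* 36-bit vpage field */
                const uint64_t masks[] = { 0xFFFFFFFF0ULL, 0xFFFFFF000ULL };

                for (int i = 0; i < 2; i++) {
                        /* Bits the mask ignores = 4K pages covered per flush. */
                        uint64_t pages = (~masks[i] & all) + 1;
                        printf("mask 0x%09llx covers %llu KiB\n",
                               (unsigned long long)masks[i],
                               (unsigned long long)(pages * 4));
                }
                return 0;
        }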
@@ -475,11 +575,13 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
        struct kvmppc_slb *slb;
        u64 gvsid = esid;
        ulong mp_ea = vcpu->arch.magic_page_ea;
+       int pagesize = MMU_PAGE_64K;
 
        if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
                slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
                if (slb) {
                        gvsid = slb->vsid;
+                       pagesize = slb->base_page_size;
                        if (slb->tb) {
                                gvsid <<= SID_SHIFT_1T - SID_SHIFT;
                                gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1);
@@ -490,28 +592,41 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 
        switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
        case 0:
-               *vsid = VSID_REAL | esid;
+               gvsid = VSID_REAL | esid;
                break;
        case MSR_IR:
-               *vsid = VSID_REAL_IR | gvsid;
+               gvsid |= VSID_REAL_IR;
                break;
        case MSR_DR:
-               *vsid = VSID_REAL_DR | gvsid;
+               gvsid |= VSID_REAL_DR;
                break;
        case MSR_DR|MSR_IR:
                if (!slb)
                        goto no_slb;
 
-               *vsid = gvsid;
                break;
        default:
                BUG();
                break;
        }
 
+#ifdef CONFIG_PPC_64K_PAGES
+       /*
+        * Mark this as a 64k segment if the host is using
+        * 64k pages, the host MMU supports 64k pages and
+        * the guest segment page size is >= 64k,
+        * but not if this segment contains the magic page.
+        */
+       if (pagesize >= MMU_PAGE_64K &&
+           mmu_psize_defs[MMU_PAGE_64K].shift &&
+           !segment_contains_magic_page(vcpu, esid))
+               gvsid |= VSID_64K;
+#endif
+
        if (vcpu->arch.shared->msr & MSR_PR)
-               *vsid |= VSID_PR;
+               gvsid |= VSID_PR;
 
+       *vsid = gvsid;
        return 0;
 
 no_slb:
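
Tagging the VSID with VSID_64K here is what lets kvmppc_mmu_map_page() later
pick a 64K host page size without re-walking the guest SLB: the decision
travels inside the returned vsid.  A reduced model of that hand-off (the flag's
bit position below is made up purely for illustration):

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define VSID_64K (1ULL << 56)   /* illustrative bit, not the kernel's */

        /* Models esid_to_vsid(): tag segments with base page size >= 64K. */
        static uint64_t esid_to_vsid(uint64_t gvsid, int seg_page_shift,
                                     bool has_magic_page)
        {
                if (seg_page_shift >= 16 && !has_magic_page)
                        gvsid |= VSID_64K;
                return gvsid;
        }

        /* Models map_page(): the tag alone selects the host page size. */
        static int host_page_shift(uint64_t vsid)
        {
                return (vsid & VSID_64K) ? 16 : 12;
        }

        int main(void)
        {
                uint64_t v = esid_to_vsid(0x42, 16, false);
                printf("%d\n", host_page_shift(v));  /* 16: 64K host pages */
                v = esid_to_vsid(0x42, 16, true);    /* magic page present */
                printf("%d\n", host_page_shift(v));  /* 12: fall back to 4K */
                return 0;
        }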
index e524052..0d513af 100644 (file)
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
-#include "trace.h"
+#include "trace_pr.h"
 
 #define PTE_SIZE 12
 
 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
        ppc_md.hpte_invalidate(pte->slot, pte->host_vpn,
-                              MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M,
+                              pte->pagesize, pte->pagesize, MMU_SEGSIZE_256M,
                               false);
 }
 
@@ -78,7 +78,8 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
        return NULL;
 }
 
-int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
+                       bool iswrite)
 {
        unsigned long vpn;
        pfn_t hpaddr;
@@ -90,16 +91,26 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
        int attempt = 0;
        struct kvmppc_sid_map *map;
        int r = 0;
+       int hpsize = MMU_PAGE_4K;
+       bool writable;
+       unsigned long mmu_seq;
+       struct kvm *kvm = vcpu->kvm;
+       struct hpte_cache *cpte;
+       unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT;
+       unsigned long pfn;
+
+       /* used to check for invalidations in progress */
+       mmu_seq = kvm->mmu_notifier_seq;
+       smp_rmb();
 
        /* Get host physical address for gpa */
-       hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
-       if (is_error_noslot_pfn(hpaddr)) {
-               printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
+       pfn = kvmppc_gfn_to_pfn(vcpu, gfn, iswrite, &writable);
+       if (is_error_noslot_pfn(pfn)) {
+               printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", gfn);
                r = -EINVAL;
                goto out;
        }
-       hpaddr <<= PAGE_SHIFT;
-       hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
+       hpaddr = pfn << PAGE_SHIFT;
 
        /* and write the mapping ea -> hpa into the pt */
        vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
@@ -117,20 +128,39 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
                goto out;
        }
 
-       vsid = map->host_vsid;
-       vpn = hpt_vpn(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M);
+       vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M);
 
-       if (!orig_pte->may_write)
-               rflags |= HPTE_R_PP;
-       else
-               mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+       kvm_set_pfn_accessed(pfn);
+       if (!orig_pte->may_write || !writable)
+               rflags |= PP_RXRX;
+       else {
+               mark_page_dirty(vcpu->kvm, gfn);
+               kvm_set_pfn_dirty(pfn);
+       }
 
        if (!orig_pte->may_execute)
                rflags |= HPTE_R_N;
        else
-               kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT);
+               kvmppc_mmu_flush_icache(pfn);
+
+       /*
+        * Use 64K pages if possible; otherwise, on 64K-page kernels,
+        * we need to transfer 4 more bits from the guest real address
+        * to the host real address.
+        */
+       if (vsid & VSID_64K)
+               hpsize = MMU_PAGE_64K;
+       else
+               hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
+
+       hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M);
 
-       hash = hpt_hash(vpn, PTE_SIZE, MMU_SEGSIZE_256M);
+       cpte = kvmppc_mmu_hpte_cache_next(vcpu);
+
+       spin_lock(&kvm->mmu_lock);
+       if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) {
+               r = -EAGAIN;
+               goto out_unlock;
+       }
 
 map_again:
        hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
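
The mmu_seq / smp_rmb() / mmu_notifier_retry() sequence above is the standard
KVM defence against racing host-side invalidations: sample the notifier
sequence count before translating the gfn, then re-check it under mmu_lock
before committing the mapping; if an invalidation ran in between, throw the
work away and let the guest fault again.  A toy model of the protocol
(single-threaded, so the race is simulated by a flag):

        #include <stdbool.h>
        #include <stdio.h>

        #define EAGAIN 11

        static unsigned long mmu_notifier_seq;   /* bumped by invalidations */

        static int map_one(bool invalidation_races)
        {
                unsigned long seq = mmu_notifier_seq; /* sample before lookup */
                /* ... translate gfn, allocate the HPTE cache entry ... */
                if (invalidation_races)
                        mmu_notifier_seq++;      /* host unmapped the page */
                /* Take mmu_lock, then re-check before inserting the HPTE. */
                if (mmu_notifier_seq != seq)
                        return -EAGAIN;          /* caller re-faults cleanly */
                return 0;
        }

        int main(void)
        {
                printf("%d\n", map_one(false));  /* 0   */
                printf("%d\n", map_one(true));   /* -11 */
                return 0;
        }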
@@ -139,11 +169,11 @@ map_again:
        if (attempt > 1)
                if (ppc_md.hpte_remove(hpteg) < 0) {
                        r = -1;
-                       goto out;
+                       goto out_unlock;
                }
 
        ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags,
-                                MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M);
+                                hpsize, hpsize, MMU_SEGSIZE_256M);
 
        if (ret < 0) {
                /* If we couldn't map a primary PTE, try a secondary */
@@ -152,8 +182,6 @@ map_again:
                attempt++;
                goto map_again;
        } else {
-               struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
-
                trace_kvm_book3s_64_mmu_map(rflags, hpteg,
                                            vpn, hpaddr, orig_pte);
 
@@ -164,19 +192,37 @@ map_again:
                        hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
                }
 
-               pte->slot = hpteg + (ret & 7);
-               pte->host_vpn = vpn;
-               pte->pte = *orig_pte;
-               pte->pfn = hpaddr >> PAGE_SHIFT;
+               cpte->slot = hpteg + (ret & 7);
+               cpte->host_vpn = vpn;
+               cpte->pte = *orig_pte;
+               cpte->pfn = pfn;
+               cpte->pagesize = hpsize;
 
-               kvmppc_mmu_hpte_cache_map(vcpu, pte);
+               kvmppc_mmu_hpte_cache_map(vcpu, cpte);
+               cpte = NULL;
        }
-       kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
+
+out_unlock:
+       spin_unlock(&kvm->mmu_lock);
+       kvm_release_pfn_clean(pfn);
+       if (cpte)
+               kvmppc_mmu_hpte_cache_free(cpte);
 
 out:
        return r;
 }
 
+void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+       u64 mask = 0xfffffffffULL;
+       u64 vsid;
+
+       vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid);
+       if (vsid & VSID_64K)
+               mask = 0xffffffff0ULL;
+       kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask);
+}
+
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 {
        struct kvmppc_sid_map *map;
@@ -291,6 +337,12 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
        slb_vsid &= ~SLB_VSID_KP;
        slb_esid |= slb_index;
 
+#ifdef CONFIG_PPC_64K_PAGES
+       /* Set host segment base page size to 64K if possible */
+       if (gvsid & VSID_64K)
+               slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp;
+#endif
+
        svcpu->slb[slb_index].esid = slb_esid;
        svcpu->slb[slb_index].vsid = slb_vsid;
 
@@ -326,7 +378,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
        svcpu_put(svcpu);
 }
 
-void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu)
 {
        kvmppc_mmu_hpte_destroy(vcpu);
        __destroy_context(to_book3s(vcpu)->context_id[0]);
index 043eec8..f3ff587 100644 (file)
@@ -260,10 +260,6 @@ int kvmppc_mmu_hv_init(void)
        return 0;
 }
 
-void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
-{
-}
-
 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 {
        kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
@@ -451,7 +447,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
 }
 
 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                       struct kvmppc_pte *gpte, bool data)
+                       struct kvmppc_pte *gpte, bool data, bool iswrite)
 {
        struct kvm *kvm = vcpu->kvm;
        struct kvmppc_slb *slbe;
@@ -906,21 +902,22 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
        return 0;
 }
 
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
 {
        if (kvm->arch.using_mmu_notifiers)
                kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
        return 0;
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 {
        if (kvm->arch.using_mmu_notifiers)
                kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
        return 0;
 }
 
-void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
+                                 struct kvm_memory_slot *memslot)
 {
        unsigned long *rmapp;
        unsigned long gfn;
@@ -994,7 +991,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
        return ret;
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva)
 {
        if (!kvm->arch.using_mmu_notifiers)
                return 0;
@@ -1032,14 +1029,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
        return ret;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
 {
        if (!kvm->arch.using_mmu_notifiers)
                return 0;
        return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
 }
 
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
        if (!kvm->arch.using_mmu_notifiers)
                return;
@@ -1512,9 +1509,8 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 
                                kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
                                        (VRMA_VSID << SLB_VSID_SHIFT_1T);
-                               lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
-                               lpcr |= senc << (LPCR_VRMASD_SH - 4);
-                               kvm->arch.lpcr = lpcr;
+                               lpcr = senc << (LPCR_VRMASD_SH - 4);
+                               kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
                                rma_setup = 1;
                        }
                        ++i;
index 30c2f3b..2c25f54 100644 (file)
@@ -74,3 +74,4 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
        /* Didn't find the liobn, punt it to userspace */
        return H_TOO_HARD;
 }
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
index 360ce68..99d40f8 100644 (file)
@@ -86,8 +86,8 @@ static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level)
        return true;
 }
 
-int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                           unsigned int inst, int *advance)
+int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                             unsigned int inst, int *advance)
 {
        int emulated = EMULATE_DONE;
        int rt = get_rt(inst);
@@ -172,7 +172,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        vcpu->arch.mmu.tlbie(vcpu, addr, large);
                        break;
                }
-#ifdef CONFIG_KVM_BOOK3S_64_PR
+#ifdef CONFIG_PPC_BOOK3S_64
                case OP_31_XOP_FAKE_SC1:
                {
                        /* SC 1 papr hypercalls */
@@ -267,12 +267,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
                        r = kvmppc_st(vcpu, &addr, 32, zeros, true);
                        if ((r == -ENOENT) || (r == -EPERM)) {
-                               struct kvmppc_book3s_shadow_vcpu *svcpu;
-
-                               svcpu = svcpu_get(vcpu);
                                *advance = 0;
                                vcpu->arch.shared->dar = vaddr;
-                               svcpu->fault_dar = vaddr;
+                               vcpu->arch.fault_dar = vaddr;
 
                                dsisr = DSISR_ISSTORE;
                                if (r == -ENOENT)
@@ -281,8 +278,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                        dsisr |= DSISR_PROTFAULT;
 
                                vcpu->arch.shared->dsisr = dsisr;
-                               svcpu->fault_dsisr = dsisr;
-                               svcpu_put(svcpu);
+                               vcpu->arch.fault_dsisr = dsisr;
 
                                kvmppc_book3s_queue_irqprio(vcpu,
                                        BOOK3S_INTERRUPT_DATA_STORAGE);
@@ -349,7 +345,7 @@ static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn)
        return bat;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
        int emulated = EMULATE_DONE;
 
@@ -472,7 +468,7 @@ unprivileged:
        return emulated;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
        int emulated = EMULATE_DONE;
 
index 7057a02..852989a 100644 (file)
 #include <linux/export.h>
 #include <asm/kvm_book3s.h>
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
-#else
+#endif
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
 #ifdef CONFIG_ALTIVEC
index 62a2b5a..072287f 100644 (file)
@@ -52,6 +52,9 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
+#include <linux/module.h>
+
+#include "book3s.h"
 
 /* #define EXIT_DEBUG */
 /* #define EXIT_DEBUG_SIMPLE */
@@ -66,7 +69,7 @@
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
-void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
        int me;
        int cpu = vcpu->cpu;
@@ -125,7 +128,7 @@ void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
  * purely defensive; they should never fail.)
  */
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
@@ -143,7 +146,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        spin_unlock(&vcpu->arch.tbacct_lock);
 }
 
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
@@ -155,17 +158,46 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
        spin_unlock(&vcpu->arch.tbacct_lock);
 }
 
-void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
 {
        vcpu->arch.shregs.msr = msr;
        kvmppc_end_cede(vcpu);
 }
 
-void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 {
        vcpu->arch.pvr = pvr;
 }
 
+int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
+{
+       unsigned long pcr = 0;
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       if (arch_compat) {
+               if (!cpu_has_feature(CPU_FTR_ARCH_206))
+                       return -EINVAL; /* 970 has no compat mode support */
+
+               switch (arch_compat) {
+               case PVR_ARCH_205:
+                       pcr = PCR_ARCH_205;
+                       break;
+               case PVR_ARCH_206:
+               case PVR_ARCH_206p:
+                       break;
+               default:
+                       return -EINVAL;
+               }
+       }
+
+       spin_lock(&vc->lock);
+       vc->arch_compat = arch_compat;
+       vc->pcr = pcr;
+       spin_unlock(&vc->lock);
+
+       return 0;
+}
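
(Editor's note: a hedged sketch of how userspace might drive this new compat-mode plumbing through the ONE_REG interface once the KVM_REG_PPC_ARCH_COMPAT case further down is wired up. vcpu_fd and the helper name are illustrative assumptions, not part of this patch.)

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Hypothetical userspace helper: put a vcpu into an older
	 * architecture compatibility mode.  Pass a logical PVR such as
	 * the kernel's PVR_ARCH_205 (asm/reg.h, not uapi).
	 */
	static int set_arch_compat(int vcpu_fd, uint32_t logical_pvr)
	{
		uint32_t compat = logical_pvr;
		struct kvm_one_reg reg = {
			.id   = KVM_REG_PPC_ARCH_COMPAT,
			.addr = (uintptr_t)&compat,
		};

		/* Returns -EINVAL on 970, which has no compat mode. */
		return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
	}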
+
 void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 {
        int r;
@@ -195,7 +227,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
                pr_err("  ESID = %.16llx VSID = %.16llx\n",
                       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
        pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
-              vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
+              vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
               vcpu->arch.last_inst);
 }
 
@@ -489,7 +521,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
        memset(dt, 0, sizeof(struct dtl_entry));
        dt->dispatch_reason = 7;
        dt->processor_id = vc->pcpu + vcpu->arch.ptid;
-       dt->timebase = now;
+       dt->timebase = now + vc->tb_offset;
        dt->enqueue_to_dispatch_time = stolen;
        dt->srr0 = kvmppc_get_pc(vcpu);
        dt->srr1 = vcpu->arch.shregs.msr;
@@ -538,6 +570,15 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                }
                break;
        case H_CONFER:
+               target = kvmppc_get_gpr(vcpu, 4);
+               if (target == -1)
+                       break;
+               tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+               if (!tvcpu) {
+                       ret = H_PARAMETER;
+                       break;
+               }
+               kvm_vcpu_yield_to(tvcpu);
                break;
        case H_REGISTER_VPA:
                ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
@@ -576,8 +617,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
        return RESUME_GUEST;
 }
 
-static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                             struct task_struct *tsk)
+static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                struct task_struct *tsk)
 {
        int r = RESUME_HOST;
 
@@ -671,16 +712,16 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
                        vcpu->arch.trap, kvmppc_get_pc(vcpu),
                        vcpu->arch.shregs.msr);
+               run->hw.hardware_exit_reason = vcpu->arch.trap;
                r = RESUME_HOST;
-               BUG();
                break;
        }
 
        return r;
 }
 
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                 struct kvm_sregs *sregs)
+static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
+                                           struct kvm_sregs *sregs)
 {
        int i;
 
@@ -694,12 +735,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                 struct kvm_sregs *sregs)
+static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
+                                           struct kvm_sregs *sregs)
 {
        int i, j;
 
-       kvmppc_set_pvr(vcpu, sregs->pvr);
+       kvmppc_set_pvr_hv(vcpu, sregs->pvr);
 
        j = 0;
        for (i = 0; i < vcpu->arch.slb_nr; i++) {
@@ -714,7 +755,23 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
+static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       u64 mask;
+
+       spin_lock(&vc->lock);
+       /*
+        * Userspace can only modify DPFD (default prefetch depth),
+        * ILE (interrupt little-endian) and TC (translation control).
+        */
+       mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
+       vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
+       spin_unlock(&vc->lock);
+}
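
(Editor's note: kvmppc_set_lpcr() is reached from the new KVM_REG_PPC_LPCR one-reg case added below. A hedged userspace sketch of flipping the interrupt-little-endian bit; vcpu_fd is assumed, and LPCR_ILE mirrors the kernel's asm/reg.h bit rather than a uapi constant.)

	/* Hypothetical: turn on little-endian guest interrupts.  Bits
	 * outside DPFD|ILE|TC are preserved by kvmppc_set_lpcr()
	 * regardless of what userspace writes here.
	 */
	static int set_guest_ile(int vcpu_fd, uint64_t lpcr_ile_bit)
	{
		uint64_t lpcr;
		struct kvm_one_reg reg = {
			.id   = KVM_REG_PPC_LPCR,
			.addr = (uintptr_t)&lpcr,
		};

		if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
			return -1;
		lpcr |= lpcr_ile_bit;
		return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
	}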
+
+static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
+                                union kvmppc_one_reg *val)
 {
        int r = 0;
        long int i;
@@ -749,6 +806,12 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
                i = id - KVM_REG_PPC_PMC1;
                *val = get_reg_val(id, vcpu->arch.pmc[i]);
                break;
+       case KVM_REG_PPC_SIAR:
+               *val = get_reg_val(id, vcpu->arch.siar);
+               break;
+       case KVM_REG_PPC_SDAR:
+               *val = get_reg_val(id, vcpu->arch.sdar);
+               break;
 #ifdef CONFIG_VSX
        case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
                if (cpu_has_feature(CPU_FTR_VSX)) {
@@ -787,6 +850,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
                val->vpaval.length = vcpu->arch.dtl.len;
                spin_unlock(&vcpu->arch.vpa_update_lock);
                break;
+       case KVM_REG_PPC_TB_OFFSET:
+               *val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
+               break;
+       case KVM_REG_PPC_LPCR:
+               *val = get_reg_val(id, vcpu->arch.vcore->lpcr);
+               break;
+       case KVM_REG_PPC_PPR:
+               *val = get_reg_val(id, vcpu->arch.ppr);
+               break;
+       case KVM_REG_PPC_ARCH_COMPAT:
+               *val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -795,7 +870,8 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
        return r;
 }
 
-int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
+static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
+                                union kvmppc_one_reg *val)
 {
        int r = 0;
        long int i;
@@ -833,6 +909,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
                i = id - KVM_REG_PPC_PMC1;
                vcpu->arch.pmc[i] = set_reg_val(id, *val);
                break;
+       case KVM_REG_PPC_SIAR:
+               vcpu->arch.siar = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_SDAR:
+               vcpu->arch.sdar = set_reg_val(id, *val);
+               break;
 #ifdef CONFIG_VSX
        case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
                if (cpu_has_feature(CPU_FTR_VSX)) {
@@ -880,6 +962,20 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
                len -= len % sizeof(struct dtl_entry);
                r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
                break;
+       case KVM_REG_PPC_TB_OFFSET:
+               /* round up to multiple of 2^24 */
+               vcpu->arch.vcore->tb_offset =
+                       ALIGN(set_reg_val(id, *val), 1UL << 24);
+               break;
+       case KVM_REG_PPC_LPCR:
+               kvmppc_set_lpcr(vcpu, set_reg_val(id, *val));
+               break;
+       case KVM_REG_PPC_PPR:
+               vcpu->arch.ppr = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_ARCH_COMPAT:
+               r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
+               break;
        default:
                r = -EINVAL;
                break;
@@ -888,14 +984,8 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
        return r;
 }
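
(Editor's note: the 2^24 rounding in the KVM_REG_PPC_TB_OFFSET case above matches what the hardware can actually apply: TBU40 writes only the upper 40 bits of the 64-bit timebase. An illustrative expansion of the ALIGN() arithmetic, sketch only.)

	/* ALIGN(off, 1UL << 24) rounds up to the next multiple of 2^24,
	 * the smallest granule mtspr(SPRN_TBU40) can express, since the
	 * low 24 timebase bits are left untouched by that SPR.
	 */
	static unsigned long tb_offset_round(unsigned long off)
	{
		return (off + (1UL << 24) - 1) & ~((1UL << 24) - 1);
	}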
 
-int kvmppc_core_check_processor_compat(void)
-{
-       if (cpu_has_feature(CPU_FTR_HVMODE))
-               return 0;
-       return -EIO;
-}
-
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
+                                                  unsigned int id)
 {
        struct kvm_vcpu *vcpu;
        int err = -EINVAL;
@@ -919,8 +1009,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        vcpu->arch.mmcr[0] = MMCR0_FC;
        vcpu->arch.ctrl = CTRL_RUNLATCH;
        /* default to host PVR, since we can't spoof it */
-       vcpu->arch.pvr = mfspr(SPRN_PVR);
-       kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+       kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
        spin_lock_init(&vcpu->arch.vpa_update_lock);
        spin_lock_init(&vcpu->arch.tbacct_lock);
        vcpu->arch.busy_preempt = TB_NIL;
@@ -940,6 +1029,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
                        spin_lock_init(&vcore->lock);
                        init_waitqueue_head(&vcore->wq);
                        vcore->preempt_tb = TB_NIL;
+                       vcore->lpcr = kvm->arch.lpcr;
                }
                kvm->arch.vcores[core] = vcore;
                kvm->arch.online_vcores++;
@@ -972,7 +1062,7 @@ static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
                                        vpa->dirty);
 }
 
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
 {
        spin_lock(&vcpu->arch.vpa_update_lock);
        unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
@@ -983,6 +1073,12 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
        kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
+static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
+{
+       /* Indicate we want to get back into the guest */
+       return 1;
+}
+
 static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 {
        unsigned long dec_nsec, now;
@@ -1264,8 +1360,8 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 
                ret = RESUME_GUEST;
                if (vcpu->arch.trap)
-                       ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
-                                                vcpu->arch.run_task);
+                       ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
+                                                   vcpu->arch.run_task);
 
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
@@ -1424,7 +1520,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        return vcpu->arch.ret;
 }
 
-int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
        int r;
        int srcu_idx;
@@ -1546,7 +1642,8 @@ static const struct file_operations kvm_rma_fops = {
        .release        = kvm_rma_release,
 };
 
-long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
+static long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
+                                     struct kvm_allocate_rma *ret)
 {
        long fd;
        struct kvm_rma_info *ri;
@@ -1592,7 +1689,8 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
        (*sps)++;
 }
 
-int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
+static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
+                                        struct kvm_ppc_smmu_info *info)
 {
        struct kvm_ppc_one_seg_page_size *sps;
 
@@ -1613,7 +1711,8 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
+                                        struct kvm_dirty_log *log)
 {
        struct kvm_memory_slot *memslot;
        int r;
@@ -1667,8 +1766,8 @@ static void unpin_slot(struct kvm_memory_slot *memslot)
        }
 }
 
-void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
-                             struct kvm_memory_slot *dont)
+static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
+                                       struct kvm_memory_slot *dont)
 {
        if (!dont || free->arch.rmap != dont->arch.rmap) {
                vfree(free->arch.rmap);
@@ -1681,8 +1780,8 @@ void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
        }
 }
 
-int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
-                              unsigned long npages)
+static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
+                                        unsigned long npages)
 {
        slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
        if (!slot->arch.rmap)
@@ -1692,9 +1791,9 @@ int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
        return 0;
 }
 
-int kvmppc_core_prepare_memory_region(struct kvm *kvm,
-                                     struct kvm_memory_slot *memslot,
-                                     struct kvm_userspace_memory_region *mem)
+static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot,
+                                       struct kvm_userspace_memory_region *mem)
 {
        unsigned long *phys;
 
@@ -1710,9 +1809,9 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
        return 0;
 }
 
-void kvmppc_core_commit_memory_region(struct kvm *kvm,
-                                     struct kvm_userspace_memory_region *mem,
-                                     const struct kvm_memory_slot *old)
+static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_memory_slot *old)
 {
        unsigned long npages = mem->memory_size >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot;
@@ -1729,6 +1828,37 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
        }
 }
 
+/*
+ * Update LPCR values in kvm->arch and in vcores.
+ * Caller must hold kvm->lock.
+ */
+void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
+{
+       long int i;
+       u32 cores_done = 0;
+
+       if ((kvm->arch.lpcr & mask) == lpcr)
+               return;
+
+       kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
+
+       for (i = 0; i < KVM_MAX_VCORES; ++i) {
+               struct kvmppc_vcore *vc = kvm->arch.vcores[i];
+               if (!vc)
+                       continue;
+               spin_lock(&vc->lock);
+               vc->lpcr = (vc->lpcr & ~mask) | lpcr;
+               spin_unlock(&vc->lock);
+               if (++cores_done >= kvm->arch.online_vcores)
+                       break;
+       }
+}
+
+static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
+{
+       return;
+}
+
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
        int err = 0;
@@ -1737,7 +1867,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
        unsigned long hva;
        struct kvm_memory_slot *memslot;
        struct vm_area_struct *vma;
-       unsigned long lpcr, senc;
+       unsigned long lpcr = 0, senc;
+       unsigned long lpcr_mask = 0;
        unsigned long psize, porder;
        unsigned long rma_size;
        unsigned long rmls;
@@ -1802,9 +1933,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
                senc = slb_pgsize_encoding(psize);
                kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
                        (VRMA_VSID << SLB_VSID_SHIFT_1T);
-               lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
-               lpcr |= senc << (LPCR_VRMASD_SH - 4);
-               kvm->arch.lpcr = lpcr;
+               lpcr_mask = LPCR_VRMASD;
+               /* the -4 is to account for senc values starting at 0x10 */
+               lpcr = senc << (LPCR_VRMASD_SH - 4);
 
                /* Create HPTEs in the hash page table for the VRMA */
                kvmppc_map_vrma(vcpu, memslot, porder);
@@ -1825,23 +1956,21 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
                kvm->arch.rma = ri;
 
                /* Update LPCR and RMOR */
-               lpcr = kvm->arch.lpcr;
                if (cpu_has_feature(CPU_FTR_ARCH_201)) {
                        /* PPC970; insert RMLS value (split field) in HID4 */
-                       lpcr &= ~((1ul << HID4_RMLS0_SH) |
-                                 (3ul << HID4_RMLS2_SH));
-                       lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
+                       lpcr_mask = (1ul << HID4_RMLS0_SH) |
+                               (3ul << HID4_RMLS2_SH) | HID4_RMOR;
+                       lpcr = ((rmls >> 2) << HID4_RMLS0_SH) |
                                ((rmls & 3) << HID4_RMLS2_SH);
                        /* RMOR is also in HID4 */
                        lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
                                << HID4_RMOR_SH;
                } else {
                        /* POWER7 */
-                       lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
-                       lpcr |= rmls << LPCR_RMLS_SH;
+                       lpcr_mask = LPCR_VPM0 | LPCR_VRMA_L | LPCR_RMLS;
+                       lpcr = rmls << LPCR_RMLS_SH;
                        kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT;
                }
-               kvm->arch.lpcr = lpcr;
                pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
                        ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 
@@ -1860,6 +1989,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
                }
        }
 
+       kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
+
        /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
        smp_wmb();
        kvm->arch.rma_setup_done = 1;
@@ -1875,7 +2006,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
        goto out_srcu;
 }
 
-int kvmppc_core_init_vm(struct kvm *kvm)
+static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
        unsigned long lpcr, lpid;
 
@@ -1893,9 +2024,6 @@ int kvmppc_core_init_vm(struct kvm *kvm)
         */
        cpumask_setall(&kvm->arch.need_tlb_flush);
 
-       INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
-       INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
-
        kvm->arch.rma = NULL;
 
        kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
@@ -1931,61 +2059,162 @@ int kvmppc_core_init_vm(struct kvm *kvm)
        return 0;
 }
 
-void kvmppc_core_destroy_vm(struct kvm *kvm)
+static void kvmppc_free_vcores(struct kvm *kvm)
+{
+       long int i;
+
+       for (i = 0; i < KVM_MAX_VCORES; ++i)
+               kfree(kvm->arch.vcores[i]);
+       kvm->arch.online_vcores = 0;
+}
+
+static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
        uninhibit_secondary_onlining();
 
+       kvmppc_free_vcores(kvm);
        if (kvm->arch.rma) {
                kvm_release_rma(kvm->arch.rma);
                kvm->arch.rma = NULL;
        }
 
-       kvmppc_rtas_tokens_free(kvm);
-
        kvmppc_free_hpt(kvm);
-       WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
 
-/* These are stubs for now */
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+/* We don't need to emulate any privileged instructions or dcbz */
+static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                    unsigned int inst, int *advance)
 {
+       return EMULATE_FAIL;
 }
 
-/* We don't need to emulate any privileged instructions or dcbz */
-int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                           unsigned int inst, int *advance)
+static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
+                                       ulong spr_val)
 {
        return EMULATE_FAIL;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
+                                       ulong *spr_val)
 {
        return EMULATE_FAIL;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+static int kvmppc_core_check_processor_compat_hv(void)
 {
-       return EMULATE_FAIL;
+       if (!cpu_has_feature(CPU_FTR_HVMODE))
+               return -EIO;
+       return 0;
 }
 
-static int kvmppc_book3s_hv_init(void)
+static long kvm_arch_vm_ioctl_hv(struct file *filp,
+                                unsigned int ioctl, unsigned long arg)
 {
-       int r;
+       struct kvm *kvm __maybe_unused = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       long r;
 
-       r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+       switch (ioctl) {
 
-       if (r)
+       case KVM_ALLOCATE_RMA: {
+               struct kvm_allocate_rma rma;
+               struct kvm *kvm = filp->private_data;
+
+               r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
+               if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
+                       r = -EFAULT;
+               break;
+       }
+
+       case KVM_PPC_ALLOCATE_HTAB: {
+               u32 htab_order;
+
+               r = -EFAULT;
+               if (get_user(htab_order, (u32 __user *)argp))
+                       break;
+               r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+               if (r)
+                       break;
+               r = -EFAULT;
+               if (put_user(htab_order, (u32 __user *)argp))
+                       break;
+               r = 0;
+               break;
+       }
+
+       case KVM_PPC_GET_HTAB_FD: {
+               struct kvm_get_htab_fd ghf;
+
+               r = -EFAULT;
+               if (copy_from_user(&ghf, argp, sizeof(ghf)))
+                       break;
+               r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
+               break;
+       }
+
+       default:
+               r = -ENOTTY;
+       }
+
+       return r;
+}
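
(Editor's note: a hedged usage sketch for the KVM_PPC_ALLOCATE_HTAB case above, which reads an HPT order from userspace and writes back the order actually allocated. vm_fd and the requested order are assumptions.)

	/* Hypothetical caller: request a hashed page table of order 18
	 * (256 KiB); KVM may substitute a different order and reports
	 * the one it used back through the same __u32.
	 */
	uint32_t htab_order = 18;

	if (ioctl(vm_fd, KVM_PPC_ALLOCATE_HTAB, &htab_order) == 0)
		printf("HPT allocated, order %u\n", htab_order);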
+
+static struct kvmppc_ops kvm_ops_hv = {
+       .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
+       .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
+       .get_one_reg = kvmppc_get_one_reg_hv,
+       .set_one_reg = kvmppc_set_one_reg_hv,
+       .vcpu_load   = kvmppc_core_vcpu_load_hv,
+       .vcpu_put    = kvmppc_core_vcpu_put_hv,
+       .set_msr     = kvmppc_set_msr_hv,
+       .vcpu_run    = kvmppc_vcpu_run_hv,
+       .vcpu_create = kvmppc_core_vcpu_create_hv,
+       .vcpu_free   = kvmppc_core_vcpu_free_hv,
+       .check_requests = kvmppc_core_check_requests_hv,
+       .get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv,
+       .flush_memslot  = kvmppc_core_flush_memslot_hv,
+       .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
+       .commit_memory_region  = kvmppc_core_commit_memory_region_hv,
+       .unmap_hva = kvm_unmap_hva_hv,
+       .unmap_hva_range = kvm_unmap_hva_range_hv,
+       .age_hva  = kvm_age_hva_hv,
+       .test_age_hva = kvm_test_age_hva_hv,
+       .set_spte_hva = kvm_set_spte_hva_hv,
+       .mmu_destroy  = kvmppc_mmu_destroy_hv,
+       .free_memslot = kvmppc_core_free_memslot_hv,
+       .create_memslot = kvmppc_core_create_memslot_hv,
+       .init_vm =  kvmppc_core_init_vm_hv,
+       .destroy_vm = kvmppc_core_destroy_vm_hv,
+       .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
+       .emulate_op = kvmppc_core_emulate_op_hv,
+       .emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
+       .emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
+       .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
+       .arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
+};
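
(Editor's note: this ops table is the point of all the _hv renames in this file — the former global entry points become statics reached through function pointers, so the HV and PR flavours can coexist. A sketch of the generic-side dispatch this implies; the wrapper lives in common code, not in this hunk, and the pointer name is an assumption beyond the kvmppc_hv_ops assignment visible below.)

	/* Illustrative only: common code selects an ops table and calls
	 * through it instead of linking directly to one implementation.
	 */
	int kvmppc_core_init_vm(struct kvm *kvm)
	{
		return kvmppc_ops->init_vm(kvm);  /* kvmppc_ops: assumed */
	}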
+
+static int kvmppc_book3s_init_hv(void)
+{
+       int r;
+       /*
+        * FIXME!! Do we need to check on all CPUs?

+        */
+       r = kvmppc_core_check_processor_compat_hv();
+       if (r < 0)
                return r;
 
-       r = kvmppc_mmu_hv_init();
+       kvm_ops_hv.owner = THIS_MODULE;
+       kvmppc_hv_ops = &kvm_ops_hv;
 
+       r = kvmppc_mmu_hv_init();
        return r;
 }
 
-static void kvmppc_book3s_hv_exit(void)
+static void kvmppc_book3s_exit_hv(void)
 {
-       kvm_exit();
+       kvmppc_hv_ops = NULL;
 }
 
-module_init(kvmppc_book3s_hv_init);
-module_exit(kvmppc_book3s_hv_exit);
+module_init(kvmppc_book3s_init_hv);
+module_exit(kvmppc_book3s_exit_hv);
+MODULE_LICENSE("GPL");
index 37f1cc4..928142c 100644 (file)
@@ -158,9 +158,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
  * Interrupts are enabled again at this point.
  */
 
-.global kvmppc_handler_highmem
-kvmppc_handler_highmem:
-
        /*
         * Register usage at this point:
         *
index c71103b..bc8de75 100644 (file)
 #error Need to fix lppaca and SLB shadow accesses in little endian mode
 #endif
 
-/*****************************************************************************
- *                                                                           *
- *        Real Mode handlers that need to be in the linear mapping           *
- *                                                                           *
- ****************************************************************************/
-
-       .globl  kvmppc_skip_interrupt
-kvmppc_skip_interrupt:
-       mfspr   r13,SPRN_SRR0
-       addi    r13,r13,4
-       mtspr   SPRN_SRR0,r13
-       GET_SCRATCH0(r13)
-       rfid
-       b       .
-
-       .globl  kvmppc_skip_Hinterrupt
-kvmppc_skip_Hinterrupt:
-       mfspr   r13,SPRN_HSRR0
-       addi    r13,r13,4
-       mtspr   SPRN_HSRR0,r13
-       GET_SCRATCH0(r13)
-       hrfid
-       b       .
-
 /*
  * Call kvmppc_hv_entry in real mode.
  * Must be called with interrupts hard-disabled.
@@ -66,8 +42,11 @@ kvmppc_skip_Hinterrupt:
  * LR = return address to continue at after eventually re-enabling MMU
  */
 _GLOBAL(kvmppc_hv_entry_trampoline)
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -112(r1)
        mfmsr   r10
-       LOAD_REG_ADDR(r5, kvmppc_hv_entry)
+       LOAD_REG_ADDR(r5, kvmppc_call_hv_entry)
        li      r0,MSR_RI
        andc    r0,r10,r0
        li      r6,MSR_IR | MSR_DR
@@ -77,11 +56,103 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
        mtsrr1  r6
        RFI
 
-/******************************************************************************
- *                                                                            *
- *                               Entry code                                   *
- *                                                                            *
- *****************************************************************************/
+kvmppc_call_hv_entry:
+       bl      kvmppc_hv_entry
+
+       /* Back from guest - restore host state and return to caller */
+
+       /* Restore host DABR and DABRX */
+       ld      r5,HSTATE_DABR(r13)
+       li      r6,7
+       mtspr   SPRN_DABR,r5
+       mtspr   SPRN_DABRX,r6
+
+       /* Restore SPRG3 */
+       ld      r3,PACA_SPRG3(r13)
+       mtspr   SPRN_SPRG3,r3
+
+       /*
+        * Reload DEC.  HDEC interrupts were disabled when
+        * we reloaded the host's LPCR value.
+        */
+       ld      r3, HSTATE_DECEXP(r13)
+       mftb    r4
+       subf    r4, r4, r3
+       mtspr   SPRN_DEC, r4
+
+       /* Reload the host's PMU registers */
+       ld      r3, PACALPPACAPTR(r13)  /* is the host using the PMU? */
+       lbz     r4, LPPACA_PMCINUSE(r3)
+       cmpwi   r4, 0
+       beq     23f                     /* skip if not */
+       lwz     r3, HSTATE_PMC(r13)
+       lwz     r4, HSTATE_PMC + 4(r13)
+       lwz     r5, HSTATE_PMC + 8(r13)
+       lwz     r6, HSTATE_PMC + 12(r13)
+       lwz     r8, HSTATE_PMC + 16(r13)
+       lwz     r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+       lwz     r10, HSTATE_PMC + 24(r13)
+       lwz     r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       mtspr   SPRN_PMC1, r3
+       mtspr   SPRN_PMC2, r4
+       mtspr   SPRN_PMC3, r5
+       mtspr   SPRN_PMC4, r6
+       mtspr   SPRN_PMC5, r8
+       mtspr   SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+       mtspr   SPRN_PMC7, r10
+       mtspr   SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       ld      r3, HSTATE_MMCR(r13)
+       ld      r4, HSTATE_MMCR + 8(r13)
+       ld      r5, HSTATE_MMCR + 16(r13)
+       mtspr   SPRN_MMCR1, r4
+       mtspr   SPRN_MMCRA, r5
+       mtspr   SPRN_MMCR0, r3
+       isync
+23:
+
+       /*
+        * For external and machine check interrupts, we need
+        * to call the Linux handler to process the interrupt.
+        * We do that by jumping to absolute address 0x500 for
+        * external interrupts, or the machine_check_fwnmi label
+        * for machine checks (since firmware might have patched
+        * the vector area at 0x200).  The [h]rfid at the end of the
+        * handler will return to the book3s_hv_interrupts.S code.
+        * For other interrupts we do the rfid to get back
+        * to the book3s_hv_interrupts.S code here.
+        */
+       ld      r8, 112+PPC_LR_STKOFF(r1)
+       addi    r1, r1, 112
+       ld      r7, HSTATE_HOST_MSR(r13)
+
+       cmpwi   cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+       cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
+BEGIN_FTR_SECTION
+       beq     11f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* RFI into the highmem handler, or branch to interrupt handler */
+       mfmsr   r6
+       li      r0, MSR_RI
+       andc    r6, r6, r0
+       mtmsrd  r6, 1                   /* Clear RI in MSR */
+       mtsrr0  r8
+       mtsrr1  r7
+       beqa    0x500                   /* external interrupt (PPC970) */
+       beq     cr1, 13f                /* machine check */
+       RFI
+
+       /* On POWER7, we have external interrupts set to use HSRR0/1 */
+11:    mtspr   SPRN_HSRR0, r8
+       mtspr   SPRN_HSRR1, r7
+       ba      0x500
+
+13:    b       machine_check_fwnmi
+
 
 /*
  * We come in here when wakened from nap mode on a secondary hw thread.
@@ -137,7 +208,7 @@ kvm_start_guest:
        cmpdi   r4,0
        /* if we have no vcpu to run, go back to sleep */
        beq     kvm_no_guest
-       b       kvmppc_hv_entry
+       b       30f
 
 27:    /* XXX should handle hypervisor maintenance interrupts etc. here */
        b       kvm_no_guest
@@ -147,6 +218,57 @@ kvm_start_guest:
        stw     r8,HSTATE_SAVED_XIRR(r13)
        b       kvm_no_guest
 
+30:    bl      kvmppc_hv_entry
+
+       /* Back from the guest, go back to nap */
+       /* Clear our vcpu pointer so we don't come back in early */
+       li      r0, 0
+       std     r0, HSTATE_KVM_VCPU(r13)
+       lwsync
+       /* Clear any pending IPI - we're an offline thread */
+       ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r7, XICS_XIRR
+       lwzcix  r3, r5, r7              /* ack any pending interrupt */
+       rlwinm. r0, r3, 0, 0xffffff     /* any pending? */
+       beq     37f
+       sync
+       li      r0, 0xff
+       li      r6, XICS_MFRR
+       stbcix  r0, r5, r6              /* clear the IPI */
+       stwcix  r3, r5, r7              /* EOI it */
+37:    sync
+
+       /* increment the nap count and then go to nap mode */
+       ld      r4, HSTATE_KVM_VCORE(r13)
+       addi    r4, r4, VCORE_NAP_COUNT
+       lwsync                          /* make previous updates visible */
+51:    lwarx   r3, 0, r4
+       addi    r3, r3, 1
+       stwcx.  r3, 0, r4
+       bne     51b
+
+kvm_no_guest:
+       li      r0, KVM_HWTHREAD_IN_NAP
+       stb     r0, HSTATE_HWTHREAD_STATE(r13)
+       li      r3, LPCR_PECE0
+       mfspr   r4, SPRN_LPCR
+       rlwimi  r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
+       mtspr   SPRN_LPCR, r4
+       isync
+       std     r0, HSTATE_SCRATCH0(r13)
+       ptesync
+       ld      r0, HSTATE_SCRATCH0(r13)
+1:     cmpd    r0, r0
+       bne     1b
+       nap
+       b       .
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
 
@@ -159,7 +281,8 @@ kvmppc_hv_entry:
         * all other volatile GPRS = free
         */
        mflr    r0
-       std     r0, HSTATE_VMHANDLER(r13)
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -112(r1)
 
        /* Set partition DABR */
        /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
@@ -200,8 +323,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        ld      r3, VCPU_MMCR(r4)
        ld      r5, VCPU_MMCR + 8(r4)
        ld      r6, VCPU_MMCR + 16(r4)
+       ld      r7, VCPU_SIAR(r4)
+       ld      r8, VCPU_SDAR(r4)
        mtspr   SPRN_MMCR1, r5
        mtspr   SPRN_MMCRA, r6
+       mtspr   SPRN_SIAR, r7
+       mtspr   SPRN_SDAR, r8
        mtspr   SPRN_MMCR0, r3
        isync
 
@@ -254,22 +381,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        /* Save R1 in the PACA */
        std     r1, HSTATE_HOST_R1(r13)
 
-       /* Increment yield count if they have a VPA */
-       ld      r3, VCPU_VPA(r4)
-       cmpdi   r3, 0
-       beq     25f
-       lwz     r5, LPPACA_YIELDCOUNT(r3)
-       addi    r5, r5, 1
-       stw     r5, LPPACA_YIELDCOUNT(r3)
-       li      r6, 1
-       stb     r6, VCPU_VPA_DIRTY(r4)
-25:
        /* Load up DAR and DSISR */
        ld      r5, VCPU_DAR(r4)
        lwz     r6, VCPU_DSISR(r4)
        mtspr   SPRN_DAR, r5
        mtspr   SPRN_DSISR, r6
 
+       li      r6, KVM_GUEST_MODE_HOST_HV
+       stb     r6, HSTATE_IN_GUEST(r13)
+
 BEGIN_FTR_SECTION
        /* Restore AMR and UAMOR, set AMOR to all 1s */
        ld      r5,VCPU_AMR(r4)
@@ -343,7 +463,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        bdnz    28b
        ptesync
 
-22:    li      r0,1
+       /* Add timebase offset onto timebase */
+22:    ld      r8,VCORE_TB_OFFSET(r5)
+       cmpdi   r8,0
+       beq     37f
+       mftb    r6              /* current host timebase */
+       add     r8,r8,r6
+       mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
+       mftb    r7              /* check if lower 24 bits overflowed */
+       clrldi  r6,r6,40
+       clrldi  r7,r7,40
+       cmpld   r7,r6
+       bge     37f
+       addis   r8,r8,0x100     /* if so, increment upper 40 bits */
+       mtspr   SPRN_TBU40,r8
+
+       /* Load guest PCR value to select appropriate compat mode */
+37:    ld      r7, VCORE_PCR(r5)
+       cmpdi   r7, 0
+       beq     38f
+       mtspr   SPRN_PCR, r7
+38:
+       li      r0,1
        stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
        b       10f
 
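(Editor's note: the timebase-offset sequence in the hunk above is terse; roughly the same logic in C, as a hedged paraphrase. mtspr_tbu40() is a made-up helper standing in for mtspr SPRN_TBU40.)

	/* TBU40 sets bits 63..24 of the timebase and leaves the low 24
	 * bits running, so after applying the offset we must detect a
	 * carry out of the low 24 bits and, if one occurred, bump the
	 * upper bits by one (the addis r8,r8,0x100 above).
	 */
	u64 tb  = mftb();		/* current host timebase */
	u64 ntb = tb + vc->tb_offset;

	mtspr_tbu40(ntb);
	if ((mftb() & 0xffffff) < (tb & 0xffffff))
		mtspr_tbu40(ntb + 0x1000000);	/* carry into bit 24 */
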
@@ -353,12 +494,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        beq     20b
 
        /* Set LPCR and RMOR. */
-10:    ld      r8,KVM_LPCR(r9)
+10:    ld      r8,VCORE_LPCR(r5)
        mtspr   SPRN_LPCR,r8
        ld      r8,KVM_RMOR(r9)
        mtspr   SPRN_RMOR,r8
        isync
 
+       /* Increment yield count if they have a VPA */
+       ld      r3, VCPU_VPA(r4)
+       cmpdi   r3, 0
+       beq     25f
+       lwz     r5, LPPACA_YIELDCOUNT(r3)
+       addi    r5, r5, 1
+       stw     r5, LPPACA_YIELDCOUNT(r3)
+       li      r6, 1
+       stb     r6, VCPU_VPA_DIRTY(r4)
+25:
        /* Check if HDEC expires soon */
        mfspr   r3,SPRN_HDEC
        cmpwi   r3,10
@@ -405,7 +556,8 @@ toc_tlbie_lock:
        bne     24b
        isync
 
-       ld      r7,KVM_LPCR(r9)         /* use kvm->arch.lpcr to store HID4 */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r7,VCORE_LPCR(r5)       /* use vcore->lpcr to store HID4 */
        li      r0,0x18f
        rotldi  r0,r0,HID4_LPID5_SH     /* all lpid bits in HID4 = 1 */
        or      r0,r7,r0
@@ -541,7 +693,7 @@ fast_guest_return:
        mtspr   SPRN_HSRR1,r11
 
        /* Activate guest mode, so faults get handled by KVM */
-       li      r9, KVM_GUEST_MODE_GUEST
+       li      r9, KVM_GUEST_MODE_GUEST_HV
        stb     r9, HSTATE_IN_GUEST(r13)
 
        /* Enter guest */
@@ -550,13 +702,15 @@ BEGIN_FTR_SECTION
        ld      r5, VCPU_CFAR(r4)
        mtspr   SPRN_CFAR, r5
 END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+BEGIN_FTR_SECTION
+       ld      r0, VCPU_PPR(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
        ld      r5, VCPU_LR(r4)
        lwz     r6, VCPU_CR(r4)
        mtlr    r5
        mtcr    r6
 
-       ld      r0, VCPU_GPR(R0)(r4)
        ld      r1, VCPU_GPR(R1)(r4)
        ld      r2, VCPU_GPR(R2)(r4)
        ld      r3, VCPU_GPR(R3)(r4)
@@ -570,6 +724,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
        ld      r12, VCPU_GPR(R12)(r4)
        ld      r13, VCPU_GPR(R13)(r4)
 
+BEGIN_FTR_SECTION
+       mtspr   SPRN_PPR, r0
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+       ld      r0, VCPU_GPR(R0)(r4)
        ld      r4, VCPU_GPR(R4)(r4)
 
        hrfid
@@ -584,8 +742,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 /*
  * We come here from the first-level interrupt handlers.
  */
-       .globl  kvmppc_interrupt
-kvmppc_interrupt:
+       .globl  kvmppc_interrupt_hv
+kvmppc_interrupt_hv:
        /*
         * Register contents:
         * R12          = interrupt vector
@@ -595,6 +753,19 @@ kvmppc_interrupt:
         */
        /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
        std     r9, HSTATE_HOST_R2(r13)
+
+       lbz     r9, HSTATE_IN_GUEST(r13)
+       cmpwi   r9, KVM_GUEST_MODE_HOST_HV
+       beq     kvmppc_bad_host_intr
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+       cmpwi   r9, KVM_GUEST_MODE_GUEST
+       ld      r9, HSTATE_HOST_R2(r13)
+       beq     kvmppc_interrupt_pr
+#endif
+       /* We're now back in the host but in guest MMU context */
+       li      r9, KVM_GUEST_MODE_HOST_HV
+       stb     r9, HSTATE_IN_GUEST(r13)
+
        ld      r9, HSTATE_KVM_VCPU(r13)
 
        /* Save registers */
@@ -620,6 +791,10 @@ BEGIN_FTR_SECTION
        ld      r3, HSTATE_CFAR(r13)
        std     r3, VCPU_CFAR(r9)
 END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+BEGIN_FTR_SECTION
+       ld      r4, HSTATE_PPR(r13)
+       std     r4, VCPU_PPR(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
        /* Restore R1/R2 so we can handle faults */
        ld      r1, HSTATE_HOST_R1(r13)
@@ -642,10 +817,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
        std     r3, VCPU_GPR(R13)(r9)
        std     r4, VCPU_LR(r9)
 
-       /* Unset guest mode */
-       li      r0, KVM_GUEST_MODE_NONE
-       stb     r0, HSTATE_IN_GUEST(r13)
-
        stw     r12,VCPU_TRAP(r9)
 
        /* Save HEIR (HV emulation assist reg) in last_inst
@@ -696,46 +867,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
         * set, we know the host wants us out so let's do it now
         */
 do_ext_interrupt:
-       lbz     r0, HSTATE_HOST_IPI(r13)
-       cmpwi   r0, 0
-       bne     ext_interrupt_to_host
-
-       /* Now read the interrupt from the ICP */
-       ld      r5, HSTATE_XICS_PHYS(r13)
-       li      r7, XICS_XIRR
-       cmpdi   r5, 0
-       beq-    ext_interrupt_to_host
-       lwzcix  r3, r5, r7
-       rlwinm. r0, r3, 0, 0xffffff
-       sync
-       beq     3f              /* if nothing pending in the ICP */
-
-       /* We found something in the ICP...
-        *
-        * If it's not an IPI, stash it in the PACA and return to
-        * the host, we don't (yet) handle directing real external
-        * interrupts directly to the guest
-        */
-       cmpwi   r0, XICS_IPI
-       bne     ext_stash_for_host
-
-       /* It's an IPI, clear the MFRR and EOI it */
-       li      r0, 0xff
-       li      r6, XICS_MFRR
-       stbcix  r0, r5, r6              /* clear the IPI */
-       stwcix  r3, r5, r7              /* EOI it */
-       sync
-
-       /* We need to re-check host IPI now in case it got set in the
-        * meantime. If it's clear, we bounce the interrupt to the
-        * guest
-        */
-       lbz     r0, HSTATE_HOST_IPI(r13)
-       cmpwi   r0, 0
-       bne-    1f
+       bl      kvmppc_read_intr
+       cmpdi   r3, 0
+       bgt     ext_interrupt_to_host
 
        /* All right, looks like an IPI for the guest; we need to set MER */
-3:
        /* Check if any CPU is heading out to the host, if so head out too */
        ld      r5, HSTATE_KVM_VCORE(r13)
        lwz     r0, VCORE_ENTRY_EXIT(r5)
@@ -764,27 +900,9 @@ do_ext_interrupt:
        mtspr   SPRN_LPCR, r8
        b       fast_guest_return
 
-       /* We raced with the host, we need to resend that IPI, bummer */
-1:     li      r0, IPI_PRIORITY
-       stbcix  r0, r5, r6              /* set the IPI */
-       sync
-       b       ext_interrupt_to_host
-
-ext_stash_for_host:
-       /* It's not an IPI and it's for the host, stash it in the PACA
-        * before exit, it will be picked up by the host ICP driver
-        */
-       stw     r3, HSTATE_SAVED_XIRR(r13)
 ext_interrupt_to_host:
 
 guest_exit_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
-       /* Save DEC */
-       mfspr   r5,SPRN_DEC
-       mftb    r6
-       extsw   r5,r5
-       add     r5,r5,r6
-       std     r5,VCPU_DEC_EXPIRES(r9)
-
        /* Save more register state  */
        mfdar   r6
        mfdsisr r7
@@ -954,7 +1072,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        mtspr   SPRN_SDR1,r6            /* switch to partition page table */
        mtspr   SPRN_LPID,r7
        isync
-       li      r0,0
+
+       /* Subtract timebase offset from timebase */
+       ld      r8,VCORE_TB_OFFSET(r5)
+       cmpdi   r8,0
+       beq     17f
+       mftb    r6                      /* current host timebase */
+       subf    r8,r8,r6
+       mtspr   SPRN_TBU40,r8           /* update upper 40 bits */
+       mftb    r7                      /* check if lower 24 bits overflowed */
+       clrldi  r6,r6,40
+       clrldi  r7,r7,40
+       cmpld   r7,r6
+       bge     17f
+       addis   r8,r8,0x100             /* if so, increment upper 40 bits */
+       mtspr   SPRN_TBU40,r8
+
+       /* Reset PCR */
+17:    ld      r0, VCORE_PCR(r5)
+       cmpdi   r0, 0
+       beq     18f
+       li      r0, 0
+       mtspr   SPRN_PCR, r0
+18:
+       /* Signal secondary CPUs to continue */
        stb     r0,VCORE_IN_GUEST(r5)
        lis     r8,0x7fff               /* MAX_INT@h */
        mtspr   SPRN_HDEC,r8
@@ -1052,6 +1193,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 1:     addi    r8,r8,16
        .endr
 
+       /* Save DEC */
+       mfspr   r5,SPRN_DEC
+       mftb    r6
+       extsw   r5,r5
+       add     r5,r5,r6
+       std     r5,VCPU_DEC_EXPIRES(r9)
+
        /* Save and reset AMR and UAMOR before turning on the MMU */
 BEGIN_FTR_SECTION
        mfspr   r5,SPRN_AMR
@@ -1062,6 +1210,10 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_AMR,r6
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
+       /* Unset guest mode */
+       li      r0, KVM_GUEST_MODE_NONE
+       stb     r0, HSTATE_IN_GUEST(r13)
+
        /* Switch DSCR back to host value */
 BEGIN_FTR_SECTION
        mfspr   r8, SPRN_DSCR
@@ -1134,9 +1286,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        std     r3, VCPU_MMCR(r9)       /* if not, set saved MMCR0 to FC */
        b       22f
 21:    mfspr   r5, SPRN_MMCR1
+       mfspr   r7, SPRN_SIAR
+       mfspr   r8, SPRN_SDAR
        std     r4, VCPU_MMCR(r9)
        std     r5, VCPU_MMCR + 8(r9)
        std     r6, VCPU_MMCR + 16(r9)
+       std     r7, VCPU_SIAR(r9)
+       std     r8, VCPU_SDAR(r9)
        mfspr   r3, SPRN_PMC1
        mfspr   r4, SPRN_PMC2
        mfspr   r5, SPRN_PMC3
@@ -1158,103 +1314,30 @@ BEGIN_FTR_SECTION
        stw     r11, VCPU_PMC + 28(r9)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 22:
+       ld      r0, 112+PPC_LR_STKOFF(r1)
+       addi    r1, r1, 112
+       mtlr    r0
+       blr
+secondary_too_late:
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       HMT_LOW
+13:    lbz     r3,VCORE_IN_GUEST(r5)
+       cmpwi   r3,0
+       bne     13b
+       HMT_MEDIUM
+       li      r0, KVM_GUEST_MODE_NONE
+       stb     r0, HSTATE_IN_GUEST(r13)
+       ld      r11,PACA_SLBSHADOWPTR(r13)
 
-       /* Secondary threads go off to take a nap on POWER7 */
-BEGIN_FTR_SECTION
-       lwz     r0,VCPU_PTID(r9)
-       cmpwi   r0,0
-       bne     secondary_nap
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-
-       /* Restore host DABR and DABRX */
-       ld      r5,HSTATE_DABR(r13)
-       li      r6,7
-       mtspr   SPRN_DABR,r5
-       mtspr   SPRN_DABRX,r6
-
-       /* Restore SPRG3 */
-       ld      r3,PACA_SPRG3(r13)
-       mtspr   SPRN_SPRG3,r3
-
-       /*
-        * Reload DEC.  HDEC interrupts were disabled when
-        * we reloaded the host's LPCR value.
-        */
-       ld      r3, HSTATE_DECEXP(r13)
-       mftb    r4
-       subf    r4, r4, r3
-       mtspr   SPRN_DEC, r4
-
-       /* Reload the host's PMU registers */
-       ld      r3, PACALPPACAPTR(r13)  /* is the host using the PMU? */
-       lbz     r4, LPPACA_PMCINUSE(r3)
-       cmpwi   r4, 0
-       beq     23f                     /* skip if not */
-       lwz     r3, HSTATE_PMC(r13)
-       lwz     r4, HSTATE_PMC + 4(r13)
-       lwz     r5, HSTATE_PMC + 8(r13)
-       lwz     r6, HSTATE_PMC + 12(r13)
-       lwz     r8, HSTATE_PMC + 16(r13)
-       lwz     r9, HSTATE_PMC + 20(r13)
-BEGIN_FTR_SECTION
-       lwz     r10, HSTATE_PMC + 24(r13)
-       lwz     r11, HSTATE_PMC + 28(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
-       mtspr   SPRN_PMC1, r3
-       mtspr   SPRN_PMC2, r4
-       mtspr   SPRN_PMC3, r5
-       mtspr   SPRN_PMC4, r6
-       mtspr   SPRN_PMC5, r8
-       mtspr   SPRN_PMC6, r9
-BEGIN_FTR_SECTION
-       mtspr   SPRN_PMC7, r10
-       mtspr   SPRN_PMC8, r11
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
-       ld      r3, HSTATE_MMCR(r13)
-       ld      r4, HSTATE_MMCR + 8(r13)
-       ld      r5, HSTATE_MMCR + 16(r13)
-       mtspr   SPRN_MMCR1, r4
-       mtspr   SPRN_MMCRA, r5
-       mtspr   SPRN_MMCR0, r3
-       isync
-23:
-       /*
-        * For external and machine check interrupts, we need
-        * to call the Linux handler to process the interrupt.
-        * We do that by jumping to absolute address 0x500 for
-        * external interrupts, or the machine_check_fwnmi label
-        * for machine checks (since firmware might have patched
-        * the vector area at 0x200).  The [h]rfid at the end of the
-        * handler will return to the book3s_hv_interrupts.S code.
-        * For other interrupts we do the rfid to get back
-        * to the book3s_hv_interrupts.S code here.
-        */
-       ld      r8, HSTATE_VMHANDLER(r13)
-       ld      r7, HSTATE_HOST_MSR(r13)
-
-       cmpwi   cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
-       cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
-BEGIN_FTR_SECTION
-       beq     11f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-
-       /* RFI into the highmem handler, or branch to interrupt handler */
-       mfmsr   r6
-       li      r0, MSR_RI
-       andc    r6, r6, r0
-       mtmsrd  r6, 1                   /* Clear RI in MSR */
-       mtsrr0  r8
-       mtsrr1  r7
-       beqa    0x500                   /* external interrupt (PPC970) */
-       beq     cr1, 13f                /* machine check */
-       RFI
-
-       /* On POWER7, we have external interrupts set to use HSRR0/1 */
-11:    mtspr   SPRN_HSRR0, r8
-       mtspr   SPRN_HSRR1, r7
-       ba      0x500
-
-13:    b       machine_check_fwnmi
+       .rept   SLB_NUM_BOLTED
+       ld      r5,SLBSHADOW_SAVEAREA(r11)
+       ld      r6,SLBSHADOW_SAVEAREA+8(r11)
+       andis.  r7,r5,SLB_ESID_V@h
+       beq     1f
+       slbmte  r6,r5
+1:     addi    r11,r11,16
+       .endr
+       b       22b
 
 /*
  * Check whether an HDSI is an HPTE not found fault or something else.
@@ -1333,7 +1416,7 @@ fast_interrupt_c_return:
        stw     r8, VCPU_LAST_INST(r9)
 
        /* Unset guest mode. */
-       li      r0, KVM_GUEST_MODE_NONE
+       li      r0, KVM_GUEST_MODE_HOST_HV
        stb     r0, HSTATE_IN_GUEST(r13)
        b       guest_exit_cont
 
@@ -1701,67 +1784,70 @@ machine_check_realmode:
        rotldi  r11, r11, 63
        b       fast_interrupt_c_return
 
-secondary_too_late:
-       ld      r5,HSTATE_KVM_VCORE(r13)
-       HMT_LOW
-13:    lbz     r3,VCORE_IN_GUEST(r5)
-       cmpwi   r3,0
-       bne     13b
-       HMT_MEDIUM
-       ld      r11,PACA_SLBSHADOWPTR(r13)
-
-       .rept   SLB_NUM_BOLTED
-       ld      r5,SLBSHADOW_SAVEAREA(r11)
-       ld      r6,SLBSHADOW_SAVEAREA+8(r11)
-       andis.  r7,r5,SLB_ESID_V@h
-       beq     1f
-       slbmte  r6,r5
-1:     addi    r11,r11,16
-       .endr
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ *     0 if no interrupt is pending
+ *     1 if an interrupt is pending that needs to be handled by the host
+ *     -1 if there was a guest wakeup IPI (which has now been cleared)
+ */
+kvmppc_read_intr:
+       /* see if a host IPI is pending */
+       li      r3, 1
+       lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
+       bne     1f
 
-secondary_nap:
-       /* Clear our vcpu pointer so we don't come back in early */
-       li      r0, 0
-       std     r0, HSTATE_KVM_VCPU(r13)
-       lwsync
-       /* Clear any pending IPI - assume we're a secondary thread */
-       ld      r5, HSTATE_XICS_PHYS(r13)
+       /* Now read the interrupt from the ICP */
+       ld      r6, HSTATE_XICS_PHYS(r13)
        li      r7, XICS_XIRR
-       lwzcix  r3, r5, r7              /* ack any pending interrupt */
-       rlwinm. r0, r3, 0, 0xffffff     /* any pending? */
-       beq     37f
+       cmpdi   r6, 0
+       beq-    1f
+       lwzcix  r0, r6, r7
+       rlwinm. r3, r0, 0, 0xffffff
        sync
-       li      r0, 0xff
-       li      r6, XICS_MFRR
-       stbcix  r0, r5, r6              /* clear the IPI */
-       stwcix  r3, r5, r7              /* EOI it */
-37:    sync
+       beq     1f                      /* if nothing pending in the ICP */
 
-       /* increment the nap count and then go to nap mode */
-       ld      r4, HSTATE_KVM_VCORE(r13)
-       addi    r4, r4, VCORE_NAP_COUNT
-       lwsync                          /* make previous updates visible */
-51:    lwarx   r3, 0, r4
-       addi    r3, r3, 1
-       stwcx.  r3, 0, r4
-       bne     51b
+       /* We found something in the ICP...
+        *
+        * If it's not an IPI, stash it in the PACA and return to
+        * the host, we don't (yet) handle directing real external
+        * interrupts directly to the guest
+        */
+       cmpwi   r3, XICS_IPI            /* if there is, is it an IPI? */
+       li      r3, 1
+       bne     42f
 
-kvm_no_guest:
-       li      r0, KVM_HWTHREAD_IN_NAP
-       stb     r0, HSTATE_HWTHREAD_STATE(r13)
+       /* It's an IPI, clear the MFRR and EOI it */
+       li      r3, 0xff
+       li      r8, XICS_MFRR
+       stbcix  r3, r6, r8              /* clear the IPI */
+       stwcix  r0, r6, r7              /* EOI it */
+       sync
 
-       li      r3, LPCR_PECE0
-       mfspr   r4, SPRN_LPCR
-       rlwimi  r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
-       mtspr   SPRN_LPCR, r4
-       isync
-       std     r0, HSTATE_SCRATCH0(r13)
-       ptesync
-       ld      r0, HSTATE_SCRATCH0(r13)
-1:     cmpd    r0, r0
-       bne     1b
-       nap
-       b       .
+       /* We need to re-check host IPI now in case it got set in the
+        * meantime. If it's clear, we bounce the interrupt to the
+        * guest
+        */
+       lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
+       bne-    43f
+
+       /* OK, it's an IPI for us */
+       li      r3, -1
+1:     blr
+
+42:    /* It's not an IPI and it's for the host, stash it in the PACA
+        * before exit, it will be picked up by the host ICP driver
+        */
+       stw     r0, HSTATE_SAVED_XIRR(r13)
+       b       1b
+
+43:    /* We raced with the host, we need to resend that IPI, bummer */
+       li      r0, IPI_PRIORITY
+       stbcix  r0, r6, r8              /* set the IPI */
+       sync
+       b       1b
 
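(Editor's note: the new kvmppc_read_intr helper above is one of the denser pieces of this patch, so here is its control flow as a hedged C paraphrase. xics_read()/xics_write() stand in for the cache-inhibited lwzcix/stbcix/stwcix ICP accesses, and the hstate field names are paraphrased from the HSTATE_* offsets; this is not code from the patch.)

	static int read_intr_sketch(void)
	{
		u32 xirr;

		if (local_paca->kvm_hstate.host_ipi)
			return 1;		/* host wants this cpu back */

		xirr = xics_read(XICS_XIRR);
		if (!(xirr & 0xffffff))
			return 0;		/* nothing pending in the ICP */

		if ((xirr & 0xffffff) != XICS_IPI) {
			/* real external: stash for the host ICP driver */
			local_paca->kvm_hstate.saved_xirr = xirr;
			return 1;
		}

		xics_write(XICS_MFRR, 0xff);	/* clear the IPI */
		xics_write(XICS_XIRR, xirr);	/* EOI it */

		if (local_paca->kvm_hstate.host_ipi) {
			/* raced with the host: resend the IPI, bail out */
			xics_write(XICS_MFRR, IPI_PRIORITY);
			return 1;
		}
		return -1;		/* guest wakeup IPI, now cleared */
	}
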
 /*
  * Save away FP, VMX and VSX registers.
@@ -1879,3 +1965,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
        lwz     r7,VCPU_VRSAVE(r4)
        mtspr   SPRN_VRSAVE,r7
        blr
+
+/*
+ * We come here if we get any exception or interrupt while we are
+ * executing host real mode code while in guest MMU context.
+ * For now just spin, but we should do something better.
+ */
+kvmppc_bad_host_intr:
+       b       .
arch/powerpc/kvm/book3s_interrupts.S
index 17cfae5..f4dd041 100644 (file)
 
 #if defined(CONFIG_PPC_BOOK3S_64)
 #define FUNC(name)             GLUE(.,name)
+#define GET_SHADOW_VCPU(reg)    addi   reg, r13, PACA_SVCPU
+
 #elif defined(CONFIG_PPC_BOOK3S_32)
 #define FUNC(name)             name
+#define GET_SHADOW_VCPU(reg)   lwz     reg, (THREAD + THREAD_KVM_SVCPU)(r2)
+
 #endif /* CONFIG_PPC_BOOK3S_XX */
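GET_SHADOW_VCPU reflects where the shadow vcpu lives: in the PACA on 64-bit, off the thread struct on 32-bit. On the C side the same object is reached through svcpu_get()/svcpu_put(); roughly the 64-bit variant (a sketch from the corresponding headers; the preempt bracketing is what keeps the per-cpu PACA pointer stable):

    static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
    {
            preempt_disable();      /* the PACA is per-cpu; stay on this cpu */
            return &get_paca()->shadow_vcpu;
    }

    static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
    {
            preempt_enable();
    }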
 
 #define VCPU_LOAD_NVGPRS(vcpu) \
@@ -87,8 +91,14 @@ kvm_start_entry:
        VCPU_LOAD_NVGPRS(r4)
 
 kvm_start_lightweight:
+       /* Copy registers into shadow vcpu so we can access them in real mode */
+       GET_SHADOW_VCPU(r3)
+       bl      FUNC(kvmppc_copy_to_svcpu)
+       nop
+       REST_GPR(4, r1)
 
 #ifdef CONFIG_PPC_BOOK3S_64
+       /* Get the dcbz32 flag */
        PPC_LL  r3, VCPU_HFLAGS(r4)
        rldicl  r3, r3, 0, 63           /* r3 &= 1 */
        stb     r3, HSTATE_RESTORE_HID5(r13)
@@ -111,9 +121,6 @@ kvm_start_lightweight:
  *
  */
 
-.global kvmppc_handler_highmem
-kvmppc_handler_highmem:
-
        /*
         * Register usage at this point:
         *
@@ -125,18 +132,31 @@ kvmppc_handler_highmem:
         *
         */
 
-       /* R7 = vcpu */
-       PPC_LL  r7, GPR4(r1)
+       /* Transfer reg values from shadow vcpu back to vcpu struct */
+       /* On 64-bit, interrupts are still off at this point */
+       PPC_LL  r3, GPR4(r1)            /* vcpu pointer */
+       GET_SHADOW_VCPU(r4)
+       bl      FUNC(kvmppc_copy_from_svcpu)
+       nop
 
 #ifdef CONFIG_PPC_BOOK3S_64
+       /* Re-enable interrupts */
+       ld      r3, HSTATE_HOST_MSR(r13)
+       ori     r3, r3, MSR_EE
+       MTMSR_EERI(r3)
+
        /*
         * Reload kernel SPRG3 value.
         * No need to save guest value as usermode can't modify SPRG3.
         */
        ld      r3, PACA_SPRG3(r13)
        mtspr   SPRN_SPRG3, r3
+
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
+       /* R7 = vcpu */
+       PPC_LL  r7, GPR4(r1)
+
        PPC_STL r14, VCPU_GPR(R14)(r7)
        PPC_STL r15, VCPU_GPR(R15)(r7)
        PPC_STL r16, VCPU_GPR(R16)(r7)
@@ -161,7 +181,7 @@ kvmppc_handler_highmem:
 
        /* Restore r3 (kvm_run) and r4 (vcpu) */
        REST_2GPRS(3, r1)
-       bl      FUNC(kvmppc_handle_exit)
+       bl      FUNC(kvmppc_handle_exit_pr)
 
        /* If RESUME_GUEST, get back in the loop */
        cmpwi   r3, RESUME_GUEST
arch/powerpc/kvm/book3s_mmu_hpte.c
index da8b13c..5a1ab12 100644 (file)
@@ -28,7 +28,7 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 
-#include "trace.h"
+#include "trace_pr.h"
 
 #define PTE_SIZE       12
 
@@ -56,6 +56,14 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
                       HPTEG_HASH_BITS_VPTE_LONG);
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage)
+{
+       return hash_64((vpage & 0xffffffff0ULL) >> 4,
+                      HPTEG_HASH_BITS_VPTE_64K);
+}
+#endif
+
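The new hash folds away the low 4 bits of the vpage, so all sixteen 4k-sized vpage slots of one 64k guest page land in the same bucket. A standalone check (hash_64() reproduced here as the multiplicative variant the kernel used at the time; the bucket-bits value is assumed):

    #include <stdint.h>
    #include <stdio.h>

    #define HPTEG_HASH_BITS_VPTE_64K 11     /* assumed value, from kvm_host.h */

    static uint64_t hash_64(uint64_t val, unsigned int bits)
    {
            return (val * 0x9e37fffffffc0001ULL) >> (64 - bits);
    }

    static uint64_t vpte_64k_bucket(uint64_t vpage)
    {
            return hash_64((vpage & 0xffffffff0ULL) >> 4,
                           HPTEG_HASH_BITS_VPTE_64K);
    }

    int main(void)
    {
            uint64_t base = 0x123450;       /* vpage of a 64k-aligned page */
            int i;

            for (i = 0; i < 16; i++)        /* all 16 print the same bucket */
                    printf("vpage %llx -> bucket %llu\n",
                           (unsigned long long)(base + i),
                           (unsigned long long)vpte_64k_bucket(base + i));
            return 0;
    }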
 void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
        u64 index;
@@ -83,6 +91,15 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
        hlist_add_head_rcu(&pte->list_vpte_long,
                           &vcpu3s->hpte_hash_vpte_long[index]);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+       /* Add to vPTE_64k list */
+       index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage);
+       hlist_add_head_rcu(&pte->list_vpte_64k,
+                          &vcpu3s->hpte_hash_vpte_64k[index]);
+#endif
+
+       vcpu3s->hpte_cache_count++;
+
        spin_unlock(&vcpu3s->mmu_lock);
 }
 
@@ -113,10 +130,13 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
        hlist_del_init_rcu(&pte->list_pte_long);
        hlist_del_init_rcu(&pte->list_vpte);
        hlist_del_init_rcu(&pte->list_vpte_long);
+#ifdef CONFIG_PPC_BOOK3S_64
+       hlist_del_init_rcu(&pte->list_vpte_64k);
+#endif
+       vcpu3s->hpte_cache_count--;
 
        spin_unlock(&vcpu3s->mmu_lock);
 
-       vcpu3s->hpte_cache_count--;
        call_rcu(&pte->rcu_head, free_pte_rcu);
 }
 
@@ -219,6 +239,29 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
        rcu_read_unlock();
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Flush with mask 0xffffffff0 */
+static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+       struct hlist_head *list;
+       struct hpte_cache *pte;
+       u64 vp_mask = 0xffffffff0ULL;
+
+       list = &vcpu3s->hpte_hash_vpte_64k[
+               kvmppc_mmu_hash_vpte_64k(guest_vp)];
+
+       rcu_read_lock();
+
+       /* Check the list for matching entries and invalidate */
+       hlist_for_each_entry_rcu(pte, list, list_vpte_64k)
+               if ((pte->pte.vpage & vp_mask) == guest_vp)
+                       invalidate_pte(vcpu, pte);
+
+       rcu_read_unlock();
+}
+#endif
+
 /* Flush with mask 0xffffff000 */
 static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
@@ -249,6 +292,11 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
        case 0xfffffffffULL:
                kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
                break;
+#ifdef CONFIG_PPC_BOOK3S_64
+       case 0xffffffff0ULL:
+               kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp);
+               break;
+#endif
        case 0xffffff000ULL:
                kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
                break;
@@ -285,15 +333,19 @@ struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
        struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        struct hpte_cache *pte;
 
-       pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
-       vcpu3s->hpte_cache_count++;
-
        if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)
                kvmppc_mmu_pte_flush_all(vcpu);
 
+       pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
+
        return pte;
 }
 
+void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte)
+{
+       kmem_cache_free(hpte_cache, pte);
+}
+
 void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
 {
        kvmppc_mmu_pte_flush(vcpu, 0, 0);
@@ -320,6 +372,10 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
                                  ARRAY_SIZE(vcpu3s->hpte_hash_vpte));
        kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long,
                                  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long));
+#ifdef CONFIG_PPC_BOOK3S_64
+       kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k,
+                                 ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k));
+#endif
 
        spin_lock_init(&vcpu3s->mmu_lock);
 
arch/powerpc/kvm/book3s_pr.c
index c0b48f9..fe14ca3 100644 (file)
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
+#include <linux/module.h>
 
-#include "trace.h"
+#include "book3s.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_pr.h"
 
 /* #define EXIT_DEBUG */
 /* #define DEBUG_EXT */
@@ -56,29 +60,25 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #define HW_PAGE_SIZE PAGE_SIZE
 #endif
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu)
 {
 #ifdef CONFIG_PPC_BOOK3S_64
        struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
        memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb));
-       memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
-              sizeof(get_paca()->shadow_vcpu));
        svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
        svcpu_put(svcpu);
 #endif
        vcpu->cpu = smp_processor_id();
 #ifdef CONFIG_PPC_BOOK3S_32
-       current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
+       current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu;
 #endif
 }
 
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_PPC_BOOK3S_64
        struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
        memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb));
-       memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
-              sizeof(get_paca()->shadow_vcpu));
        to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
        svcpu_put(svcpu);
 #endif
@@ -87,7 +87,61 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
        vcpu->cpu = -1;
 }
 
-int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
+/* Copy data needed by real-mode code from vcpu to shadow vcpu */
+void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
+                         struct kvm_vcpu *vcpu)
+{
+       svcpu->gpr[0] = vcpu->arch.gpr[0];
+       svcpu->gpr[1] = vcpu->arch.gpr[1];
+       svcpu->gpr[2] = vcpu->arch.gpr[2];
+       svcpu->gpr[3] = vcpu->arch.gpr[3];
+       svcpu->gpr[4] = vcpu->arch.gpr[4];
+       svcpu->gpr[5] = vcpu->arch.gpr[5];
+       svcpu->gpr[6] = vcpu->arch.gpr[6];
+       svcpu->gpr[7] = vcpu->arch.gpr[7];
+       svcpu->gpr[8] = vcpu->arch.gpr[8];
+       svcpu->gpr[9] = vcpu->arch.gpr[9];
+       svcpu->gpr[10] = vcpu->arch.gpr[10];
+       svcpu->gpr[11] = vcpu->arch.gpr[11];
+       svcpu->gpr[12] = vcpu->arch.gpr[12];
+       svcpu->gpr[13] = vcpu->arch.gpr[13];
+       svcpu->cr  = vcpu->arch.cr;
+       svcpu->xer = vcpu->arch.xer;
+       svcpu->ctr = vcpu->arch.ctr;
+       svcpu->lr  = vcpu->arch.lr;
+       svcpu->pc  = vcpu->arch.pc;
+}
+
+/* Copy data touched by real-mode code from shadow vcpu back to vcpu */
+void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
+                           struct kvmppc_book3s_shadow_vcpu *svcpu)
+{
+       vcpu->arch.gpr[0] = svcpu->gpr[0];
+       vcpu->arch.gpr[1] = svcpu->gpr[1];
+       vcpu->arch.gpr[2] = svcpu->gpr[2];
+       vcpu->arch.gpr[3] = svcpu->gpr[3];
+       vcpu->arch.gpr[4] = svcpu->gpr[4];
+       vcpu->arch.gpr[5] = svcpu->gpr[5];
+       vcpu->arch.gpr[6] = svcpu->gpr[6];
+       vcpu->arch.gpr[7] = svcpu->gpr[7];
+       vcpu->arch.gpr[8] = svcpu->gpr[8];
+       vcpu->arch.gpr[9] = svcpu->gpr[9];
+       vcpu->arch.gpr[10] = svcpu->gpr[10];
+       vcpu->arch.gpr[11] = svcpu->gpr[11];
+       vcpu->arch.gpr[12] = svcpu->gpr[12];
+       vcpu->arch.gpr[13] = svcpu->gpr[13];
+       vcpu->arch.cr  = svcpu->cr;
+       vcpu->arch.xer = svcpu->xer;
+       vcpu->arch.ctr = svcpu->ctr;
+       vcpu->arch.lr  = svcpu->lr;
+       vcpu->arch.pc  = svcpu->pc;
+       vcpu->arch.shadow_srr1 = svcpu->shadow_srr1;
+       vcpu->arch.fault_dar   = svcpu->fault_dar;
+       vcpu->arch.fault_dsisr = svcpu->fault_dsisr;
+       vcpu->arch.last_inst   = svcpu->last_inst;
+}
+
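Copying the volatile state by hand is what lets the 64-bit real-mode entry/exit code work on the PACA-resident shadow copy instead of dereferencing the vmalloc'd vcpu struct, which is not mapped in real mode. A sketch of how the two copies pair up around a guest run (the assembly calls them directly via GET_SHADOW_VCPU, as in the book3s_interrupts.S hunk above):

    static void run_guest_once_sketch(struct kvm_vcpu *vcpu)
    {
            struct kvmppc_book3s_shadow_vcpu *svcpu;

            svcpu = svcpu_get(vcpu);
            kvmppc_copy_to_svcpu(svcpu, vcpu);      /* volatile regs -> shadow */
            svcpu_put(svcpu);

            /* ... enter the guest; an interrupt brings us back ... */

            svcpu = svcpu_get(vcpu);
            kvmppc_copy_from_svcpu(vcpu, svcpu);    /* shadow -> vcpu, plus fault info */
            svcpu_put(svcpu);
    }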
+static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
 {
        int r = 1; /* Indicate we want to get back into the guest */
 
@@ -100,44 +154,69 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
 }
 
 /************* MMU Notifiers *************/
+static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
+                            unsigned long end)
+{
+       long i;
+       struct kvm_vcpu *vcpu;
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+
+       slots = kvm_memslots(kvm);
+       kvm_for_each_memslot(memslot, slots) {
+               unsigned long hva_start, hva_end;
+               gfn_t gfn, gfn_end;
+
+               hva_start = max(start, memslot->userspace_addr);
+               hva_end = min(end, memslot->userspace_addr +
+                                       (memslot->npages << PAGE_SHIFT));
+               if (hva_start >= hva_end)
+                       continue;
+               /*
+                * {gfn(page) | page intersects with [hva_start, hva_end)} =
+                * {gfn, gfn+1, ..., gfn_end-1}.
+                */
+               gfn = hva_to_gfn_memslot(hva_start, memslot);
+               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+               kvm_for_each_vcpu(i, vcpu, kvm)
+                       kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT,
+                                             gfn_end << PAGE_SHIFT);
+       }
+}
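The hva-to-gfn clipping above is plain arithmetic; a runnable toy with invented numbers (a memslot of 256 pages at gfn 0x1000, mapped at userspace address 0x10000000):

    #include <stdio.h>

    int main(void)
    {
            unsigned long base_hva = 0x10000000UL, base_gfn = 0x1000UL;
            unsigned long start = 0x10003000UL, end = 0x10005000UL;

            /* same computation as hva_to_gfn_memslot() with PAGE_SHIFT 12 */
            unsigned long gfn     = base_gfn + ((start - base_hva) >> 12);
            unsigned long gfn_end = base_gfn + ((end + 0xfffUL - base_hva) >> 12);

            /* prints: flush guest-physical [0x1003000, 0x1005000) */
            printf("flush guest-physical [0x%lx, 0x%lx)\n",
                   gfn << 12, gfn_end << 12);
            return 0;
    }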
 
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva)
 {
        trace_kvm_unmap_hva(hva);
 
-       /*
-        * Flush all shadow tlb entries everywhere. This is slow, but
-        * we are 100% sure that we catch the to be unmapped page
-        */
-       kvm_flush_remote_tlbs(kvm);
+       do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
 
        return 0;
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
+                                 unsigned long end)
 {
-       /* kvm_unmap_hva flushes everything anyways */
-       kvm_unmap_hva(kvm, start);
+       do_kvm_unmap_hva(kvm, start, end);
 
        return 0;
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+static int kvm_age_hva_pr(struct kvm *kvm, unsigned long hva)
 {
        /* XXX could be more clever ;) */
        return 0;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva)
 {
        /* XXX could be more clever ;) */
        return 0;
 }
 
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
        /* The page will get remapped properly on its next fault */
-       kvm_unmap_hva(kvm, hva);
+       do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
 }
 
 /*****************************************/
@@ -159,7 +238,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
        vcpu->arch.shadow_msr = smsr;
 }
 
-void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 {
        ulong old_msr = vcpu->arch.shared->msr;
 
@@ -219,7 +298,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
                kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 }
 
-void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
 {
        u32 host_pvr;
 
@@ -256,6 +335,23 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
        if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
                to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
 
+       /*
+        * If they're asking for POWER6 or later, set the flag
+        * indicating that we can do multiple large page sizes
+        * and 1TB segments.
+        * Also set the flag indicating that tlbie takes the large
+        * page bit in the RB operand instead of in the instruction.
+        */
+       switch (PVR_VER(pvr)) {
+       case PVR_POWER6:
+       case PVR_POWER7:
+       case PVR_POWER7p:
+       case PVR_POWER8:
+               vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
+                       BOOK3S_HFLAG_NEW_TLBIE;
+               break;
+       }
+
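PVR_VER() is simply the top half of the PVR (the standard asm/reg.h definition, reproduced for reference), so for example a POWER7 part with a PVR such as 0x003f0201 matches the PVR_POWER7 case and picks up both hflags:

    #define PVR_VER(pvr)    (((pvr) >> 16) & 0xFFFF)        /* from asm/reg.h */

    /* PVR_VER(0x003f0201) == 0x003f == PVR_POWER7, so this vcpu gets
     * BOOK3S_HFLAG_MULTI_PGSIZE | BOOK3S_HFLAG_NEW_TLBIE */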
 #ifdef CONFIG_PPC_BOOK3S_32
        /* 32 bit Book3S always has 32 byte dcbz */
        vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
@@ -334,6 +430,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            ulong eaddr, int vec)
 {
        bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
+       bool iswrite = false;
        int r = RESUME_GUEST;
        int relocated;
        int page_found = 0;
@@ -344,10 +441,12 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        u64 vsid;
 
        relocated = data ? dr : ir;
+       if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE))
+               iswrite = true;
 
        /* Resolve real address if translation turned on */
        if (relocated) {
-               page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
+               page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite);
        } else {
                pte.may_execute = true;
                pte.may_read = true;
@@ -355,6 +454,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                pte.raddr = eaddr & KVM_PAM;
                pte.eaddr = eaddr;
                pte.vpage = eaddr >> 12;
+               pte.page_size = MMU_PAGE_64K;
        }
 
        switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
@@ -388,22 +488,18 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        if (page_found == -ENOENT) {
                /* Page not found in guest PTE entries */
-               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
                vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr = svcpu->fault_dsisr;
+               vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr;
                vcpu->arch.shared->msr |=
-                       (svcpu->shadow_srr1 & 0x00000000f8000000ULL);
-               svcpu_put(svcpu);
+                       vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL;
                kvmppc_book3s_queue_irqprio(vcpu, vec);
        } else if (page_found == -EPERM) {
                /* Storage protection */
-               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
                vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE;
+               vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE;
                vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
                vcpu->arch.shared->msr |=
-                       svcpu->shadow_srr1 & 0x00000000f8000000ULL;
-               svcpu_put(svcpu);
+                       vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL;
                kvmppc_book3s_queue_irqprio(vcpu, vec);
        } else if (page_found == -EINVAL) {
                /* Page not found in guest SLB */
@@ -411,12 +507,20 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
        } else if (!is_mmio &&
                   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
+               if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) {
+                       /*
+                        * There is already a host HPTE there, presumably
+                        * a read-only one for a page the guest thinks
+                        * is writable, so get rid of it first.
+                        */
+                       kvmppc_mmu_unmap_page(vcpu, &pte);
+               }
                /* The guest's PTE is not mapped yet. Map on the host */
-               kvmppc_mmu_map_page(vcpu, &pte);
+               kvmppc_mmu_map_page(vcpu, &pte, iswrite);
                if (data)
                        vcpu->stat.sp_storage++;
                else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-                       (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
+                        (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
                        kvmppc_patch_dcbz(vcpu, &pte);
        } else {
                /* MMIO */
@@ -619,13 +723,15 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
 
        if (lost_ext & MSR_FP)
                kvmppc_load_up_fpu();
+#ifdef CONFIG_ALTIVEC
        if (lost_ext & MSR_VEC)
                kvmppc_load_up_altivec();
+#endif
        current->thread.regs->msr |= lost_ext;
 }
 
-int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                       unsigned int exit_nr)
+int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                         unsigned int exit_nr)
 {
        int r = RESUME_HOST;
        int s;
@@ -643,25 +749,32 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
        switch (exit_nr) {
        case BOOK3S_INTERRUPT_INST_STORAGE:
        {
-               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-               ulong shadow_srr1 = svcpu->shadow_srr1;
+               ulong shadow_srr1 = vcpu->arch.shadow_srr1;
                vcpu->stat.pf_instruc++;
 
 #ifdef CONFIG_PPC_BOOK3S_32
                /* We mark segments as unused when invalidating them, so
                 * treat the respective fault as a segment fault. */
-               if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) {
-                       kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-                       r = RESUME_GUEST;
+               {
+                       struct kvmppc_book3s_shadow_vcpu *svcpu;
+                       u32 sr;
+
+                       svcpu = svcpu_get(vcpu);
+                       sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT];
                        svcpu_put(svcpu);
-                       break;
+                       if (sr == SR_INVALID) {
+                               kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+                               r = RESUME_GUEST;
+                               break;
+                       }
                }
 #endif
-               svcpu_put(svcpu);
 
                /* only care about PTEG not found errors, but leave NX alone */
                if (shadow_srr1 & 0x40000000) {
+                       int idx = srcu_read_lock(&vcpu->kvm->srcu);
                        r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
+                       srcu_read_unlock(&vcpu->kvm->srcu, idx);
                        vcpu->stat.sp_instruc++;
                } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
                          (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
@@ -682,25 +795,36 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
        case BOOK3S_INTERRUPT_DATA_STORAGE:
        {
                ulong dar = kvmppc_get_fault_dar(vcpu);
-               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-               u32 fault_dsisr = svcpu->fault_dsisr;
+               u32 fault_dsisr = vcpu->arch.fault_dsisr;
                vcpu->stat.pf_storage++;
 
 #ifdef CONFIG_PPC_BOOK3S_32
                /* We mark segments as unused when invalidating them, so
                 * treat the respective fault as a segment fault. */
-               if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) {
-                       kvmppc_mmu_map_segment(vcpu, dar);
-                       r = RESUME_GUEST;
+               {
+                       struct kvmppc_book3s_shadow_vcpu *svcpu;
+                       u32 sr;
+
+                       svcpu = svcpu_get(vcpu);
+                       sr = svcpu->sr[dar >> SID_SHIFT];
                        svcpu_put(svcpu);
-                       break;
+                       if (sr == SR_INVALID) {
+                               kvmppc_mmu_map_segment(vcpu, dar);
+                               r = RESUME_GUEST;
+                               break;
+                       }
                }
 #endif
-               svcpu_put(svcpu);
 
-               /* The only case we need to handle is missing shadow PTEs */
-               if (fault_dsisr & DSISR_NOHPTE) {
+               /*
+                * We need to handle missing shadow PTEs, and
+                * protection faults due to us mapping a page read-only
+                * when the guest thinks it is writable.
+                */
+               if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) {
+                       int idx = srcu_read_lock(&vcpu->kvm->srcu);
                        r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
+                       srcu_read_unlock(&vcpu->kvm->srcu, idx);
                } else {
                        vcpu->arch.shared->dar = dar;
                        vcpu->arch.shared->dsisr = fault_dsisr;
@@ -743,13 +867,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
        case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
        {
                enum emulation_result er;
-               struct kvmppc_book3s_shadow_vcpu *svcpu;
                ulong flags;
 
 program_interrupt:
-               svcpu = svcpu_get(vcpu);
-               flags = svcpu->shadow_srr1 & 0x1f0000ull;
-               svcpu_put(svcpu);
+               flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
 
                if (vcpu->arch.shared->msr & MSR_PR) {
 #ifdef EXIT_DEBUG
@@ -798,7 +919,7 @@ program_interrupt:
                        ulong cmd = kvmppc_get_gpr(vcpu, 3);
                        int i;
 
-#ifdef CONFIG_KVM_BOOK3S_64_PR
+#ifdef CONFIG_PPC_BOOK3S_64
                        if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
                                r = RESUME_GUEST;
                                break;
@@ -881,9 +1002,7 @@ program_interrupt:
                break;
        default:
        {
-               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
-               ulong shadow_srr1 = svcpu->shadow_srr1;
-               svcpu_put(svcpu);
+               ulong shadow_srr1 = vcpu->arch.shadow_srr1;
                /* Ugh - bork here! What did we get? */
                printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
                        exit_nr, kvmppc_get_pc(vcpu), shadow_srr1);
@@ -920,8 +1039,8 @@ program_interrupt:
        return r;
 }
 
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
+static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu,
+                                           struct kvm_sregs *sregs)
 {
        struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        int i;
@@ -947,13 +1066,13 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
+static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu,
+                                           struct kvm_sregs *sregs)
 {
        struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
        int i;
 
-       kvmppc_set_pvr(vcpu, sregs->pvr);
+       kvmppc_set_pvr_pr(vcpu, sregs->pvr);
 
        vcpu3s->sdr1 = sregs->u.s.sdr1;
        if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
@@ -983,7 +1102,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
+static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
+                                union kvmppc_one_reg *val)
 {
        int r = 0;
 
@@ -1012,7 +1132,8 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
        return r;
 }
 
-int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
+static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
+                                union kvmppc_one_reg *val)
 {
        int r = 0;
 
@@ -1042,28 +1163,30 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
        return r;
 }
 
-int kvmppc_core_check_processor_compat(void)
-{
-       return 0;
-}
-
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
+                                                  unsigned int id)
 {
        struct kvmppc_vcpu_book3s *vcpu_book3s;
        struct kvm_vcpu *vcpu;
        int err = -ENOMEM;
        unsigned long p;
 
-       vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
-       if (!vcpu_book3s)
+       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       if (!vcpu)
                goto out;
 
-       vcpu_book3s->shadow_vcpu =
-               kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
-       if (!vcpu_book3s->shadow_vcpu)
+       vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
+       if (!vcpu_book3s)
                goto free_vcpu;
+       vcpu->arch.book3s = vcpu_book3s;
+
+#ifdef CONFIG_KVM_BOOK3S_32
+       vcpu->arch.shadow_vcpu =
+               kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL);
+       if (!vcpu->arch.shadow_vcpu)
+               goto free_vcpu3s;
+#endif
 
-       vcpu = &vcpu_book3s->vcpu;
        err = kvm_vcpu_init(vcpu, kvm, id);
        if (err)
                goto free_shadow_vcpu;
@@ -1076,13 +1199,19 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        vcpu->arch.shared = (void *)(p + PAGE_SIZE - 4096);
 
 #ifdef CONFIG_PPC_BOOK3S_64
-       /* default to book3s_64 (970fx) */
+       /*
+        * Default to the same as the host if we're on sufficiently
+        * recent machine that we have 1TB segments;
+        * otherwise default to PPC970FX.
+        */
        vcpu->arch.pvr = 0x3C0301;
+       if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+               vcpu->arch.pvr = mfspr(SPRN_PVR);
 #else
        /* default to book3s_32 (750) */
        vcpu->arch.pvr = 0x84202;
 #endif
-       kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+       kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr);
        vcpu->arch.slb_nr = 64;
 
        vcpu->arch.shadow_msr = MSR_USER64;
@@ -1096,24 +1225,31 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 uninit_vcpu:
        kvm_vcpu_uninit(vcpu);
 free_shadow_vcpu:
-       kfree(vcpu_book3s->shadow_vcpu);
-free_vcpu:
+#ifdef CONFIG_KVM_BOOK3S_32
+       kfree(vcpu->arch.shadow_vcpu);
+free_vcpu3s:
+#endif
        vfree(vcpu_book3s);
+free_vcpu:
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
 out:
        return ERR_PTR(err);
 }
 
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 
        free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
        kvm_vcpu_uninit(vcpu);
-       kfree(vcpu_book3s->shadow_vcpu);
+#ifdef CONFIG_KVM_BOOK3S_32
+       kfree(vcpu->arch.shadow_vcpu);
+#endif
        vfree(vcpu_book3s);
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
-int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
        int ret;
        struct thread_fp_state fp;
@@ -1216,8 +1352,8 @@ out:
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-                                     struct kvm_dirty_log *log)
+static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
+                                        struct kvm_dirty_log *log)
 {
        struct kvm_memory_slot *memslot;
        struct kvm_vcpu *vcpu;
@@ -1252,67 +1388,100 @@ out:
        return r;
 }
 
-#ifdef CONFIG_PPC64
-int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
+static void kvmppc_core_flush_memslot_pr(struct kvm *kvm,
+                                        struct kvm_memory_slot *memslot)
 {
-       info->flags = KVM_PPC_1T_SEGMENTS;
-
-       /* SLB is always 64 entries */
-       info->slb_size = 64;
-
-       /* Standard 4k base page size segment */
-       info->sps[0].page_shift = 12;
-       info->sps[0].slb_enc = 0;
-       info->sps[0].enc[0].page_shift = 12;
-       info->sps[0].enc[0].pte_enc = 0;
-
-       /* Standard 16M large page size segment */
-       info->sps[1].page_shift = 24;
-       info->sps[1].slb_enc = SLB_VSID_L;
-       info->sps[1].enc[0].page_shift = 24;
-       info->sps[1].enc[0].pte_enc = 0;
+       return;
+}
 
+static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot,
+                                       struct kvm_userspace_memory_region *mem)
+{
        return 0;
 }
-#endif /* CONFIG_PPC64 */
 
-void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
-                             struct kvm_memory_slot *dont)
+static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_memory_slot *old)
 {
+       return;
 }
 
-int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
-                              unsigned long npages)
+static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free,
+                                       struct kvm_memory_slot *dont)
 {
-       return 0;
+       return;
 }
 
-int kvmppc_core_prepare_memory_region(struct kvm *kvm,
-                                     struct kvm_memory_slot *memslot,
-                                     struct kvm_userspace_memory_region *mem)
+static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot,
+                                        unsigned long npages)
 {
        return 0;
 }
 
-void kvmppc_core_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
-                               const struct kvm_memory_slot *old)
+
+#ifdef CONFIG_PPC64
+static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm,
+                                        struct kvm_ppc_smmu_info *info)
 {
-}
+       long int i;
+       struct kvm_vcpu *vcpu;
+
+       info->flags = 0;
+
+       /* SLB is always 64 entries */
+       info->slb_size = 64;
+
+       /* Standard 4k base page size segment */
+       info->sps[0].page_shift = 12;
+       info->sps[0].slb_enc = 0;
+       info->sps[0].enc[0].page_shift = 12;
+       info->sps[0].enc[0].pte_enc = 0;
+
+       /*
+        * 64k large page size.
+        * We only want to put this in if the CPUs we're emulating
+        * support it, but unfortunately we don't have a vcpu easily
+        * to hand here to test.  Just pick the first vcpu, and if
+        * that doesn't exist yet, report the minimum capability,
+        * i.e., no 64k pages.
+        * 1T segment support goes along with 64k pages.
+        */
+       i = 1;
+       vcpu = kvm_get_vcpu(kvm, 0);
+       if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) {
+               info->flags = KVM_PPC_1T_SEGMENTS;
+               info->sps[i].page_shift = 16;
+               info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01;
+               info->sps[i].enc[0].page_shift = 16;
+               info->sps[i].enc[0].pte_enc = 1;
+               ++i;
+       }
+
+       /* Standard 16M large page size segment */
+       info->sps[i].page_shift = 24;
+       info->sps[i].slb_enc = SLB_VSID_L;
+       info->sps[i].enc[0].page_shift = 24;
+       info->sps[i].enc[0].pte_enc = 0;
 
-void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+       return 0;
+}
+#else
+static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm,
+                                        struct kvm_ppc_smmu_info *info)
 {
+       /* We should not get called */
+       BUG();
 }
+#endif /* CONFIG_PPC64 */
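For a guest that did get BOOK3S_HFLAG_MULTI_PGSIZE, the reported structure thus ends up roughly as follows (a sketch of the resulting values, not code from the tree):

    /* kvm_ppc_smmu_info for a POWER7-class PR guest (sketch):
     *   flags    = KVM_PPC_1T_SEGMENTS
     *   slb_size = 64
     *   sps[0]: page_shift 12, slb_enc 0                            (4k)
     *   sps[1]: page_shift 16, slb_enc SLB_VSID_L|SLB_VSID_LP_01,
     *           enc[0].pte_enc 1                                    (64k)
     *   sps[2]: page_shift 24, slb_enc SLB_VSID_L                   (16M)
     * with no vcpu created yet, the 64k entry is omitted, the 16M
     * entry sits at sps[1], and flags stays 0. */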
 
 static unsigned int kvm_global_user_count = 0;
 static DEFINE_SPINLOCK(kvm_global_user_count_lock);
 
-int kvmppc_core_init_vm(struct kvm *kvm)
+static int kvmppc_core_init_vm_pr(struct kvm *kvm)
 {
-#ifdef CONFIG_PPC64
-       INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
-       INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
-#endif
+       mutex_init(&kvm->arch.hpt_mutex);
 
        if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
                spin_lock(&kvm_global_user_count_lock);
@@ -1323,7 +1492,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
        return 0;
 }
 
-void kvmppc_core_destroy_vm(struct kvm *kvm)
+static void kvmppc_core_destroy_vm_pr(struct kvm *kvm)
 {
 #ifdef CONFIG_PPC64
        WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
@@ -1338,26 +1507,81 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
        }
 }
 
-static int kvmppc_book3s_init(void)
+static int kvmppc_core_check_processor_compat_pr(void)
 {
-       int r;
+       /* we are always compatible */
+       return 0;
+}
 
-       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-                    THIS_MODULE);
+static long kvm_arch_vm_ioctl_pr(struct file *filp,
+                                unsigned int ioctl, unsigned long arg)
+{
+       return -ENOTTY;
+}
 
-       if (r)
+static struct kvmppc_ops kvm_ops_pr = {
+       .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr,
+       .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr,
+       .get_one_reg = kvmppc_get_one_reg_pr,
+       .set_one_reg = kvmppc_set_one_reg_pr,
+       .vcpu_load   = kvmppc_core_vcpu_load_pr,
+       .vcpu_put    = kvmppc_core_vcpu_put_pr,
+       .set_msr     = kvmppc_set_msr_pr,
+       .vcpu_run    = kvmppc_vcpu_run_pr,
+       .vcpu_create = kvmppc_core_vcpu_create_pr,
+       .vcpu_free   = kvmppc_core_vcpu_free_pr,
+       .check_requests = kvmppc_core_check_requests_pr,
+       .get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr,
+       .flush_memslot = kvmppc_core_flush_memslot_pr,
+       .prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
+       .commit_memory_region = kvmppc_core_commit_memory_region_pr,
+       .unmap_hva = kvm_unmap_hva_pr,
+       .unmap_hva_range = kvm_unmap_hva_range_pr,
+       .age_hva  = kvm_age_hva_pr,
+       .test_age_hva = kvm_test_age_hva_pr,
+       .set_spte_hva = kvm_set_spte_hva_pr,
+       .mmu_destroy  = kvmppc_mmu_destroy_pr,
+       .free_memslot = kvmppc_core_free_memslot_pr,
+       .create_memslot = kvmppc_core_create_memslot_pr,
+       .init_vm = kvmppc_core_init_vm_pr,
+       .destroy_vm = kvmppc_core_destroy_vm_pr,
+       .get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr,
+       .emulate_op = kvmppc_core_emulate_op_pr,
+       .emulate_mtspr = kvmppc_core_emulate_mtspr_pr,
+       .emulate_mfspr = kvmppc_core_emulate_mfspr_pr,
+       .fast_vcpu_kick = kvm_vcpu_kick,
+       .arch_vm_ioctl  = kvm_arch_vm_ioctl_pr,
+};
+
+
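This ops table is the centrepiece of the HV/PR split: each former global kvmppc_core_* entry point becomes a per-VM indirection. The call shape, visible in the booke.c hunks further down as well, is along these lines (a sketch of the wrapper pattern, not an exact call site):

    static inline int kvmppc_core_init_vm(struct kvm *kvm)
    {
            /* backend (HV or PR) was chosen when the VM was created */
            return kvm->arch.kvm_ops->init_vm(kvm);
    }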
+int kvmppc_book3s_init_pr(void)
+{
+       int r;
+
+       r = kvmppc_core_check_processor_compat_pr();
+       if (r < 0)
                return r;
 
-       r = kvmppc_mmu_hpte_sysinit();
+       kvm_ops_pr.owner = THIS_MODULE;
+       kvmppc_pr_ops = &kvm_ops_pr;
 
+       r = kvmppc_mmu_hpte_sysinit();
        return r;
 }
 
-static void kvmppc_book3s_exit(void)
+void kvmppc_book3s_exit_pr(void)
 {
+       kvmppc_pr_ops = NULL;
        kvmppc_mmu_hpte_sysexit();
-       kvm_exit();
 }
 
-module_init(kvmppc_book3s_init);
-module_exit(kvmppc_book3s_exit);
+/*
+ * We only support separate modules for book3s 64
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+
+module_init(kvmppc_book3s_init_pr);
+module_exit(kvmppc_book3s_exit_pr);
+
+MODULE_LICENSE("GPL");
+#endif
arch/powerpc/kvm/book3s_pr_papr.c
index da0e0bc..5efa97b 100644 (file)
@@ -21,6 +21,8 @@
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 
+#define HPTE_SIZE      16              /* bytes per HPT entry */
+
 static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index)
 {
        struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
@@ -40,32 +42,41 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
        long pte_index = kvmppc_get_gpr(vcpu, 5);
        unsigned long pteg[2 * 8];
        unsigned long pteg_addr, i, *hpte;
+       long int ret;
 
+       i = pte_index & 7;
        pte_index &= ~7UL;
        pteg_addr = get_pteg_addr(vcpu, pte_index);
 
+       mutex_lock(&vcpu->kvm->arch.hpt_mutex);
        copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg));
        hpte = pteg;
 
+       ret = H_PTEG_FULL;
        if (likely((flags & H_EXACT) == 0)) {
-               pte_index &= ~7UL;
                for (i = 0; ; ++i) {
                        if (i == 8)
-                               return H_PTEG_FULL;
+                               goto done;
                        if ((*hpte & HPTE_V_VALID) == 0)
                                break;
                        hpte += 2;
                }
        } else {
-               i = kvmppc_get_gpr(vcpu, 5) & 7UL;
                hpte += i * 2;
+               if (*hpte & HPTE_V_VALID)
+                       goto done;
        }
 
        hpte[0] = kvmppc_get_gpr(vcpu, 6);
        hpte[1] = kvmppc_get_gpr(vcpu, 7);
-       copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg));
-       kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+       pteg_addr += i * HPTE_SIZE;
+       copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE);
        kvmppc_set_gpr(vcpu, 4, pte_index | i);
+       ret = H_SUCCESS;
+
+ done:
+       mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+       kvmppc_set_gpr(vcpu, 3, ret);
 
        return EMULATE_DONE;
 }
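With the guest HPT living in userspace memory and several vcpus able to issue hypercalls concurrently, every PR HPT hypercall now does its read-modify-write of the PTEG under kvm->arch.hpt_mutex and funnels its status through a single exit path. The common skeleton (a sketch; each hcall fills in its own validation):

    static int h_pr_hcall_skeleton(struct kvm_vcpu *vcpu)
    {
            long ret = H_NOT_FOUND;                 /* pessimistic default */

            mutex_lock(&vcpu->kvm->arch.hpt_mutex);
            /* copy_from_user() the PTEG, validate it, maybe write it back;
             * failure paths jump straight to the unlock below */
            ret = H_SUCCESS;

            mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
            kvmppc_set_gpr(vcpu, 3, ret);           /* hcall status in r3 */
            return EMULATE_DONE;
    }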
@@ -77,26 +88,31 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
        unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
        unsigned long v = 0, pteg, rb;
        unsigned long pte[2];
+       long int ret;
 
        pteg = get_pteg_addr(vcpu, pte_index);
+       mutex_lock(&vcpu->kvm->arch.hpt_mutex);
        copy_from_user(pte, (void __user *)pteg, sizeof(pte));
 
+       ret = H_NOT_FOUND;
        if ((pte[0] & HPTE_V_VALID) == 0 ||
            ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) ||
-           ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) {
-               kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
-               return EMULATE_DONE;
-       }
+           ((flags & H_ANDCOND) && (pte[0] & avpn) != 0))
+               goto done;
 
        copy_to_user((void __user *)pteg, &v, sizeof(v));
 
        rb = compute_tlbie_rb(pte[0], pte[1], pte_index);
        vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
 
-       kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+       ret = H_SUCCESS;
        kvmppc_set_gpr(vcpu, 4, pte[0]);
        kvmppc_set_gpr(vcpu, 5, pte[1]);
 
+ done:
+       mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+       kvmppc_set_gpr(vcpu, 3, ret);
+
        return EMULATE_DONE;
 }
 
@@ -124,6 +140,7 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu)
        int paramnr = 4;
        int ret = H_SUCCESS;
 
+       mutex_lock(&vcpu->kvm->arch.hpt_mutex);
        for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) {
                unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i));
                unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1);
@@ -172,6 +189,7 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu)
                }
                kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh);
        }
+       mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
        kvmppc_set_gpr(vcpu, 3, ret);
 
        return EMULATE_DONE;
@@ -184,15 +202,16 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
        unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
        unsigned long rb, pteg, r, v;
        unsigned long pte[2];
+       long int ret;
 
        pteg = get_pteg_addr(vcpu, pte_index);
+       mutex_lock(&vcpu->kvm->arch.hpt_mutex);
        copy_from_user(pte, (void __user *)pteg, sizeof(pte));
 
+       ret = H_NOT_FOUND;
        if ((pte[0] & HPTE_V_VALID) == 0 ||
-           ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) {
-               kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
-               return EMULATE_DONE;
-       }
+           ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn))
+               goto done;
 
        v = pte[0];
        r = pte[1];
@@ -207,8 +226,11 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
        rb = compute_tlbie_rb(v, r, pte_index);
        vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
        copy_to_user((void __user *)pteg, pte, sizeof(pte));
+       ret = H_SUCCESS;
 
-       kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+ done:
+       mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+       kvmppc_set_gpr(vcpu, 3, ret);
 
        return EMULATE_DONE;
 }
arch/powerpc/kvm/book3s_rmhandlers.S
index 8f7633e..a38c4c9 100644 (file)
 
 #define FUNC(name)             GLUE(.,name)
 
-       .globl  kvmppc_skip_interrupt
-kvmppc_skip_interrupt:
-       /*
-        * Here all GPRs are unchanged from when the interrupt happened
-        * except for r13, which is saved in SPRG_SCRATCH0.
-        */
-       mfspr   r13, SPRN_SRR0
-       addi    r13, r13, 4
-       mtspr   SPRN_SRR0, r13
-       GET_SCRATCH0(r13)
-       rfid
-       b       .
-
-       .globl  kvmppc_skip_Hinterrupt
-kvmppc_skip_Hinterrupt:
-       /*
-        * Here all GPRs are unchanged from when the interrupt happened
-        * except for r13, which is saved in SPRG_SCRATCH0.
-        */
-       mfspr   r13, SPRN_HSRR0
-       addi    r13, r13, 4
-       mtspr   SPRN_HSRR0, r13
-       GET_SCRATCH0(r13)
-       hrfid
-       b       .
-
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
 #define FUNC(name)             name
@@ -179,11 +153,15 @@ _GLOBAL(kvmppc_entry_trampoline)
 
        li      r6, MSR_IR | MSR_DR
        andc    r6, r5, r6      /* Clear DR and IR in MSR value */
+#ifdef CONFIG_PPC_BOOK3S_32
        /*
         * Set EE in HOST_MSR so that it's enabled when we get into our
-        * C exit handler function
+        * C exit handler function.  On 64-bit we delay enabling
+        * interrupts until we have finished transferring stuff
+        * to or from the PACA.
         */
        ori     r5, r5, MSR_EE
+#endif
        mtsrr0  r7
        mtsrr1  r6
        RFI
arch/powerpc/kvm/book3s_rtas.c
index 3219ba8..cf95cde 100644 (file)
@@ -260,6 +260,7 @@ fail:
         */
        return rc;
 }
+EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall);
 
 void kvmppc_rtas_tokens_free(struct kvm *kvm)
 {
arch/powerpc/kvm/book3s_segment.S
index 1abe478..bc50c97 100644 (file)
@@ -161,8 +161,8 @@ kvmppc_handler_trampoline_enter_end:
 .global kvmppc_handler_trampoline_exit
 kvmppc_handler_trampoline_exit:
 
-.global kvmppc_interrupt
-kvmppc_interrupt:
+.global kvmppc_interrupt_pr
+kvmppc_interrupt_pr:
 
        /* Register usage at this point:
         *
arch/powerpc/kvm/book3s_xics.c
index a3a5cb8..02a17dc 100644 (file)
@@ -818,7 +818,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
        }
 
        /* Check for real mode returning too hard */
-       if (xics->real_mode)
+       if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm))
                return kvmppc_xics_rm_complete(vcpu, req);
 
        switch (req) {
@@ -840,6 +840,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 
        return rc;
 }
+EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
 
 
 /* -- Initialisation code etc. -- */
@@ -1250,13 +1251,13 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 
        xics_debugfs_init(xics);
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        if (cpu_has_feature(CPU_FTR_ARCH_206)) {
                /* Enable real mode support */
                xics->real_mode = ENABLE_REALMODE;
                xics->real_mode_dbg = DEBUG_REALMODE;
        }
-#endif /* CONFIG_KVM_BOOK3S_64_HV */
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
        return 0;
 }
arch/powerpc/kvm/booke.c
index 5133199..53e65a2 100644 (file)
@@ -40,7 +40,9 @@
 
 #include "timing.h"
 #include "booke.h"
-#include "trace.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_booke.h"
 
 unsigned long kvmppc_booke_handlers;
 
@@ -133,6 +135,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu)
 #endif
 }
 
+static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu)
+{
+       /* Synchronize guest's desire to get debug interrupts into shadow MSR */
+#ifndef CONFIG_KVM_BOOKE_HV
+       vcpu->arch.shadow_msr &= ~MSR_DE;
+       vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE;
+#endif
+
+       /* Force enable debug interrupts when user space wants to debug */
+       if (vcpu->guest_debug) {
+#ifdef CONFIG_KVM_BOOKE_HV
+               /*
+                * Since there is no shadow MSR, sync MSR_DE into the guest
+                * visible MSR.
+                */
+               vcpu->arch.shared->msr |= MSR_DE;
+#else
+               vcpu->arch.shadow_msr |= MSR_DE;
+               vcpu->arch.shared->msr &= ~MSR_DE;
+#endif
+       }
+}
+
 /*
  * Helper function for "full" MSR writes.  No need to call this if only
  * EE/CE/ME/DE/RI are changing.
@@ -150,6 +175,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
        kvmppc_mmu_msr_notify(vcpu, old_msr);
        kvmppc_vcpu_sync_spe(vcpu);
        kvmppc_vcpu_sync_fpu(vcpu);
+       kvmppc_vcpu_sync_debug(vcpu);
 }
 
 static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
@@ -655,6 +681,7 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
        int ret, s;
+       struct thread_struct thread;
 #ifdef CONFIG_PPC_FPU
        struct thread_fp_state fp;
        int fpexc_mode;
@@ -695,6 +722,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        kvmppc_load_guest_fp(vcpu);
 #endif
 
+       /* Switch to guest debug context */
+       thread.debug = vcpu->arch.shadow_dbg_reg;
+       switch_booke_debug_regs(&thread);
+       thread.debug = current->thread.debug;
+       current->thread.debug = vcpu->arch.shadow_dbg_reg;
+
        kvmppc_fix_ee_before_entry();
 
        ret = __kvmppc_vcpu_run(kvm_run, vcpu);
@@ -702,6 +735,10 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        /* No need for kvm_guest_exit. It's done in handle_exit.
           We also get here with interrupts enabled. */
 
+       /* Switch back to user space debug context */
+       switch_booke_debug_regs(&thread);
+       current->thread.debug = thread.debug;
+
 #ifdef CONFIG_PPC_FPU
        kvmppc_save_guest_fp(vcpu);
 
@@ -757,6 +794,30 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
        }
 }
 
+static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+       struct debug_reg *dbg_reg = &(vcpu->arch.shadow_dbg_reg);
+       u32 dbsr = vcpu->arch.dbsr;
+
+       run->debug.arch.status = 0;
+       run->debug.arch.address = vcpu->arch.pc;
+
+       if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) {
+               run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT;
+       } else {
+               if (dbsr & (DBSR_DAC1W | DBSR_DAC2W))
+                       run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE;
+               else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R))
+                       run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ;
+               if (dbsr & (DBSR_DAC1R | DBSR_DAC1W))
+                       run->debug.arch.address = dbg_reg->dac1;
+               else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W))
+                       run->debug.arch.address = dbg_reg->dac2;
+       }
+
+       return RESUME_HOST;
+}
+
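On the other side of this exit, a debugger using KVM_SET_GUEST_DEBUG would decode the fields like so (a userspace sketch using the uapi names from asm/kvm.h):

    #include <stdio.h>
    #include <linux/kvm.h>

    static void handle_debug_exit(struct kvm_run *run)
    {
            if (run->exit_reason != KVM_EXIT_DEBUG)
                    return;
            if (run->debug.arch.status & KVMPPC_DEBUG_BREAKPOINT)
                    printf("breakpoint at 0x%llx\n",
                           (unsigned long long)run->debug.arch.address);
            else if (run->debug.arch.status & KVMPPC_DEBUG_WATCH_WRITE)
                    printf("write watchpoint, data addr 0x%llx\n",
                           (unsigned long long)run->debug.arch.address);
            else if (run->debug.arch.status & KVMPPC_DEBUG_WATCH_READ)
                    printf("read watchpoint, data addr 0x%llx\n",
                           (unsigned long long)run->debug.arch.address);
    }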
 static void kvmppc_fill_pt_regs(struct pt_regs *regs)
 {
        ulong r1, ip, msr, lr;
@@ -817,6 +878,11 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
        case BOOKE_INTERRUPT_CRITICAL:
                unknown_exception(&regs);
                break;
+       case BOOKE_INTERRUPT_DEBUG:
+               /* Save DBSR before preemption is enabled */
+               vcpu->arch.dbsr = mfspr(SPRN_DBSR);
+               kvmppc_clear_dbsr();
+               break;
        }
 }
 
@@ -1134,18 +1200,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
        }
 
        case BOOKE_INTERRUPT_DEBUG: {
-               u32 dbsr;
-
-               vcpu->arch.pc = mfspr(SPRN_CSRR0);
-
-               /* clear IAC events in DBSR register */
-               dbsr = mfspr(SPRN_DBSR);
-               dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4;
-               mtspr(SPRN_DBSR, dbsr);
-
-               run->exit_reason = KVM_EXIT_DEBUG;
+               r = kvmppc_handle_debug(run, vcpu);
+               if (r == RESUME_HOST)
+                       run->exit_reason = KVM_EXIT_DEBUG;
                kvmppc_account_exit(vcpu, DEBUG_EXITS);
-               r = RESUME_HOST;
                break;
        }
 
@@ -1196,7 +1254,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        kvmppc_set_msr(vcpu, 0);
 
 #ifndef CONFIG_KVM_BOOKE_HV
-       vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
+       vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS;
        vcpu->arch.shadow_pid = 1;
        vcpu->arch.shared->msr = 0;
 #endif
@@ -1358,7 +1416,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
        sregs->u.e.features |= KVM_SREGS_E_IVOR;
 
@@ -1378,6 +1436,7 @@ void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
        sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
        sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+       return 0;
 }
 
 int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
@@ -1412,8 +1471,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
        get_sregs_base(vcpu, sregs);
        get_sregs_arch206(vcpu, sregs);
-       kvmppc_core_get_sregs(vcpu, sregs);
-       return 0;
+       return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@ -1432,7 +1490,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        if (ret < 0)
                return ret;
 
-       return kvmppc_core_set_sregs(vcpu, sregs);
+       return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
 }
 
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
@@ -1440,7 +1498,6 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
        int r = 0;
        union kvmppc_one_reg val;
        int size;
-       long int i;
 
        size = one_reg_size(reg->id);
        if (size > sizeof(val))
@@ -1448,16 +1505,24 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 
        switch (reg->id) {
        case KVM_REG_PPC_IAC1:
+               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac1);
+               break;
        case KVM_REG_PPC_IAC2:
+               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac2);
+               break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
        case KVM_REG_PPC_IAC3:
+               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac3);
+               break;
        case KVM_REG_PPC_IAC4:
-               i = reg->id - KVM_REG_PPC_IAC1;
-               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]);
+               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac4);
                break;
+#endif
        case KVM_REG_PPC_DAC1:
+               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac1);
+               break;
        case KVM_REG_PPC_DAC2:
-               i = reg->id - KVM_REG_PPC_DAC1;
-               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]);
+               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac2);
                break;
        case KVM_REG_PPC_EPR: {
                u32 epr = get_guest_epr(vcpu);
@@ -1476,10 +1541,13 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                val = get_reg_val(reg->id, vcpu->arch.tsr);
                break;
        case KVM_REG_PPC_DEBUG_INST:
-               val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV);
+               val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV_DEBUG);
+               break;
+       case KVM_REG_PPC_VRSAVE:
+               val = get_reg_val(reg->id, vcpu->arch.vrsave);
                break;
        default:
-               r = kvmppc_get_one_reg(vcpu, reg->id, &val);
+               r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val);
                break;
        }
 
@@ -1497,7 +1565,6 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
        int r = 0;
        union kvmppc_one_reg val;
        int size;
-       long int i;
 
        size = one_reg_size(reg->id);
        if (size > sizeof(val))
@@ -1508,16 +1575,24 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 
        switch (reg->id) {
        case KVM_REG_PPC_IAC1:
+               vcpu->arch.dbg_reg.iac1 = set_reg_val(reg->id, val);
+               break;
        case KVM_REG_PPC_IAC2:
+               vcpu->arch.dbg_reg.iac2 = set_reg_val(reg->id, val);
+               break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
        case KVM_REG_PPC_IAC3:
+               vcpu->arch.dbg_reg.iac3 = set_reg_val(reg->id, val);
+               break;
        case KVM_REG_PPC_IAC4:
-               i = reg->id - KVM_REG_PPC_IAC1;
-               vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val);
+               vcpu->arch.dbg_reg.iac4 = set_reg_val(reg->id, val);
                break;
+#endif
        case KVM_REG_PPC_DAC1:
+               vcpu->arch.dbg_reg.dac1 = set_reg_val(reg->id, val);
+               break;
        case KVM_REG_PPC_DAC2:
-               i = reg->id - KVM_REG_PPC_DAC1;
-               vcpu->arch.dbg_reg.dac[i] = set_reg_val(reg->id, val);
+               vcpu->arch.dbg_reg.dac2 = set_reg_val(reg->id, val);
                break;
        case KVM_REG_PPC_EPR: {
                u32 new_epr = set_reg_val(reg->id, val);
@@ -1551,20 +1626,17 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                kvmppc_set_tcr(vcpu, tcr);
                break;
        }
+       case KVM_REG_PPC_VRSAVE:
+               vcpu->arch.vrsave = set_reg_val(reg->id, val);
+               break;
        default:
-               r = kvmppc_set_one_reg(vcpu, reg->id, &val);
+               r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val);
                break;
        }
 
        return r;
 }
 
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
-                                        struct kvm_guest_debug *dbg)
-{
-       return -EINVAL;
-}
-
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        return -ENOTSUPP;
@@ -1589,12 +1661,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
        return -ENOTSUPP;
 }
 
-void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                              struct kvm_memory_slot *dont)
 {
 }
 
-int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
                               unsigned long npages)
 {
        return 0;
@@ -1670,6 +1742,157 @@ void kvmppc_decrementer_func(unsigned long data)
        kvmppc_set_tsr_bits(vcpu, TSR_DIS);
 }
 
+static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg,
+                                      uint64_t addr, int index)
+{
+       switch (index) {
+       case 0:
+               dbg_reg->dbcr0 |= DBCR0_IAC1;
+               dbg_reg->iac1 = addr;
+               break;
+       case 1:
+               dbg_reg->dbcr0 |= DBCR0_IAC2;
+               dbg_reg->iac2 = addr;
+               break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+       case 2:
+               dbg_reg->dbcr0 |= DBCR0_IAC3;
+               dbg_reg->iac3 = addr;
+               break;
+       case 3:
+               dbg_reg->dbcr0 |= DBCR0_IAC4;
+               dbg_reg->iac4 = addr;
+               break;
+#endif
+       default:
+               return -EINVAL;
+       }
+
+       dbg_reg->dbcr0 |= DBCR0_IDM;
+       return 0;
+}
+
+static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr,
+                                      int type, int index)
+{
+       switch (index) {
+       case 0:
+               if (type & KVMPPC_DEBUG_WATCH_READ)
+                       dbg_reg->dbcr0 |= DBCR0_DAC1R;
+               if (type & KVMPPC_DEBUG_WATCH_WRITE)
+                       dbg_reg->dbcr0 |= DBCR0_DAC1W;
+               dbg_reg->dac1 = addr;
+               break;
+       case 1:
+               if (type & KVMPPC_DEBUG_WATCH_READ)
+                       dbg_reg->dbcr0 |= DBCR0_DAC2R;
+               if (type & KVMPPC_DEBUG_WATCH_WRITE)
+                       dbg_reg->dbcr0 |= DBCR0_DAC2W;
+               dbg_reg->dac2 = addr;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       dbg_reg->dbcr0 |= DBCR0_IDM;
+       return 0;
+}
+
+void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap, bool set)
+{
+       /* XXX: Add similar MSR protection for BookE-PR */
+#ifdef CONFIG_KVM_BOOKE_HV
+       BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP));
+       if (set) {
+               if (prot_bitmap & MSR_UCLE)
+                       vcpu->arch.shadow_msrp |= MSRP_UCLEP;
+               if (prot_bitmap & MSR_DE)
+                       vcpu->arch.shadow_msrp |= MSRP_DEP;
+               if (prot_bitmap & MSR_PMM)
+                       vcpu->arch.shadow_msrp |= MSRP_PMMP;
+       } else {
+               if (prot_bitmap & MSR_UCLE)
+                       vcpu->arch.shadow_msrp &= ~MSRP_UCLEP;
+               if (prot_bitmap & MSR_DE)
+                       vcpu->arch.shadow_msrp &= ~MSRP_DEP;
+               if (prot_bitmap & MSR_PMM)
+                       vcpu->arch.shadow_msrp &= ~MSRP_PMMP;
+       }
+#endif
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                        struct kvm_guest_debug *dbg)
+{
+       struct debug_reg *dbg_reg;
+       int n, b = 0, w = 0;
+
+       if (!(dbg->control & KVM_GUESTDBG_ENABLE)) {
+               vcpu->arch.shadow_dbg_reg.dbcr0 = 0;
+               vcpu->guest_debug = 0;
+               kvm_guest_protect_msr(vcpu, MSR_DE, false);
+               return 0;
+       }
+
+       kvm_guest_protect_msr(vcpu, MSR_DE, true);
+       vcpu->guest_debug = dbg->control;
+       vcpu->arch.shadow_dbg_reg.dbcr0 = 0;
+       /* Set DBCR0_EDM in guest visible DBCR0 register. */
+       vcpu->arch.dbg_reg.dbcr0 = DBCR0_EDM;
+
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+               vcpu->arch.shadow_dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC;
+
+       /* Code below handles only HW breakpoints */
+       dbg_reg = &(vcpu->arch.shadow_dbg_reg);
+
+#ifdef CONFIG_KVM_BOOKE_HV
+       /*
+        * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1
+        * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0
+        */
+       dbg_reg->dbcr1 = 0;
+       dbg_reg->dbcr2 = 0;
+#else
+       /*
+        * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1
+        * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR
+        * is set.
+        */
+       dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US |
+                         DBCR1_IAC4US;
+       dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US;
+#endif
+
+       if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+               return 0;
+
+       for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) {
+               uint64_t addr = dbg->arch.bp[n].addr;
+               uint32_t type = dbg->arch.bp[n].type;
+
+               if (type == KVMPPC_DEBUG_NONE)
+                       continue;
+
+               if (type & ~(KVMPPC_DEBUG_WATCH_READ |
+                            KVMPPC_DEBUG_WATCH_WRITE |
+                            KVMPPC_DEBUG_BREAKPOINT))
+                       return -EINVAL;
+
+               if (type & KVMPPC_DEBUG_BREAKPOINT) {
+                       /* Setting H/W breakpoint */
+                       if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++))
+                               return -EINVAL;
+               } else {
+                       /* Setting H/W watchpoint */
+                       if (kvmppc_booke_add_watchpoint(dbg_reg, addr,
+                                                       type, w++))
+                               return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        vcpu->cpu = smp_processor_id();
@@ -1680,6 +1903,44 @@ void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
 {
        current->thread.kvm_vcpu = NULL;
        vcpu->cpu = -1;
+
+       /* Clear pending debug event in DBSR */
+       kvmppc_clear_dbsr();
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+       vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+       return kvm->arch.kvm_ops->init_vm(kvm);
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+       return kvm->arch.kvm_ops->vcpu_create(kvm, id);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+       vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+       kvm->arch.kvm_ops->destroy_vm(kvm);
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);
 }
 
 int __init kvmppc_booke_init(void)
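The one_reg cases above expose each IAC/DAC register individually, while the new kvm_arch_vcpu_ioctl_set_guest_debug() consumes a whole bp[] array at once, dispatching every slot to kvmppc_booke_add_breakpoint() or kvmppc_booke_add_watchpoint(). A minimal userspace sketch of driving that ioctl; the exact kvm_guest_debug_arch layout is assumed from the fields the handler reads, not spelled out in this hunk:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Hedged sketch: arm one HW breakpoint and one write watchpoint via
     * KVM_SET_GUEST_DEBUG; the bp[] layout is assumed from the handler. */
    static int arm_debug(int vcpu_fd, uint64_t bp_addr, uint64_t wp_addr)
    {
            struct kvm_guest_debug dbg = { 0 };

            dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
            dbg.arch.bp[0].addr = bp_addr;
            dbg.arch.bp[0].type = KVMPPC_DEBUG_BREAKPOINT;  /* fills IAC1 */
            dbg.arch.bp[1].addr = wp_addr;
            dbg.arch.bp[1].type = KVMPPC_DEBUG_WATCH_WRITE; /* fills DAC1 */

            return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
    }
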
index 5fd1ba6..09bfd9b 100644 (file)
@@ -99,6 +99,22 @@ enum int_class {
 
 void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type);
 
+extern void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                     unsigned int inst, int *advance);
+extern int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn,
+                                        ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn,
+                                        ulong *spr_val);
+extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_emulate_op_e500(struct kvm_run *run,
+                                      struct kvm_vcpu *vcpu,
+                                      unsigned int inst, int *advance);
+extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn,
+                                         ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn,
+                                         ulong *spr_val);
+
 /*
  * Load up guest vcpu FP state if it's needed.
  * It also set the MSR_FP in thread so that host know
@@ -129,4 +153,9 @@ static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu)
                giveup_fpu(current);
 #endif
 }
+
+static inline void kvmppc_clear_dbsr(void)
+{
+       mtspr(SPRN_DBSR, mfspr(SPRN_DBSR));
+}
 #endif /* __KVM_BOOKE_H__ */
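kvmppc_clear_dbsr() relies on DBSR being write-one-to-clear: writing back exactly the bits just read acknowledges every pending debug event without touching events that assert in between. A hedged sketch of the same idiom against a hypothetical memory-mapped W1C status register:

    #include <stdint.h>

    /* 'status' is a hypothetical W1C register, not a kernel interface. */
    static inline void ack_pending_events(volatile uint32_t *status)
    {
            uint32_t pending = *status; /* snapshot the asserted bits      */
            *status = pending;          /* writing 1s clears exactly those */
    }
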
index ce6b73c..497b142 100644 (file)
@@ -305,7 +305,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 {
 }
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu)
 {
        kvmppc_booke_vcpu_load(vcpu, cpu);
 
@@ -313,7 +313,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
 }
 
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_SPE
        if (vcpu->arch.shadow_msr & MSR_SPE)
@@ -367,7 +367,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu,
+                                     struct kvm_sregs *sregs)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
@@ -388,9 +389,11 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 
        kvmppc_get_sregs_ivor(vcpu, sregs);
        kvmppc_get_sregs_e500_tlb(vcpu, sregs);
+       return 0;
 }
 
-int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu,
+                                     struct kvm_sregs *sregs)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
        int ret;
@@ -425,21 +428,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
-int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
-                       union kvmppc_one_reg *val)
+static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id,
+                                  union kvmppc_one_reg *val)
 {
        int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
        return r;
 }
 
-int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
-                      union kvmppc_one_reg *val)
+static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id,
+                                  union kvmppc_one_reg *val)
 {
        int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
        return r;
 }
 
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm,
+                                                    unsigned int id)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500;
        struct kvm_vcpu *vcpu;
@@ -481,7 +485,7 @@ out:
        return ERR_PTR(err);
 }
 
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
@@ -492,15 +496,32 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
        kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
 }
 
-int kvmppc_core_init_vm(struct kvm *kvm)
+static int kvmppc_core_init_vm_e500(struct kvm *kvm)
 {
        return 0;
 }
 
-void kvmppc_core_destroy_vm(struct kvm *kvm)
+static void kvmppc_core_destroy_vm_e500(struct kvm *kvm)
 {
 }
 
+static struct kvmppc_ops kvm_ops_e500 = {
+       .get_sregs = kvmppc_core_get_sregs_e500,
+       .set_sregs = kvmppc_core_set_sregs_e500,
+       .get_one_reg = kvmppc_get_one_reg_e500,
+       .set_one_reg = kvmppc_set_one_reg_e500,
+       .vcpu_load   = kvmppc_core_vcpu_load_e500,
+       .vcpu_put    = kvmppc_core_vcpu_put_e500,
+       .vcpu_create = kvmppc_core_vcpu_create_e500,
+       .vcpu_free   = kvmppc_core_vcpu_free_e500,
+       .mmu_destroy  = kvmppc_mmu_destroy_e500,
+       .init_vm = kvmppc_core_init_vm_e500,
+       .destroy_vm = kvmppc_core_destroy_vm_e500,
+       .emulate_op = kvmppc_core_emulate_op_e500,
+       .emulate_mtspr = kvmppc_core_emulate_mtspr_e500,
+       .emulate_mfspr = kvmppc_core_emulate_mfspr_e500,
+};
+
 static int __init kvmppc_e500_init(void)
 {
        int r, i;
@@ -512,11 +533,11 @@ static int __init kvmppc_e500_init(void)
 
        r = kvmppc_core_check_processor_compat();
        if (r)
-               return r;
+               goto err_out;
 
        r = kvmppc_booke_init();
        if (r)
-               return r;
+               goto err_out;
 
        /* copy extra E500 exception handlers */
        ivor[0] = mfspr(SPRN_IVOR32);
@@ -534,11 +555,19 @@ static int __init kvmppc_e500_init(void)
        flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
                           ivor[max_ivor] + handler_len);
 
-       return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+       if (r)
+               goto err_out;
+       kvm_ops_e500.owner = THIS_MODULE;
+       kvmppc_pr_ops = &kvm_ops_e500;
+
+err_out:
+       return r;
 }
 
 static void __exit kvmppc_e500_exit(void)
 {
+       kvmppc_pr_ops = NULL;
        kvmppc_booke_exit();
 }
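Renaming every backend entry point to a static _e500 function and gathering the pointers into kvm_ops_e500 replaces link-time binding with runtime dispatch through kvm->arch.kvm_ops, which is what lets two backend implementations coexist in one kernel. A reduced model of the pattern; every name in it is hypothetical:

    #include <stddef.h>

    struct vm;

    struct backend_ops {
            int  (*init_vm)(struct vm *vm);
            void (*destroy_vm)(struct vm *vm);
    };

    struct vm {
            const struct backend_ops *ops;  /* chosen when the VM is made */
    };

    static int  demo_init_vm(struct vm *vm)    { (void)vm; return 0; }
    static void demo_destroy_vm(struct vm *vm) { (void)vm; }

    static const struct backend_ops demo_ops = {
            .init_vm    = demo_init_vm,
            .destroy_vm = demo_destroy_vm,
    };

    /* common code never names the backend directly any more */
    static int core_init_vm(struct vm *vm)
    {
            vm->ops = &demo_ops;            /* module registration step */
            return vm->ops->init_vm(vm);
    }
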
 
index c2e5e98..4fd9650 100644 (file)
@@ -117,7 +117,7 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
 #define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW)
 #define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW)
 #define MAS2_ATTRIB_MASK \
-         (MAS2_X0 | MAS2_X1)
+         (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G)
 #define MAS3_ATTRIB_MASK \
          (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
           | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
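Widening MAS2_ATTRIB_MASK with MAS2_E and MAS2_G lets guest-chosen endianness (E) and guarded (G) attributes survive the filter and reach the shadow TLB instead of being silently dropped. A hedged illustration of the masking step; the bit values mirror the e500 MAS2 layout, but everything else here is hypothetical:

    #include <stdint.h>

    #define MAS2_X0 0x40
    #define MAS2_X1 0x20
    #define MAS2_G  0x02
    #define MAS2_E  0x01
    #define MAS2_ATTRIB_MASK (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G)

    /* keep only the attribute bits the guest may control */
    static uint64_t filter_guest_mas2(uint64_t guest_mas2)
    {
            return guest_mas2 & MAS2_ATTRIB_MASK;
    }
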
index b10a012..89b7f82 100644 (file)
@@ -26,6 +26,7 @@
 #define XOP_TLBRE   946
 #define XOP_TLBWE   978
 #define XOP_TLBILX  18
+#define XOP_EHPRIV  270
 
 #ifdef CONFIG_KVM_E500MC
 static int dbell2prio(ulong param)
@@ -82,8 +83,28 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb)
 }
 #endif
 
-int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                           unsigned int inst, int *advance)
+static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                  unsigned int inst, int *advance)
+{
+       int emulated = EMULATE_DONE;
+
+       switch (get_oc(inst)) {
+       case EHPRIV_OC_DEBUG:
+               run->exit_reason = KVM_EXIT_DEBUG;
+               run->debug.arch.address = vcpu->arch.pc;
+               run->debug.arch.status = 0;
+               kvmppc_account_exit(vcpu, DEBUG_EXITS);
+               emulated = EMULATE_EXIT_USER;
+               *advance = 0;
+               break;
+       default:
+               emulated = EMULATE_FAIL;
+       }
+       return emulated;
+}
+
+int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                               unsigned int inst, int *advance)
 {
        int emulated = EMULATE_DONE;
        int ra = get_ra(inst);
@@ -130,6 +151,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
                        break;
 
+               case XOP_EHPRIV:
+                       emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst,
+                                                          advance);
+                       break;
+
                default:
                        emulated = EMULATE_FAIL;
                }
@@ -146,7 +172,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return emulated;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
        int emulated = EMULATE_DONE;
@@ -237,7 +263,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
        return emulated;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
        int emulated = EMULATE_DONE;
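The ehpriv handler above turns EHPRIV with OC=EHPRIV_OC_DEBUG into a software breakpoint: it sets run->exit_reason to KVM_EXIT_DEBUG and returns EMULATE_EXIT_USER with *advance = 0, so the address reported to userspace is the trap site itself. A hedged sketch of the matching userspace side; the run-loop skeleton is illustrative, not part of this patch:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* 'run' is the mmap()ed kvm_run area of the vcpu fd. */
    static void run_until_breakpoint(int vcpu_fd, struct kvm_run *run)
    {
            for (;;) {
                    if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                            return;
                    if (run->exit_reason == KVM_EXIT_DEBUG) {
                            printf("sw breakpoint at 0x%llx\n",
                                   (unsigned long long)run->debug.arch.address);
                            return; /* hand control to the debugger */
                    }
                    /* other exit reasons would be handled here */
            }
    }
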
index 6d6f153..ebca6b8 100644 (file)
@@ -32,7 +32,7 @@
 #include <asm/kvm_ppc.h>
 
 #include "e500.h"
-#include "trace.h"
+#include "trace_booke.h"
 #include "timing.h"
 #include "e500_mmu_host.h"
 
@@ -536,7 +536,7 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
        return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
 }
 
-void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu)
 {
 }
 
index c65593a..ecf2247 100644 (file)
 #include <asm/kvm_ppc.h>
 
 #include "e500.h"
-#include "trace.h"
 #include "timing.h"
 #include "e500_mmu_host.h"
 
+#include "trace_booke.h"
+
 #define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
 
 static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
@@ -253,6 +254,9 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
        ref->pfn = pfn;
        ref->flags |= E500_TLB_VALID;
 
+       /* Mark the page accessed */
+       kvm_set_pfn_accessed(pfn);
+
        if (tlbe_is_writable(gtlbe))
                kvm_set_pfn_dirty(pfn);
 }
index 19c8379..4132cd2 100644 (file)
@@ -110,7 +110,7 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu);
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
@@ -147,7 +147,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        kvmppc_load_guest_fp(vcpu);
 }
 
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.eplc = mfspr(SPRN_EPLC);
        vcpu->arch.epsc = mfspr(SPRN_EPSC);
@@ -204,7 +204,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu,
+                                       struct kvm_sregs *sregs)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
@@ -224,10 +225,11 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL];
        sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];
 
-       kvmppc_get_sregs_ivor(vcpu, sregs);
+       return kvmppc_get_sregs_ivor(vcpu, sregs);
 }
 
-int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu,
+                                       struct kvm_sregs *sregs)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
        int ret;
@@ -260,21 +262,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
-int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
-                       union kvmppc_one_reg *val)
+static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id,
+                             union kvmppc_one_reg *val)
 {
        int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
        return r;
 }
 
-int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
-                      union kvmppc_one_reg *val)
+static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id,
+                             union kvmppc_one_reg *val)
 {
        int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
        return r;
 }
 
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm,
+                                                      unsigned int id)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500;
        struct kvm_vcpu *vcpu;
@@ -315,7 +318,7 @@ out:
        return ERR_PTR(err);
 }
 
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
@@ -325,7 +328,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
        kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
 }
 
-int kvmppc_core_init_vm(struct kvm *kvm)
+static int kvmppc_core_init_vm_e500mc(struct kvm *kvm)
 {
        int lpid;
 
@@ -337,27 +340,52 @@ int kvmppc_core_init_vm(struct kvm *kvm)
        return 0;
 }
 
-void kvmppc_core_destroy_vm(struct kvm *kvm)
+static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm)
 {
        kvmppc_free_lpid(kvm->arch.lpid);
 }
 
+static struct kvmppc_ops kvm_ops_e500mc = {
+       .get_sregs = kvmppc_core_get_sregs_e500mc,
+       .set_sregs = kvmppc_core_set_sregs_e500mc,
+       .get_one_reg = kvmppc_get_one_reg_e500mc,
+       .set_one_reg = kvmppc_set_one_reg_e500mc,
+       .vcpu_load   = kvmppc_core_vcpu_load_e500mc,
+       .vcpu_put    = kvmppc_core_vcpu_put_e500mc,
+       .vcpu_create = kvmppc_core_vcpu_create_e500mc,
+       .vcpu_free   = kvmppc_core_vcpu_free_e500mc,
+       .mmu_destroy  = kvmppc_mmu_destroy_e500,
+       .init_vm = kvmppc_core_init_vm_e500mc,
+       .destroy_vm = kvmppc_core_destroy_vm_e500mc,
+       .emulate_op = kvmppc_core_emulate_op_e500,
+       .emulate_mtspr = kvmppc_core_emulate_mtspr_e500,
+       .emulate_mfspr = kvmppc_core_emulate_mfspr_e500,
+};
+
 static int __init kvmppc_e500mc_init(void)
 {
        int r;
 
        r = kvmppc_booke_init();
        if (r)
-               return r;
+               goto err_out;
 
        kvmppc_init_lpid(64);
        kvmppc_claim_lpid(0); /* host */
 
-       return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+       if (r)
+               goto err_out;
+       kvm_ops_e500mc.owner = THIS_MODULE;
+       kvmppc_pr_ops = &kvm_ops_e500mc;
+
+err_out:
+       return r;
 }
 
 static void __exit kvmppc_e500mc_exit(void)
 {
+       kvmppc_pr_ops = NULL;
        kvmppc_booke_exit();
 }
 
index 751cd45..2f9a087 100644 (file)
@@ -130,8 +130,8 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
        case SPRN_PIR: break;
 
        default:
-               emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
-                                                    spr_val);
+               emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn,
+                                                                 spr_val);
                if (emulated == EMULATE_FAIL)
                        printk(KERN_INFO "mtspr: unknown spr "
                                "0x%x\n", sprn);
@@ -191,8 +191,8 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
                spr_val = kvmppc_get_dec(vcpu, get_tb());
                break;
        default:
-               emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
-                                                    &spr_val);
+               emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn,
+                                                                 &spr_val);
                if (unlikely(emulated == EMULATE_FAIL)) {
                        printk(KERN_INFO "mfspr: unknown spr "
                                "0x%x\n", sprn);
@@ -464,7 +464,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
        }
 
        if (emulated == EMULATE_FAIL) {
-               emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance);
+               emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst,
+                                                              &advance);
                if (emulated == EMULATE_AGAIN) {
                        advance = 0;
                } else if (emulated == EMULATE_FAIL) {
@@ -483,3 +484,4 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
        return emulated;
 }
+EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction);
index 07c0106..9ae9768 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/module.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
+struct kvmppc_ops *kvmppc_hv_ops;
+EXPORT_SYMBOL_GPL(kvmppc_hv_ops);
+struct kvmppc_ops *kvmppc_pr_ops;
+EXPORT_SYMBOL_GPL(kvmppc_pr_ops);
+
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
        return !!(v->arch.pending_exceptions) ||
@@ -50,7 +57,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-#ifndef CONFIG_KVM_BOOK3S_64_HV
 /*
  * Common checks before entering the guest world.  Call with interrupts
  * disabled.
@@ -125,7 +131,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 
        return r;
 }
-#endif /* CONFIG_KVM_BOOK3S_64_HV */
+EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter);
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 {
@@ -179,6 +185,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 
        return r;
 }
+EXPORT_SYMBOL_GPL(kvmppc_kvm_pv);
 
 int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
 {
@@ -192,11 +199,9 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
        if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled)
                goto out;
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
        /* HV KVM can only do PAPR mode for now */
-       if (!vcpu->arch.papr_enabled)
+       if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm))
                goto out;
-#endif
 
 #ifdef CONFIG_KVM_BOOKE_HV
        if (!cpu_has_feature(CPU_FTR_EMB_HV))
@@ -209,6 +214,7 @@ out:
        vcpu->arch.sane = r;
        return r ? 0 : -EINVAL;
 }
+EXPORT_SYMBOL_GPL(kvmppc_sanity_check);
 
 int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
@@ -243,6 +249,7 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
        return r;
 }
+EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio);
 
 int kvm_arch_hardware_enable(void *garbage)
 {
@@ -269,10 +276,35 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
-       if (type)
-               return -EINVAL;
-
+       struct kvmppc_ops *kvm_ops = NULL;
+       /*
+        * if we have both HV and PR enabled, default is HV
+        */
+       if (type == 0) {
+               if (kvmppc_hv_ops)
+                       kvm_ops = kvmppc_hv_ops;
+               else
+                       kvm_ops = kvmppc_pr_ops;
+               if (!kvm_ops)
+                       goto err_out;
+       } else if (type == KVM_VM_PPC_HV) {
+               if (!kvmppc_hv_ops)
+                       goto err_out;
+               kvm_ops = kvmppc_hv_ops;
+       } else if (type == KVM_VM_PPC_PR) {
+               if (!kvmppc_pr_ops)
+                       goto err_out;
+               kvm_ops = kvmppc_pr_ops;
+       } else
+               goto err_out;
+
+       if (kvm_ops->owner && !try_module_get(kvm_ops->owner))
+               return -ENOENT;
+
+       kvm->arch.kvm_ops = kvm_ops;
        return kvmppc_core_init_vm(kvm);
+err_out:
+       return -EINVAL;
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -292,6 +324,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvmppc_core_destroy_vm(kvm);
 
        mutex_unlock(&kvm->lock);
+
+       /* drop the module reference */
+       module_put(kvm->arch.kvm_ops->owner);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
@@ -301,6 +336,10 @@ void kvm_arch_sync_events(struct kvm *kvm)
 int kvm_dev_ioctl_check_extension(long ext)
 {
        int r;
+       /* FIXME!!
+        * Should some of this be a vm ioctl? Is that possible now?
+        */
+       int hv_enabled = kvmppc_hv_ops ? 1 : 0;
 
        switch (ext) {
 #ifdef CONFIG_BOOKE
@@ -320,22 +359,26 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_DEVICE_CTRL:
                r = 1;
                break;
-#ifndef CONFIG_KVM_BOOK3S_64_HV
        case KVM_CAP_PPC_PAIRED_SINGLES:
        case KVM_CAP_PPC_OSI:
        case KVM_CAP_PPC_GET_PVINFO:
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
        case KVM_CAP_SW_TLB:
 #endif
-#ifdef CONFIG_KVM_MPIC
-       case KVM_CAP_IRQ_MPIC:
-#endif
-               r = 1;
+               /* We support this only for PR */
+               r = !hv_enabled;
                break;
+#ifdef CONFIG_KVM_MMIO
        case KVM_CAP_COALESCED_MMIO:
                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
                break;
 #endif
+#ifdef CONFIG_KVM_MPIC
+       case KVM_CAP_IRQ_MPIC:
+               r = 1;
+               break;
+#endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
        case KVM_CAP_SPAPR_TCE:
        case KVM_CAP_PPC_ALLOC_HTAB:
@@ -346,32 +389,37 @@ int kvm_dev_ioctl_check_extension(long ext)
                r = 1;
                break;
 #endif /* CONFIG_PPC_BOOK3S_64 */
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        case KVM_CAP_PPC_SMT:
-               r = threads_per_core;
+               if (hv_enabled)
+                       r = threads_per_core;
+               else
+                       r = 0;
                break;
        case KVM_CAP_PPC_RMA:
-               r = 1;
+               r = hv_enabled;
                /* PPC970 requires an RMA */
-               if (cpu_has_feature(CPU_FTR_ARCH_201))
+               if (r && cpu_has_feature(CPU_FTR_ARCH_201))
                        r = 2;
                break;
 #endif
        case KVM_CAP_SYNC_MMU:
-#ifdef CONFIG_KVM_BOOK3S_64_HV
-               r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+               if (hv_enabled)
+                       r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
+               else
+                       r = 0;
 #elif defined(KVM_ARCH_WANT_MMU_NOTIFIER)
                r = 1;
 #else
                r = 0;
-               break;
 #endif
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+               break;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        case KVM_CAP_PPC_HTAB_FD:
-               r = 1;
+               r = hv_enabled;
                break;
 #endif
-               break;
        case KVM_CAP_NR_VCPUS:
                /*
                 * Recommending a number of CPUs is somewhat arbitrary; we
@@ -379,11 +427,10 @@ int kvm_dev_ioctl_check_extension(long ext)
                 * will have secondary threads "offline"), and for other KVM
                 * implementations just count online CPUs.
                 */
-#ifdef CONFIG_KVM_BOOK3S_64_HV
-               r = num_present_cpus();
-#else
-               r = num_online_cpus();
-#endif
+               if (hv_enabled)
+                       r = num_present_cpus();
+               else
+                       r = num_online_cpus();
                break;
        case KVM_CAP_MAX_VCPUS:
                r = KVM_MAX_VCPUS;
@@ -407,15 +454,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
        return -EINVAL;
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
 {
-       kvmppc_core_free_memslot(free, dont);
+       kvmppc_core_free_memslot(kvm, free, dont);
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+                           unsigned long npages)
 {
-       return kvmppc_core_create_memslot(slot, npages);
+       return kvmppc_core_create_memslot(kvm, slot, npages);
 }
 
 void kvm_arch_memslots_updated(struct kvm *kvm)
@@ -659,6 +707,7 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        return EMULATE_DO_MMIO;
 }
+EXPORT_SYMBOL_GPL(kvmppc_handle_load);
 
 /* Same as above, but sign extends */
 int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -720,6 +769,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        return EMULATE_DO_MMIO;
 }
+EXPORT_SYMBOL_GPL(kvmppc_handle_store);
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
@@ -1024,52 +1074,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
                goto out;
        }
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
-#ifdef CONFIG_KVM_BOOK3S_64_HV
-       case KVM_ALLOCATE_RMA: {
-               struct kvm_allocate_rma rma;
-               struct kvm *kvm = filp->private_data;
-
-               r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
-               if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
-                       r = -EFAULT;
-               break;
-       }
-
-       case KVM_PPC_ALLOCATE_HTAB: {
-               u32 htab_order;
-
-               r = -EFAULT;
-               if (get_user(htab_order, (u32 __user *)argp))
-                       break;
-               r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
-               if (r)
-                       break;
-               r = -EFAULT;
-               if (put_user(htab_order, (u32 __user *)argp))
-                       break;
-               r = 0;
-               break;
-       }
-
-       case KVM_PPC_GET_HTAB_FD: {
-               struct kvm_get_htab_fd ghf;
-
-               r = -EFAULT;
-               if (copy_from_user(&ghf, argp, sizeof(ghf)))
-                       break;
-               r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
-               break;
-       }
-#endif /* CONFIG_KVM_BOOK3S_64_HV */
-
-#ifdef CONFIG_PPC_BOOK3S_64
        case KVM_PPC_GET_SMMU_INFO: {
                struct kvm_ppc_smmu_info info;
+               struct kvm *kvm = filp->private_data;
 
                memset(&info, 0, sizeof(info));
-               r = kvm_vm_ioctl_get_smmu_info(kvm, &info);
+               r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info);
                if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
                        r = -EFAULT;
                break;
@@ -1080,11 +1090,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
                break;
        }
-#endif /* CONFIG_PPC_BOOK3S_64 */
+       default: {
+               struct kvm *kvm = filp->private_data;
+               r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
+       }
+#else /* CONFIG_PPC_BOOK3S_64 */
        default:
                r = -ENOTTY;
+#endif
        }
-
 out:
        return r;
 }
@@ -1106,22 +1120,26 @@ long kvmppc_alloc_lpid(void)
 
        return lpid;
 }
+EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid);
 
 void kvmppc_claim_lpid(long lpid)
 {
        set_bit(lpid, lpid_inuse);
 }
+EXPORT_SYMBOL_GPL(kvmppc_claim_lpid);
 
 void kvmppc_free_lpid(long lpid)
 {
        clear_bit(lpid, lpid_inuse);
 }
+EXPORT_SYMBOL_GPL(kvmppc_free_lpid);
 
 void kvmppc_init_lpid(unsigned long nr_lpids_param)
 {
        nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param);
        memset(lpid_inuse, 0, sizeof(lpid_inuse));
 }
+EXPORT_SYMBOL_GPL(kvmppc_init_lpid);
 
 int kvm_arch_init(void *opaque)
 {
@@ -1130,4 +1148,5 @@ int kvm_arch_init(void *opaque)
 
 void kvm_arch_exit(void)
 {
+
 }
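kvm_arch_init_vm() now decodes the KVM_CREATE_VM type argument: 0 keeps the old behaviour (HV when registered, PR otherwise), while KVM_VM_PPC_HV and KVM_VM_PPC_PR pin one backend and fail with -EINVAL when it is absent. A hedged userspace sketch of requesting PR explicitly:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Returns a VM fd bound to the PR backend, or -1 on failure. */
    static int create_pr_vm(void)
    {
            int sys_fd = open("/dev/kvm", O_RDWR);

            if (sys_fd < 0)
                    return -1;
            /* a type of 0 would pick HV first and fall back to PR */
            return ioctl(sys_fd, KVM_CREATE_VM, KVM_VM_PPC_PR);
    }
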
index e326489..2e0e67e 100644 (file)
@@ -31,126 +31,6 @@ TRACE_EVENT(kvm_ppc_instr,
                  __entry->inst, __entry->pc, __entry->emulate)
 );
 
-#ifdef CONFIG_PPC_BOOK3S
-#define kvm_trace_symbol_exit \
-       {0x100, "SYSTEM_RESET"}, \
-       {0x200, "MACHINE_CHECK"}, \
-       {0x300, "DATA_STORAGE"}, \
-       {0x380, "DATA_SEGMENT"}, \
-       {0x400, "INST_STORAGE"}, \
-       {0x480, "INST_SEGMENT"}, \
-       {0x500, "EXTERNAL"}, \
-       {0x501, "EXTERNAL_LEVEL"}, \
-       {0x502, "EXTERNAL_HV"}, \
-       {0x600, "ALIGNMENT"}, \
-       {0x700, "PROGRAM"}, \
-       {0x800, "FP_UNAVAIL"}, \
-       {0x900, "DECREMENTER"}, \
-       {0x980, "HV_DECREMENTER"}, \
-       {0xc00, "SYSCALL"}, \
-       {0xd00, "TRACE"}, \
-       {0xe00, "H_DATA_STORAGE"}, \
-       {0xe20, "H_INST_STORAGE"}, \
-       {0xe40, "H_EMUL_ASSIST"}, \
-       {0xf00, "PERFMON"}, \
-       {0xf20, "ALTIVEC"}, \
-       {0xf40, "VSX"}
-#else
-#define kvm_trace_symbol_exit \
-       {0, "CRITICAL"}, \
-       {1, "MACHINE_CHECK"}, \
-       {2, "DATA_STORAGE"}, \
-       {3, "INST_STORAGE"}, \
-       {4, "EXTERNAL"}, \
-       {5, "ALIGNMENT"}, \
-       {6, "PROGRAM"}, \
-       {7, "FP_UNAVAIL"}, \
-       {8, "SYSCALL"}, \
-       {9, "AP_UNAVAIL"}, \
-       {10, "DECREMENTER"}, \
-       {11, "FIT"}, \
-       {12, "WATCHDOG"}, \
-       {13, "DTLB_MISS"}, \
-       {14, "ITLB_MISS"}, \
-       {15, "DEBUG"}, \
-       {32, "SPE_UNAVAIL"}, \
-       {33, "SPE_FP_DATA"}, \
-       {34, "SPE_FP_ROUND"}, \
-       {35, "PERFORMANCE_MONITOR"}, \
-       {36, "DOORBELL"}, \
-       {37, "DOORBELL_CRITICAL"}, \
-       {38, "GUEST_DBELL"}, \
-       {39, "GUEST_DBELL_CRIT"}, \
-       {40, "HV_SYSCALL"}, \
-       {41, "HV_PRIV"}
-#endif
-
-TRACE_EVENT(kvm_exit,
-       TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
-       TP_ARGS(exit_nr, vcpu),
-
-       TP_STRUCT__entry(
-               __field(        unsigned int,   exit_nr         )
-               __field(        unsigned long,  pc              )
-               __field(        unsigned long,  msr             )
-               __field(        unsigned long,  dar             )
-#ifdef CONFIG_KVM_BOOK3S_PR
-               __field(        unsigned long,  srr1            )
-#endif
-               __field(        unsigned long,  last_inst       )
-       ),
-
-       TP_fast_assign(
-#ifdef CONFIG_KVM_BOOK3S_PR
-               struct kvmppc_book3s_shadow_vcpu *svcpu;
-#endif
-               __entry->exit_nr        = exit_nr;
-               __entry->pc             = kvmppc_get_pc(vcpu);
-               __entry->dar            = kvmppc_get_fault_dar(vcpu);
-               __entry->msr            = vcpu->arch.shared->msr;
-#ifdef CONFIG_KVM_BOOK3S_PR
-               svcpu = svcpu_get(vcpu);
-               __entry->srr1           = svcpu->shadow_srr1;
-               svcpu_put(svcpu);
-#endif
-               __entry->last_inst      = vcpu->arch.last_inst;
-       ),
-
-       TP_printk("exit=%s"
-               " | pc=0x%lx"
-               " | msr=0x%lx"
-               " | dar=0x%lx"
-#ifdef CONFIG_KVM_BOOK3S_PR
-               " | srr1=0x%lx"
-#endif
-               " | last_inst=0x%lx"
-               ,
-               __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
-               __entry->pc,
-               __entry->msr,
-               __entry->dar,
-#ifdef CONFIG_KVM_BOOK3S_PR
-               __entry->srr1,
-#endif
-               __entry->last_inst
-               )
-);
-
-TRACE_EVENT(kvm_unmap_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("unmap hva 0x%lx\n", __entry->hva)
-);
-
 TRACE_EVENT(kvm_stlb_inval,
        TP_PROTO(unsigned int stlb_index),
        TP_ARGS(stlb_index),
@@ -236,315 +116,6 @@ TRACE_EVENT(kvm_check_requests,
                __entry->cpu_nr, __entry->requests)
 );
 
-
-/*************************************************************************
- *                         Book3S trace points                           *
- *************************************************************************/
-
-#ifdef CONFIG_KVM_BOOK3S_PR
-
-TRACE_EVENT(kvm_book3s_reenter,
-       TP_PROTO(int r, struct kvm_vcpu *vcpu),
-       TP_ARGS(r, vcpu),
-
-       TP_STRUCT__entry(
-               __field(        unsigned int,   r               )
-               __field(        unsigned long,  pc              )
-       ),
-
-       TP_fast_assign(
-               __entry->r              = r;
-               __entry->pc             = kvmppc_get_pc(vcpu);
-       ),
-
-       TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
-);
-
-#ifdef CONFIG_PPC_BOOK3S_64
-
-TRACE_EVENT(kvm_book3s_64_mmu_map,
-       TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
-                struct kvmppc_pte *orig_pte),
-       TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
-
-       TP_STRUCT__entry(
-               __field(        unsigned char,          flag_w          )
-               __field(        unsigned char,          flag_x          )
-               __field(        unsigned long,          eaddr           )
-               __field(        unsigned long,          hpteg           )
-               __field(        unsigned long,          va              )
-               __field(        unsigned long long,     vpage           )
-               __field(        unsigned long,          hpaddr          )
-       ),
-
-       TP_fast_assign(
-               __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w';
-               __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x';
-               __entry->eaddr  = orig_pte->eaddr;
-               __entry->hpteg  = hpteg;
-               __entry->va     = va;
-               __entry->vpage  = orig_pte->vpage;
-               __entry->hpaddr = hpaddr;
-       ),
-
-       TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx",
-                 __entry->flag_w, __entry->flag_x, __entry->eaddr,
-                 __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr)
-);
-
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
-TRACE_EVENT(kvm_book3s_mmu_map,
-       TP_PROTO(struct hpte_cache *pte),
-       TP_ARGS(pte),
-
-       TP_STRUCT__entry(
-               __field(        u64,            host_vpn        )
-               __field(        u64,            pfn             )
-               __field(        ulong,          eaddr           )
-               __field(        u64,            vpage           )
-               __field(        ulong,          raddr           )
-               __field(        int,            flags           )
-       ),
-
-       TP_fast_assign(
-               __entry->host_vpn       = pte->host_vpn;
-               __entry->pfn            = pte->pfn;
-               __entry->eaddr          = pte->pte.eaddr;
-               __entry->vpage          = pte->pte.vpage;
-               __entry->raddr          = pte->pte.raddr;
-               __entry->flags          = (pte->pte.may_read ? 0x4 : 0) |
-                                         (pte->pte.may_write ? 0x2 : 0) |
-                                         (pte->pte.may_execute ? 0x1 : 0);
-       ),
-
-       TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
-                 __entry->host_vpn, __entry->pfn, __entry->eaddr,
-                 __entry->vpage, __entry->raddr, __entry->flags)
-);
-
-TRACE_EVENT(kvm_book3s_mmu_invalidate,
-       TP_PROTO(struct hpte_cache *pte),
-       TP_ARGS(pte),
-
-       TP_STRUCT__entry(
-               __field(        u64,            host_vpn        )
-               __field(        u64,            pfn             )
-               __field(        ulong,          eaddr           )
-               __field(        u64,            vpage           )
-               __field(        ulong,          raddr           )
-               __field(        int,            flags           )
-       ),
-
-       TP_fast_assign(
-               __entry->host_vpn       = pte->host_vpn;
-               __entry->pfn            = pte->pfn;
-               __entry->eaddr          = pte->pte.eaddr;
-               __entry->vpage          = pte->pte.vpage;
-               __entry->raddr          = pte->pte.raddr;
-               __entry->flags          = (pte->pte.may_read ? 0x4 : 0) |
-                                         (pte->pte.may_write ? 0x2 : 0) |
-                                         (pte->pte.may_execute ? 0x1 : 0);
-       ),
-
-       TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
-                 __entry->host_vpn, __entry->pfn, __entry->eaddr,
-                 __entry->vpage, __entry->raddr, __entry->flags)
-);
-
-TRACE_EVENT(kvm_book3s_mmu_flush,
-       TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1,
-                unsigned long long p2),
-       TP_ARGS(type, vcpu, p1, p2),
-
-       TP_STRUCT__entry(
-               __field(        int,                    count           )
-               __field(        unsigned long long,     p1              )
-               __field(        unsigned long long,     p2              )
-               __field(        const char *,           type            )
-       ),
-
-       TP_fast_assign(
-               __entry->count          = to_book3s(vcpu)->hpte_cache_count;
-               __entry->p1             = p1;
-               __entry->p2             = p2;
-               __entry->type           = type;
-       ),
-
-       TP_printk("Flush %d %sPTEs: %llx - %llx",
-                 __entry->count, __entry->type, __entry->p1, __entry->p2)
-);
-
-TRACE_EVENT(kvm_book3s_slb_found,
-       TP_PROTO(unsigned long long gvsid, unsigned long long hvsid),
-       TP_ARGS(gvsid, hvsid),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long long,     gvsid           )
-               __field(        unsigned long long,     hvsid           )
-       ),
-
-       TP_fast_assign(
-               __entry->gvsid          = gvsid;
-               __entry->hvsid          = hvsid;
-       ),
-
-       TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid)
-);
-
-TRACE_EVENT(kvm_book3s_slb_fail,
-       TP_PROTO(u16 sid_map_mask, unsigned long long gvsid),
-       TP_ARGS(sid_map_mask, gvsid),
-
-       TP_STRUCT__entry(
-               __field(        unsigned short,         sid_map_mask    )
-               __field(        unsigned long long,     gvsid           )
-       ),
-
-       TP_fast_assign(
-               __entry->sid_map_mask   = sid_map_mask;
-               __entry->gvsid          = gvsid;
-       ),
-
-       TP_printk("%x/%x: %llx", __entry->sid_map_mask,
-                 SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid)
-);
-
-TRACE_EVENT(kvm_book3s_slb_map,
-       TP_PROTO(u16 sid_map_mask, unsigned long long gvsid,
-                unsigned long long hvsid),
-       TP_ARGS(sid_map_mask, gvsid, hvsid),
-
-       TP_STRUCT__entry(
-               __field(        unsigned short,         sid_map_mask    )
-               __field(        unsigned long long,     guest_vsid      )
-               __field(        unsigned long long,     host_vsid       )
-       ),
-
-       TP_fast_assign(
-               __entry->sid_map_mask   = sid_map_mask;
-               __entry->guest_vsid     = gvsid;
-               __entry->host_vsid      = hvsid;
-       ),
-
-       TP_printk("%x: %llx -> %llx", __entry->sid_map_mask,
-                 __entry->guest_vsid, __entry->host_vsid)
-);
-
-TRACE_EVENT(kvm_book3s_slbmte,
-       TP_PROTO(u64 slb_vsid, u64 slb_esid),
-       TP_ARGS(slb_vsid, slb_esid),
-
-       TP_STRUCT__entry(
-               __field(        u64,    slb_vsid        )
-               __field(        u64,    slb_esid        )
-       ),
-
-       TP_fast_assign(
-               __entry->slb_vsid       = slb_vsid;
-               __entry->slb_esid       = slb_esid;
-       ),
-
-       TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid)
-);
-
-#endif /* CONFIG_PPC_BOOK3S */
-
-
-/*************************************************************************
- *                         Book3E trace points                           *
- *************************************************************************/
-
-#ifdef CONFIG_BOOKE
-
-TRACE_EVENT(kvm_booke206_stlb_write,
-       TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
-       TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
-
-       TP_STRUCT__entry(
-               __field(        __u32,  mas0            )
-               __field(        __u32,  mas8            )
-               __field(        __u32,  mas1            )
-               __field(        __u64,  mas2            )
-               __field(        __u64,  mas7_3          )
-       ),
-
-       TP_fast_assign(
-               __entry->mas0           = mas0;
-               __entry->mas8           = mas8;
-               __entry->mas1           = mas1;
-               __entry->mas2           = mas2;
-               __entry->mas7_3         = mas7_3;
-       ),
-
-       TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx",
-               __entry->mas0, __entry->mas8, __entry->mas1,
-               __entry->mas2, __entry->mas7_3)
-);
-
-TRACE_EVENT(kvm_booke206_gtlb_write,
-       TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3),
-       TP_ARGS(mas0, mas1, mas2, mas7_3),
-
-       TP_STRUCT__entry(
-               __field(        __u32,  mas0            )
-               __field(        __u32,  mas1            )
-               __field(        __u64,  mas2            )
-               __field(        __u64,  mas7_3          )
-       ),
-
-       TP_fast_assign(
-               __entry->mas0           = mas0;
-               __entry->mas1           = mas1;
-               __entry->mas2           = mas2;
-               __entry->mas7_3         = mas7_3;
-       ),
-
-       TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx",
-               __entry->mas0, __entry->mas1,
-               __entry->mas2, __entry->mas7_3)
-);
-
-TRACE_EVENT(kvm_booke206_ref_release,
-       TP_PROTO(__u64 pfn, __u32 flags),
-       TP_ARGS(pfn, flags),
-
-       TP_STRUCT__entry(
-               __field(        __u64,  pfn             )
-               __field(        __u32,  flags           )
-       ),
-
-       TP_fast_assign(
-               __entry->pfn            = pfn;
-               __entry->flags          = flags;
-       ),
-
-       TP_printk("pfn=%llx flags=%x",
-               __entry->pfn, __entry->flags)
-);
-
-TRACE_EVENT(kvm_booke_queue_irqprio,
-       TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
-       TP_ARGS(vcpu, priority),
-
-       TP_STRUCT__entry(
-               __field(        __u32,  cpu_nr          )
-               __field(        __u32,  priority                )
-               __field(        unsigned long,  pending         )
-       ),
-
-       TP_fast_assign(
-               __entry->cpu_nr         = vcpu->vcpu_id;
-               __entry->priority       = priority;
-               __entry->pending        = vcpu->arch.pending_exceptions;
-       ),
-
-       TP_printk("vcpu=%x prio=%x pending=%lx",
-               __entry->cpu_nr, __entry->priority, __entry->pending)
-);
-
-#endif
-
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
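Splitting trace.h means each module now creates only the tracepoints it can actually fire: the BookE events move to trace_booke.h (new file below) and the Book3S PR events to trace_pr.h. Both keep the standard ftrace pattern, where exactly one .c file defines CREATE_TRACE_POINTS before including the header. A rough, illustrative sketch; the caller name is hypothetical:

    /* In the single .c file that owns these events (illustrative): */
    #define CREATE_TRACE_POINTS
    #include "trace_booke.h" /* TRACE_EVENT() now expands to definitions */

    /* Other users include the header normally and call the tracepoint: */
    static void report_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr)
    {
            trace_kvm_exit(exit_nr, vcpu);
    }
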
diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h
new file mode 100644 (file)
index 0000000..f7537cf
--- /dev/null
@@ -0,0 +1,177 @@
+#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_BOOKE_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm_booke
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_booke
+
+#define kvm_trace_symbol_exit \
+       {0, "CRITICAL"}, \
+       {1, "MACHINE_CHECK"}, \
+       {2, "DATA_STORAGE"}, \
+       {3, "INST_STORAGE"}, \
+       {4, "EXTERNAL"}, \
+       {5, "ALIGNMENT"}, \
+       {6, "PROGRAM"}, \
+       {7, "FP_UNAVAIL"}, \
+       {8, "SYSCALL"}, \
+       {9, "AP_UNAVAIL"}, \
+       {10, "DECREMENTER"}, \
+       {11, "FIT"}, \
+       {12, "WATCHDOG"}, \
+       {13, "DTLB_MISS"}, \
+       {14, "ITLB_MISS"}, \
+       {15, "DEBUG"}, \
+       {32, "SPE_UNAVAIL"}, \
+       {33, "SPE_FP_DATA"}, \
+       {34, "SPE_FP_ROUND"}, \
+       {35, "PERFORMANCE_MONITOR"}, \
+       {36, "DOORBELL"}, \
+       {37, "DOORBELL_CRITICAL"}, \
+       {38, "GUEST_DBELL"}, \
+       {39, "GUEST_DBELL_CRIT"}, \
+       {40, "HV_SYSCALL"}, \
+       {41, "HV_PRIV"}
+
+TRACE_EVENT(kvm_exit,
+       TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+       TP_ARGS(exit_nr, vcpu),
+
+       TP_STRUCT__entry(
+               __field(        unsigned int,   exit_nr         )
+               __field(        unsigned long,  pc              )
+               __field(        unsigned long,  msr             )
+               __field(        unsigned long,  dar             )
+               __field(        unsigned long,  last_inst       )
+       ),
+
+       TP_fast_assign(
+               __entry->exit_nr        = exit_nr;
+               __entry->pc             = kvmppc_get_pc(vcpu);
+               __entry->dar            = kvmppc_get_fault_dar(vcpu);
+               __entry->msr            = vcpu->arch.shared->msr;
+               __entry->last_inst      = vcpu->arch.last_inst;
+       ),
+
+       TP_printk("exit=%s"
+               " | pc=0x%lx"
+               " | msr=0x%lx"
+               " | dar=0x%lx"
+               " | last_inst=0x%lx"
+               ,
+               __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
+               __entry->pc,
+               __entry->msr,
+               __entry->dar,
+               __entry->last_inst
+               )
+);
+
+TRACE_EVENT(kvm_unmap_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("unmap hva 0x%lx\n", __entry->hva)
+);
+
+TRACE_EVENT(kvm_booke206_stlb_write,
+       TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
+       TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
+
+       TP_STRUCT__entry(
+               __field(        __u32,  mas0            )
+               __field(        __u32,  mas8            )
+               __field(        __u32,  mas1            )
+               __field(        __u64,  mas2            )
+               __field(        __u64,  mas7_3          )
+       ),
+
+       TP_fast_assign(
+               __entry->mas0           = mas0;
+               __entry->mas8           = mas8;
+               __entry->mas1           = mas1;
+               __entry->mas2           = mas2;
+               __entry->mas7_3         = mas7_3;
+       ),
+
+       TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx",
+               __entry->mas0, __entry->mas8, __entry->mas1,
+               __entry->mas2, __entry->mas7_3)
+);
+
+TRACE_EVENT(kvm_booke206_gtlb_write,
+       TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3),
+       TP_ARGS(mas0, mas1, mas2, mas7_3),
+
+       TP_STRUCT__entry(
+               __field(        __u32,  mas0            )
+               __field(        __u32,  mas1            )
+               __field(        __u64,  mas2            )
+               __field(        __u64,  mas7_3          )
+       ),
+
+       TP_fast_assign(
+               __entry->mas0           = mas0;
+               __entry->mas1           = mas1;
+               __entry->mas2           = mas2;
+               __entry->mas7_3         = mas7_3;
+       ),
+
+       TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx",
+               __entry->mas0, __entry->mas1,
+               __entry->mas2, __entry->mas7_3)
+);
+
+TRACE_EVENT(kvm_booke206_ref_release,
+       TP_PROTO(__u64 pfn, __u32 flags),
+       TP_ARGS(pfn, flags),
+
+       TP_STRUCT__entry(
+               __field(        __u64,  pfn             )
+               __field(        __u32,  flags           )
+       ),
+
+       TP_fast_assign(
+               __entry->pfn            = pfn;
+               __entry->flags          = flags;
+       ),
+
+       TP_printk("pfn=%llx flags=%x",
+               __entry->pfn, __entry->flags)
+);
+
+TRACE_EVENT(kvm_booke_queue_irqprio,
+       TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
+       TP_ARGS(vcpu, priority),
+
+       TP_STRUCT__entry(
+               __field(        __u32,  cpu_nr          )
+               __field(        __u32,  priority        )
+               __field(        unsigned long,  pending         )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu_nr         = vcpu->vcpu_id;
+               __entry->priority       = priority;
+               __entry->pending        = vcpu->arch.pending_exceptions;
+       ),
+
+       TP_printk("vcpu=%x prio=%x pending=%lx",
+               __entry->cpu_nr, __entry->priority, __entry->pending)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h
new file mode 100644 (file)
index 0000000..8b22e47
--- /dev/null
@@ -0,0 +1,297 @@
+
+#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_PR_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm_pr
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_pr
+
+#define kvm_trace_symbol_exit \
+       {0x100, "SYSTEM_RESET"}, \
+       {0x200, "MACHINE_CHECK"}, \
+       {0x300, "DATA_STORAGE"}, \
+       {0x380, "DATA_SEGMENT"}, \
+       {0x400, "INST_STORAGE"}, \
+       {0x480, "INST_SEGMENT"}, \
+       {0x500, "EXTERNAL"}, \
+       {0x501, "EXTERNAL_LEVEL"}, \
+       {0x502, "EXTERNAL_HV"}, \
+       {0x600, "ALIGNMENT"}, \
+       {0x700, "PROGRAM"}, \
+       {0x800, "FP_UNAVAIL"}, \
+       {0x900, "DECREMENTER"}, \
+       {0x980, "HV_DECREMENTER"}, \
+       {0xc00, "SYSCALL"}, \
+       {0xd00, "TRACE"}, \
+       {0xe00, "H_DATA_STORAGE"}, \
+       {0xe20, "H_INST_STORAGE"}, \
+       {0xe40, "H_EMUL_ASSIST"}, \
+       {0xf00, "PERFMON"}, \
+       {0xf20, "ALTIVEC"}, \
+       {0xf40, "VSX"}
+
+TRACE_EVENT(kvm_book3s_reenter,
+       TP_PROTO(int r, struct kvm_vcpu *vcpu),
+       TP_ARGS(r, vcpu),
+
+       TP_STRUCT__entry(
+               __field(        unsigned int,   r               )
+               __field(        unsigned long,  pc              )
+       ),
+
+       TP_fast_assign(
+               __entry->r              = r;
+               __entry->pc             = kvmppc_get_pc(vcpu);
+       ),
+
+       TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
+);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+TRACE_EVENT(kvm_book3s_64_mmu_map,
+       TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
+                struct kvmppc_pte *orig_pte),
+       TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
+
+       TP_STRUCT__entry(
+               __field(        unsigned char,          flag_w          )
+               __field(        unsigned char,          flag_x          )
+               __field(        unsigned long,          eaddr           )
+               __field(        unsigned long,          hpteg           )
+               __field(        unsigned long,          va              )
+               __field(        unsigned long long,     vpage           )
+               __field(        unsigned long,          hpaddr          )
+       ),
+
+       TP_fast_assign(
+               __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w';
+               __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x';
+               __entry->eaddr  = orig_pte->eaddr;
+               __entry->hpteg  = hpteg;
+               __entry->va     = va;
+               __entry->vpage  = orig_pte->vpage;
+               __entry->hpaddr = hpaddr;
+       ),
+
+       TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx",
+                 __entry->flag_w, __entry->flag_x, __entry->eaddr,
+                 __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr)
+);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+TRACE_EVENT(kvm_book3s_mmu_map,
+       TP_PROTO(struct hpte_cache *pte),
+       TP_ARGS(pte),
+
+       TP_STRUCT__entry(
+               __field(        u64,            host_vpn        )
+               __field(        u64,            pfn             )
+               __field(        ulong,          eaddr           )
+               __field(        u64,            vpage           )
+               __field(        ulong,          raddr           )
+               __field(        int,            flags           )
+       ),
+
+       TP_fast_assign(
+               __entry->host_vpn       = pte->host_vpn;
+               __entry->pfn            = pte->pfn;
+               __entry->eaddr          = pte->pte.eaddr;
+               __entry->vpage          = pte->pte.vpage;
+               __entry->raddr          = pte->pte.raddr;
+               __entry->flags          = (pte->pte.may_read ? 0x4 : 0) |
+                                         (pte->pte.may_write ? 0x2 : 0) |
+                                         (pte->pte.may_execute ? 0x1 : 0);
+       ),
+
+       TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+                 __entry->host_vpn, __entry->pfn, __entry->eaddr,
+                 __entry->vpage, __entry->raddr, __entry->flags)
+);
+
+TRACE_EVENT(kvm_book3s_mmu_invalidate,
+       TP_PROTO(struct hpte_cache *pte),
+       TP_ARGS(pte),
+
+       TP_STRUCT__entry(
+               __field(        u64,            host_vpn        )
+               __field(        u64,            pfn             )
+               __field(        ulong,          eaddr           )
+               __field(        u64,            vpage           )
+               __field(        ulong,          raddr           )
+               __field(        int,            flags           )
+       ),
+
+       TP_fast_assign(
+               __entry->host_vpn       = pte->host_vpn;
+               __entry->pfn            = pte->pfn;
+               __entry->eaddr          = pte->pte.eaddr;
+               __entry->vpage          = pte->pte.vpage;
+               __entry->raddr          = pte->pte.raddr;
+               __entry->flags          = (pte->pte.may_read ? 0x4 : 0) |
+                                         (pte->pte.may_write ? 0x2 : 0) |
+                                         (pte->pte.may_execute ? 0x1 : 0);
+       ),
+
+       TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+                 __entry->host_vpn, __entry->pfn, __entry->eaddr,
+                 __entry->vpage, __entry->raddr, __entry->flags)
+);
+
+TRACE_EVENT(kvm_book3s_mmu_flush,
+       TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1,
+                unsigned long long p2),
+       TP_ARGS(type, vcpu, p1, p2),
+
+       TP_STRUCT__entry(
+               __field(        int,                    count           )
+               __field(        unsigned long long,     p1              )
+               __field(        unsigned long long,     p2              )
+               __field(        const char *,           type            )
+       ),
+
+       TP_fast_assign(
+               __entry->count          = to_book3s(vcpu)->hpte_cache_count;
+               __entry->p1             = p1;
+               __entry->p2             = p2;
+               __entry->type           = type;
+       ),
+
+       TP_printk("Flush %d %sPTEs: %llx - %llx",
+                 __entry->count, __entry->type, __entry->p1, __entry->p2)
+);
+
+TRACE_EVENT(kvm_book3s_slb_found,
+       TP_PROTO(unsigned long long gvsid, unsigned long long hvsid),
+       TP_ARGS(gvsid, hvsid),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long long,     gvsid           )
+               __field(        unsigned long long,     hvsid           )
+       ),
+
+       TP_fast_assign(
+               __entry->gvsid          = gvsid;
+               __entry->hvsid          = hvsid;
+       ),
+
+       TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_fail,
+       TP_PROTO(u16 sid_map_mask, unsigned long long gvsid),
+       TP_ARGS(sid_map_mask, gvsid),
+
+       TP_STRUCT__entry(
+               __field(        unsigned short,         sid_map_mask    )
+               __field(        unsigned long long,     gvsid           )
+       ),
+
+       TP_fast_assign(
+               __entry->sid_map_mask   = sid_map_mask;
+               __entry->gvsid          = gvsid;
+       ),
+
+       TP_printk("%x/%x: %llx", __entry->sid_map_mask,
+                 SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_map,
+       TP_PROTO(u16 sid_map_mask, unsigned long long gvsid,
+                unsigned long long hvsid),
+       TP_ARGS(sid_map_mask, gvsid, hvsid),
+
+       TP_STRUCT__entry(
+               __field(        unsigned short,         sid_map_mask    )
+               __field(        unsigned long long,     guest_vsid      )
+               __field(        unsigned long long,     host_vsid       )
+       ),
+
+       TP_fast_assign(
+               __entry->sid_map_mask   = sid_map_mask;
+               __entry->guest_vsid     = gvsid;
+               __entry->host_vsid      = hvsid;
+       ),
+
+       TP_printk("%x: %llx -> %llx", __entry->sid_map_mask,
+                 __entry->guest_vsid, __entry->host_vsid)
+);
+
+TRACE_EVENT(kvm_book3s_slbmte,
+       TP_PROTO(u64 slb_vsid, u64 slb_esid),
+       TP_ARGS(slb_vsid, slb_esid),
+
+       TP_STRUCT__entry(
+               __field(        u64,    slb_vsid        )
+               __field(        u64,    slb_esid        )
+       ),
+
+       TP_fast_assign(
+               __entry->slb_vsid       = slb_vsid;
+               __entry->slb_esid       = slb_esid;
+       ),
+
+       TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid)
+);
+
+TRACE_EVENT(kvm_exit,
+       TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+       TP_ARGS(exit_nr, vcpu),
+
+       TP_STRUCT__entry(
+               __field(        unsigned int,   exit_nr         )
+               __field(        unsigned long,  pc              )
+               __field(        unsigned long,  msr             )
+               __field(        unsigned long,  dar             )
+               __field(        unsigned long,  srr1            )
+               __field(        unsigned long,  last_inst       )
+       ),
+
+       TP_fast_assign(
+               __entry->exit_nr        = exit_nr;
+               __entry->pc             = kvmppc_get_pc(vcpu);
+               __entry->dar            = kvmppc_get_fault_dar(vcpu);
+               __entry->msr            = vcpu->arch.shared->msr;
+               __entry->srr1           = vcpu->arch.shadow_srr1;
+               __entry->last_inst      = vcpu->arch.last_inst;
+       ),
+
+       TP_printk("exit=%s"
+               " | pc=0x%lx"
+               " | msr=0x%lx"
+               " | dar=0x%lx"
+               " | srr1=0x%lx"
+               " | last_inst=0x%lx"
+               ,
+               __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
+               __entry->pc,
+               __entry->msr,
+               __entry->dar,
+               __entry->srr1,
+               __entry->last_inst
+               )
+);
+
+TRACE_EVENT(kvm_unmap_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("unmap hva 0x%lx\n", __entry->hva)
+);
+
+#endif /* _TRACE_KVM_PR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index 6c856fb..5b96017 100644 (file)
@@ -121,7 +121,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
        ptepage = alloc_pages(flags, 0);
        if (!ptepage)
                return NULL;
-       pgtable_page_ctor(ptepage);
+       if (!pgtable_page_ctor(ptepage)) {
+               __free_page(ptepage);
+               return NULL;
+       }
        return ptepage;
 }
 
index 536eec7..9d95786 100644 (file)
@@ -378,6 +378,10 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
                                       __GFP_REPEAT | __GFP_ZERO);
        if (!page)
                return NULL;
+       if (!kernel && !pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
 
        ret = page_address(page);
        spin_lock(&mm->page_table_lock);
@@ -392,9 +396,6 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
        }
        spin_unlock(&mm->page_table_lock);
 
-       if (!kernel)
-               pgtable_page_ctor(page);
-
        return (pte_t *)ret;
 }
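
The two hunks above follow the same conversion made tree-wide in this pull:
with split page table locks, pgtable_page_ctor() may need to allocate the
per-page lock and can therefore fail, so every caller must check its result
and free the page on failure. A minimal sketch of the pattern the converted
allocators converge on:

    #include <linux/mm.h>

    pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
    {
        struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

        if (!page)
            return NULL;
        if (!pgtable_page_ctor(page)) {    /* may fail: ptlock allocation */
            __free_page(page);
            return NULL;
        }
        return page;
    }
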
 
index fc536f2..7553b6a 100644 (file)
@@ -452,7 +452,7 @@ static int kw_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
         */
        if (use_irq) {
                /* Clear completion */
-               INIT_COMPLETION(host->complete);
+               reinit_completion(&host->complete);
                /* Ack stale interrupts */
                kw_write_reg(reg_isr, kw_read_reg(reg_isr));
                /* Arm timeout */
@@ -717,7 +717,7 @@ static int pmu_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
                        return -EINVAL;
                }
 
-               INIT_COMPLETION(comp);
+               reinit_completion(&comp);
                req->data[0] = PMU_I2C_CMD;
                req->reply[0] = 0xff;
                req->nbytes = sizeof(struct pmu_i2c_hdr) + 1;
@@ -748,7 +748,7 @@ static int pmu_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
 
                hdr->bus = PMU_I2C_BUS_STATUS;
 
-               INIT_COMPLETION(comp);
+               reinit_completion(&comp);
                req->data[0] = PMU_I2C_CMD;
                req->reply[0] = 0xff;
                req->nbytes = 2;
index 5f997e7..16a2552 100644 (file)
@@ -106,7 +106,7 @@ static int pseries_prepare_late(void)
        atomic_set(&suspend_data.done, 0);
        atomic_set(&suspend_data.error, 0);
        suspend_data.complete = &suspend_work;
-       INIT_COMPLETION(suspend_work);
+       reinit_completion(&suspend_work);
        return 0;
 }
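
The INIT_COMPLETION() -> reinit_completion() conversions above are part of a
tree-wide rename: the new helper takes a pointer and only resets ->done,
making it explicit that it is for reusing an already-initialized completion.
Typical usage, sketched:

    #include <linux/completion.h>
    #include <linux/errno.h>
    #include <linux/jiffies.h>

    static DECLARE_COMPLETION(xfer_done);

    static int one_transfer(void)
    {
        reinit_completion(&xfer_done);    /* was INIT_COMPLETION(xfer_done) */
        /* ... start the asynchronous operation, which calls
         * complete(&xfer_done) when it finishes ...
         */
        if (!wait_for_completion_timeout(&xfer_done, HZ))
            return -ETIMEDOUT;
        return 0;
    }
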
 
index f75d7e5..314fced 100644 (file)
@@ -141,7 +141,6 @@ config S390
        select OLD_SIGACTION
        select OLD_SIGSUSPEND3
        select SYSCTL_EXCEPTION_TRACE
-       select USE_GENERIC_SMP_HELPERS if SMP
        select VIRT_CPU_ACCOUNTING
        select VIRT_TO_BUS
 
index e87ecaa..d5bc375 100644 (file)
@@ -38,13 +38,6 @@ struct sca_block {
        struct sca_entry cpu[64];
 } __attribute__((packed));
 
-#define KVM_NR_PAGE_SIZES 2
-#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8)
-#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
-#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
-#define KVM_HPAGE_MASK(x)      (~(KVM_HPAGE_SIZE(x) - 1))
-#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
-
 #define CPUSTAT_STOPPED    0x80000000
 #define CPUSTAT_WAIT       0x10000000
 #define CPUSTAT_ECALL_PEND 0x08000000
@@ -220,7 +213,6 @@ struct kvm_s390_interrupt_info {
 /* for local_interrupt.action_flags */
 #define ACTION_STORE_ON_STOP           (1<<0)
 #define ACTION_STOP_ON_STOP            (1<<1)
-#define ACTION_RELOADVCPU_ON_STOP      (1<<2)
 
 struct kvm_s390_local_interrupt {
        spinlock_t lock;
index 3a74d8a..78d967f 100644 (file)
@@ -107,14 +107,13 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
 
 static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
 {
-       int ret, idx;
+       int ret;
 
        /* No virtio-ccw notification? Get out quickly. */
        if (!vcpu->kvm->arch.css_support ||
            (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
                return -EOPNOTSUPP;
 
-       idx = srcu_read_lock(&vcpu->kvm->srcu);
        /*
         * The layout is as follows:
         * - gpr 2 contains the subchannel id (passed as addr)
@@ -125,7 +124,6 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
                                      vcpu->run->s.regs.gprs[2],
                                      8, &vcpu->run->s.regs.gprs[3],
                                      vcpu->run->s.regs.gprs[4]);
-       srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
        /*
         * Return cookie in gpr 2, but don't overwrite the register if the
index 99d789e..374a439 100644 (file)
 #include <asm/uaccess.h>
 #include "kvm-s390.h"
 
+/* Convert real to absolute address by applying the prefix of the CPU */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+                                                unsigned long gaddr)
+{
+       unsigned long prefix  = vcpu->arch.sie_block->prefix;
+       if (gaddr < 2 * PAGE_SIZE)
+               gaddr += prefix;
+       else if (gaddr >= prefix && gaddr < prefix + 2 * PAGE_SIZE)
+               gaddr -= prefix;
+       return gaddr;
+}
+
 static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu,
                                          void __user *gptr,
                                          int prefixing)
 {
-       unsigned long prefix  = vcpu->arch.sie_block->prefix;
        unsigned long gaddr = (unsigned long) gptr;
        unsigned long uaddr;
 
-       if (prefixing) {
-               if (gaddr < 2 * PAGE_SIZE)
-                       gaddr += prefix;
-               else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE))
-                       gaddr -= prefix;
-       }
+       if (prefixing)
+               gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
        uaddr = gmap_fault(gaddr, vcpu->arch.gmap);
        if (IS_ERR_VALUE(uaddr))
                uaddr = -EFAULT;
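
For readers not versed in s390: the prefixing factored out into
kvm_s390_real_to_abs() swaps the first 8KB of real storage with the 8KB at
the CPU's prefix when forming absolute addresses, giving each CPU a private
low-core. A standalone restatement with sample values (illustrative only):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    static unsigned long real_to_abs(unsigned long gaddr, unsigned long prefix)
    {
        if (gaddr < 2 * PAGE_SIZE)
            return gaddr + prefix;        /* low core -> prefix area */
        if (gaddr >= prefix && gaddr < prefix + 2 * PAGE_SIZE)
            return gaddr - prefix;        /* prefix area -> low core */
        return gaddr;                     /* all other addresses map 1:1 */
    }

    int main(void)
    {
        unsigned long prefix = 0x10000;

        printf("%#lx\n", real_to_abs(0x100, prefix));      /* 0x10100 */
        printf("%#lx\n", real_to_abs(0x10100, prefix));    /* 0x100 */
        return 0;
    }
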
index 5ee56e5..5ddbbde 100644 (file)
@@ -62,12 +62,6 @@ static int handle_stop(struct kvm_vcpu *vcpu)
 
        trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
 
-       if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
-               vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
-               rc = SIE_INTERCEPT_RERUNVCPU;
-               vcpu->run->exit_reason = KVM_EXIT_INTR;
-       }
-
        if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) {
                atomic_set_mask(CPUSTAT_STOPPED,
                                &vcpu->arch.sie_block->cpuflags);
index 7f1f7ac..5f79d2d 100644 (file)
@@ -436,6 +436,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
        hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
        VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
 no_timer:
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        spin_lock(&vcpu->arch.local_int.float_int->lock);
        spin_lock_bh(&vcpu->arch.local_int.lock);
        add_wait_queue(&vcpu->wq, &wait);
@@ -455,6 +456,8 @@ no_timer:
        remove_wait_queue(&vcpu->wq, &wait);
        spin_unlock_bh(&vcpu->arch.local_int.lock);
        spin_unlock(&vcpu->arch.local_int.float_int->lock);
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
        hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
        return 0;
 }
index ed8064c..569494e 100644 (file)
@@ -695,9 +695,9 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static int __vcpu_run(struct kvm_vcpu *vcpu)
+static int vcpu_pre_run(struct kvm_vcpu *vcpu)
 {
-       int rc;
+       int rc, cpuflags;
 
        memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16);
 
@@ -715,28 +715,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                return rc;
 
        vcpu->arch.sie_block->icptcode = 0;
-       VCPU_EVENT(vcpu, 6, "entering sie flags %x",
-                  atomic_read(&vcpu->arch.sie_block->cpuflags));
-       trace_kvm_s390_sie_enter(vcpu,
-                                atomic_read(&vcpu->arch.sie_block->cpuflags));
+       cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
+       VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
+       trace_kvm_s390_sie_enter(vcpu, cpuflags);
 
-       /*
-        * As PF_VCPU will be used in fault handler, between guest_enter
-        * and guest_exit should be no uaccess.
-        */
-       preempt_disable();
-       kvm_guest_enter();
-       preempt_enable();
-       rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
-       kvm_guest_exit();
+       return 0;
+}
+
+static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
+{
+       int rc;
 
        VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
                   vcpu->arch.sie_block->icptcode);
        trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
 
-       if (rc > 0)
+       if (exit_reason >= 0) {
                rc = 0;
-       if (rc < 0) {
+       } else {
                if (kvm_is_ucontrol(vcpu->kvm)) {
                        rc = SIE_INTERCEPT_UCONTROL;
                } else {
@@ -747,6 +743,49 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
        }
 
        memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
+
+       if (rc == 0) {
+               if (kvm_is_ucontrol(vcpu->kvm))
+                       rc = -EOPNOTSUPP;
+               else
+                       rc = kvm_handle_sie_intercept(vcpu);
+       }
+
+       return rc;
+}
+
+static int __vcpu_run(struct kvm_vcpu *vcpu)
+{
+       int rc, exit_reason;
+
+       /*
+        * We try to hold kvm->srcu during most of vcpu_run (except when
+        * running the guest), so that memslots (and other stuff) are protected
+        */
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       do {
+               rc = vcpu_pre_run(vcpu);
+               if (rc)
+                       break;
+
+               srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+               /*
+                * As PF_VCPU will be used in fault handler, between
+                * guest_enter and guest_exit should be no uaccess.
+                */
+               preempt_disable();
+               kvm_guest_enter();
+               preempt_enable();
+               exit_reason = sie64a(vcpu->arch.sie_block,
+                                    vcpu->run->s.regs.gprs);
+               kvm_guest_exit();
+               vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+               rc = vcpu_post_run(vcpu, exit_reason);
+       } while (!signal_pending(current) && !rc);
+
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        return rc;
 }
 
@@ -755,7 +794,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int rc;
        sigset_t sigsaved;
 
-rerun_vcpu:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -788,19 +826,7 @@ rerun_vcpu:
        }
 
        might_fault();
-
-       do {
-               rc = __vcpu_run(vcpu);
-               if (rc)
-                       break;
-               if (kvm_is_ucontrol(vcpu->kvm))
-                       rc = -EOPNOTSUPP;
-               else
-                       rc = kvm_handle_sie_intercept(vcpu);
-       } while (!signal_pending(current) && !rc);
-
-       if (rc == SIE_INTERCEPT_RERUNVCPU)
-               goto rerun_vcpu;
+       rc = __vcpu_run(vcpu);
 
        if (signal_pending(current) && !rc) {
                kvm_run->exit_reason = KVM_EXIT_INTR;
@@ -958,6 +984,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
+       int idx;
        long r;
 
        switch (ioctl) {
@@ -971,7 +998,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
        case KVM_S390_STORE_STATUS:
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
                r = kvm_s390_vcpu_store_status(vcpu, arg);
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
                break;
        case KVM_S390_SET_INITIAL_PSW: {
                psw_t psw;
@@ -1067,12 +1096,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+                           unsigned long npages)
 {
        return 0;
 }
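
The restructuring above threads one SRCU read-side critical section through
the whole run loop, dropped only while the guest executes under SIE. The core
API contract, sketched below: srcu_read_lock() returns an index that must be
handed back to the matching srcu_read_unlock(), which is why the series
stashes it in vcpu->srcu_idx across vcpu_pre_run()/vcpu_post_run():

    #include <linux/kvm_host.h>

    static void peek_memslots(struct kvm *kvm)
    {
        int idx;

        idx = srcu_read_lock(&kvm->srcu);
        /* ... kvm->memslots may be dereferenced safely here ... */
        srcu_read_unlock(&kvm->srcu, idx);
    }
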
index dc99f1c..b44912a 100644 (file)
@@ -28,8 +28,7 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 extern unsigned long *vfacilities;
 
 /* negative values are error codes, positive values for internal conditions */
-#define SIE_INTERCEPT_RERUNVCPU                (1<<0)
-#define SIE_INTERCEPT_UCONTROL         (1<<1)
+#define SIE_INTERCEPT_UCONTROL         (1<<0)
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
 
 #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
@@ -91,8 +90,10 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
 
 static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2)
 {
-       *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20;
-       *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
+       if (r1)
+               *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20;
+       if (r2)
+               *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
 }
 
 static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
index 59200ee..2440602 100644 (file)
 #include "kvm-s390.h"
 #include "trace.h"
 
+/* Handle SCK (SET CLOCK) interception */
+static int handle_set_clock(struct kvm_vcpu *vcpu)
+{
+       struct kvm_vcpu *cpup;
+       s64 hostclk, val;
+       u64 op2;
+       int i;
+
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+       op2 = kvm_s390_get_base_disp_s(vcpu);
+       if (op2 & 7)    /* Operand must be on a doubleword boundary */
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       if (get_guest(vcpu, val, (u64 __user *) op2))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       if (store_tod_clock(&hostclk)) {
+               kvm_s390_set_psw_cc(vcpu, 3);
+               return 0;
+       }
+       val = (val - hostclk) & ~0x3fUL;
+
+       mutex_lock(&vcpu->kvm->lock);
+       kvm_for_each_vcpu(i, cpup, vcpu->kvm)
+               cpup->arch.sie_block->epoch = val;
+       mutex_unlock(&vcpu->kvm->lock);
+
+       kvm_s390_set_psw_cc(vcpu, 0);
+       return 0;
+}
+
 static int handle_set_prefix(struct kvm_vcpu *vcpu)
 {
        u64 operand2;
@@ -128,6 +160,33 @@ static int handle_skey(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int handle_test_block(struct kvm_vcpu *vcpu)
+{
+       unsigned long hva;
+       gpa_t addr;
+       int reg2;
+
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+       kvm_s390_get_regs_rre(vcpu, NULL, &reg2);
+       addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       addr = kvm_s390_real_to_abs(vcpu, addr);
+
+       hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+       if (kvm_is_error_hva(hva))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       /*
+        * We don't expect errors on modern systems, and do not care
+        * about storage keys (yet), so let's just clear the page.
+        */
+       if (clear_user((void __user *)hva, PAGE_SIZE) != 0)
+               return -EFAULT;
+       kvm_s390_set_psw_cc(vcpu, 0);
+       vcpu->run->s.regs.gprs[0] = 0;
+       return 0;
+}
+
 static int handle_tpi(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_interrupt_info *inti;
@@ -438,12 +497,14 @@ out_exception:
 
 static const intercept_handler_t b2_handlers[256] = {
        [0x02] = handle_stidp,
+       [0x04] = handle_set_clock,
        [0x10] = handle_set_prefix,
        [0x11] = handle_store_prefix,
        [0x12] = handle_store_cpu_address,
        [0x29] = handle_skey,
        [0x2a] = handle_skey,
        [0x2b] = handle_skey,
+       [0x2c] = handle_test_block,
        [0x30] = handle_io_inst,
        [0x31] = handle_io_inst,
        [0x32] = handle_io_inst,
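
On the new SCK handler: under SIE the guest's TOD clock is the host TOD plus
a per-vcpu epoch, so programming a requested guest clock reduces to
epoch = guest_tod - host_tod, with the low 6 bits masked off as the handler
does. A toy calculation with made-up values:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long host_tod  = 0x1000000000ULL;    /* sample only */
        unsigned long long guest_tod = 0x1234000000ULL;    /* asked via SCK */
        unsigned long long epoch = (guest_tod - host_tod) & ~0x3fULL;

        printf("epoch=%#llx, guest sees %#llx\n",
               epoch, host_tod + epoch);
        return 0;
    }
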
index 0a2e5e0..e794c88 100644 (file)
@@ -772,7 +772,11 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                __free_page(page);
                return NULL;
        }
-       pgtable_page_ctor(page);
+       if (!pgtable_page_ctor(page)) {
+               kfree(mp);
+               __free_page(page);
+               return NULL;
+       }
        mp->vmaddr = vmaddr & PMD_MASK;
        INIT_LIST_HEAD(&mp->mapper);
        page->index = (unsigned long) mp;
@@ -902,7 +906,10 @@ unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
                page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
                if (!page)
                        return NULL;
-               pgtable_page_ctor(page);
+               if (!pgtable_page_ctor(page)) {
+                       __free_page(page);
+                       return NULL;
+               }
                atomic_set(&page->_mapcount, 1);
                table = (unsigned long *) page_to_phys(page);
                clear_table(table, _PAGE_INVALID, PAGE_SIZE);
@@ -1244,11 +1251,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
        assert_spin_locked(&mm->page_table_lock);
 
        /* FIFO */
-       if (!mm->pmd_huge_pte)
+       if (!pmd_huge_pte(mm, pmdp))
                INIT_LIST_HEAD(lh);
        else
-               list_add(lh, (struct list_head *) mm->pmd_huge_pte);
-       mm->pmd_huge_pte = pgtable;
+               list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+       pmd_huge_pte(mm, pmdp) = pgtable;
 }
 
 pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
@@ -1260,12 +1267,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
        assert_spin_locked(&mm->page_table_lock);
 
        /* FIFO */
-       pgtable = mm->pmd_huge_pte;
+       pgtable = pmd_huge_pte(mm, pmdp);
        lh = (struct list_head *) pgtable;
        if (list_empty(lh))
-               mm->pmd_huge_pte = NULL;
+               pmd_huge_pte(mm, pmdp) = NULL;
        else {
-               mm->pmd_huge_pte = (pgtable_t) lh->next;
+               pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
                list_del(lh);
        }
        ptep = (pte_t *) pgtable;
index 716b3fd..2e06765 100644 (file)
@@ -54,9 +54,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
        struct page *pte;
 
        pte = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER);
-       if (pte) {
-               clear_highpage(pte);
-               pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       clear_highpage(pte);
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
        }
        return pte;
 }
index 224f4bc..e78561b 100644 (file)
@@ -711,7 +711,6 @@ config CC_STACKPROTECTOR
 config SMP
        bool "Symmetric multi-processing support"
        depends on SYS_SUPPORTS_SMP
-       select USE_GENERIC_SMP_HELPERS
        ---help---
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, like most personal computers, say N. If
index 8c00785..a33673b 100644 (file)
@@ -47,7 +47,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
        if (!pg)
                return NULL;
        page = virt_to_page(pg);
-       pgtable_page_ctor(page);
+       if (!pgtable_page_ctor(page)) {
+               quicklist_free(QUICK_PT, NULL, pg);
+               return NULL;
+       }
        return page;
 }
 
index 78c4fdb..05fcfc6 100644 (file)
@@ -28,7 +28,6 @@ config SPARC
        select HAVE_ARCH_JUMP_LABEL
        select GENERIC_IRQ_SHOW
        select ARCH_WANT_IPC_PARSE_VERSION
-       select USE_GENERIC_SMP_HELPERS if SMP
        select GENERIC_PCI_IOMAP
        select HAVE_NMI_WATCHDOG if SPARC64
        select HAVE_BPF_JIT
@@ -64,6 +63,7 @@ config SPARC64
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_SYSCALL_TRACEPOINTS
+       select HAVE_CONTEXT_TRACKING
        select HAVE_DEBUG_KMEMLEAK
        select RTC_DRV_CMOS
        select RTC_DRV_BQ4802
index 76092c4..f668797 100644 (file)
@@ -93,7 +93,6 @@ typedef struct {
        spinlock_t              lock;
        unsigned long           sparc64_ctx_val;
        unsigned long           huge_pte_count;
-       struct page             *pgtable_page;
        struct tsb_config       tsb_block[MM_NUM_TSBS];
        struct hv_tsb_descr     tsb_descr[MM_NUM_TSBS];
 } mm_context_t;
index e155388..aac53fc 100644 (file)
 #define DCACHE_ALIASING_POSSIBLE
 #endif
 
-#define HPAGE_SHIFT            22
+#define HPAGE_SHIFT            23
+#define REAL_HPAGE_SHIFT       22
+
+#define REAL_HPAGE_SIZE                (_AC(1,UL) << REAL_HPAGE_SHIFT)
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
 #define HPAGE_SIZE             (_AC(1,UL) << HPAGE_SHIFT)
@@ -53,8 +56,8 @@ extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct pag
 /* These are used to make use of C type-checking.. */
 typedef struct { unsigned long pte; } pte_t;
 typedef struct { unsigned long iopte; } iopte_t;
-typedef struct { unsigned int pmd; } pmd_t;
-typedef struct { unsigned int pgd; } pgd_t;
+typedef struct { unsigned long pmd; } pmd_t;
+typedef struct { unsigned long pgd; } pgd_t;
 typedef struct { unsigned long pgprot; } pgprot_t;
 
 #define pte_val(x)     ((x).pte)
@@ -73,8 +76,8 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 /* .. while these make it easier on the compiler */
 typedef unsigned long pte_t;
 typedef unsigned long iopte_t;
-typedef unsigned int pmd_t;
-typedef unsigned int pgd_t;
+typedef unsigned long pmd_t;
+typedef unsigned long pgd_t;
 typedef unsigned long pgprot_t;
 
 #define pte_val(x)     (x)
@@ -93,18 +96,44 @@ typedef unsigned long pgprot_t;
 
 typedef pte_t *pgtable_t;
 
+/* These two values define the virtual address space range in which we
+ * must forbid 64-bit user processes from making mappings.  It used to
+ * represent precisely the virtual address space hole present in most
+ * early sparc64 chips including UltraSPARC-I.  But now it also is
+ * further constrained by the limits of our page tables, which is
+ * 43-bits of virtual address.
+ */
+#define SPARC64_VA_HOLE_TOP    _AC(0xfffffc0000000000,UL)
+#define SPARC64_VA_HOLE_BOTTOM _AC(0x0000040000000000,UL)
+
+/* The next two defines specify the actual exclusion region we
+ * enforce, wherein we use a 4GB red zone on each side of the VA hole.
+ */
+#define VA_EXCLUDE_START (SPARC64_VA_HOLE_BOTTOM - (1UL << 32UL))
+#define VA_EXCLUDE_END   (SPARC64_VA_HOLE_TOP + (1UL << 32UL))
+
 #define TASK_UNMAPPED_BASE     (test_thread_flag(TIF_32BIT) ? \
-                                (_AC(0x0000000070000000,UL)) : \
-                                (_AC(0xfffff80000000000,UL) + (1UL << 32UL)))
+                                _AC(0x0000000070000000,UL) : \
+                                VA_EXCLUDE_END)
 
 #include <asm-generic/memory_model.h>
 
+#define PAGE_OFFSET_BY_BITS(X) (-(_AC(1,UL) << (X)))
+extern unsigned long PAGE_OFFSET;
+
 #endif /* !(__ASSEMBLY__) */
 
-/* We used to stick this into a hard-coded global register (%g4)
- * but that does not make sense anymore.
+/* The maximum number of physical memory address bits we support, this
+ * is used to size various tables used to manage kernel TLB misses and
+ * also the sparsemem code.
+ */
+#define MAX_PHYS_ADDRESS_BITS  47
+
+/* These two shift counts are used when indexing sparc64_valid_addr_bitmap
+ * and kpte_linear_bitmap.
  */
-#define PAGE_OFFSET            _AC(0xFFFFF80000000000,UL)
+#define ILOG2_4MB              22
+#define ILOG2_256MB            28
 
 #ifndef __ASSEMBLY__
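
The hole constants above can be sanity-checked in isolation; a throwaway
sketch of the red-zone test a 64-bit mmap() now has to pass (values copied
from the header):

    #include <stdio.h>

    #define SPARC64_VA_HOLE_TOP     0xfffffc0000000000UL
    #define SPARC64_VA_HOLE_BOTTOM  0x0000040000000000UL
    #define VA_EXCLUDE_START (SPARC64_VA_HOLE_BOTTOM - (1UL << 32))
    #define VA_EXCLUDE_END   (SPARC64_VA_HOLE_TOP + (1UL << 32))

    static int va_is_forbidden(unsigned long va)
    {
        return va >= VA_EXCLUDE_START && va < VA_EXCLUDE_END;
    }

    int main(void)
    {
        printf("%d\n", va_is_forbidden(0x0000030000000000UL));    /* 0: below */
        printf("%d\n", va_is_forbidden(0x0000050000000000UL));    /* 1: inside */
        return 0;
    }
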
 
index 3676031..8358dc1 100644 (file)
 /* PMD_SHIFT determines the size of the area a second-level page
  * table can map
  */
-#define PMD_SHIFT      (PAGE_SHIFT + (PAGE_SHIFT-4))
+#define PMD_SHIFT      (PAGE_SHIFT + (PAGE_SHIFT-3))
 #define PMD_SIZE       (_AC(1,UL) << PMD_SHIFT)
 #define PMD_MASK       (~(PMD_SIZE-1))
-#define PMD_BITS       (PAGE_SHIFT - 2)
+#define PMD_BITS       (PAGE_SHIFT - 3)
 
 /* PGDIR_SHIFT determines what a third-level page table entry can map */
-#define PGDIR_SHIFT    (PAGE_SHIFT + (PAGE_SHIFT-4) + PMD_BITS)
+#define PGDIR_SHIFT    (PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS)
 #define PGDIR_SIZE     (_AC(1,UL) << PGDIR_SHIFT)
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
-#define PGDIR_BITS     (PAGE_SHIFT - 2)
+#define PGDIR_BITS     (PAGE_SHIFT - 3)
 
-#if (PGDIR_SHIFT + PGDIR_BITS) != 44
+#if (PGDIR_SHIFT + PGDIR_BITS) != 43
 #error Page table parameters do not cover virtual address space properly.
 #endif
 
 #error PMD_SHIFT must equal HPAGE_SHIFT for transparent huge pages.
 #endif
 
-/* PMDs point to PTE tables which are 4K aligned.  */
-#define PMD_PADDR      _AC(0xfffffffe,UL)
-#define PMD_PADDR_SHIFT        _AC(11,UL)
-
-#define PMD_ISHUGE     _AC(0x00000001,UL)
-
-/* This is the PMD layout when PMD_ISHUGE is set.  With 4MB huge
- * pages, this frees up a bunch of bits in the layout that we can
- * use for the protection settings and software metadata.
- */
-#define PMD_HUGE_PADDR         _AC(0xfffff800,UL)
-#define PMD_HUGE_PROTBITS      _AC(0x000007ff,UL)
-#define PMD_HUGE_PRESENT       _AC(0x00000400,UL)
-#define PMD_HUGE_WRITE         _AC(0x00000200,UL)
-#define PMD_HUGE_DIRTY         _AC(0x00000100,UL)
-#define PMD_HUGE_ACCESSED      _AC(0x00000080,UL)
-#define PMD_HUGE_EXEC          _AC(0x00000040,UL)
-#define PMD_HUGE_SPLITTING     _AC(0x00000020,UL)
-
-/* PGDs point to PMD tables which are 8K aligned.  */
-#define PGD_PADDR      _AC(0xfffffffc,UL)
-#define PGD_PADDR_SHIFT        _AC(11,UL)
-
 #ifndef __ASSEMBLY__
 
 #include <linux/sched.h>
 
 /* Entries per page directory level. */
-#define PTRS_PER_PTE   (1UL << (PAGE_SHIFT-4))
+#define PTRS_PER_PTE   (1UL << (PAGE_SHIFT-3))
 #define PTRS_PER_PMD   (1UL << PMD_BITS)
 #define PTRS_PER_PGD   (1UL << PGDIR_BITS)
 
 #define _PAGE_VALID      _AC(0x8000000000000000,UL) /* Valid TTE            */
 #define _PAGE_R                  _AC(0x8000000000000000,UL) /* Keep ref bit uptodate*/
 #define _PAGE_SPECIAL     _AC(0x0200000000000000,UL) /* Special page         */
+#define _PAGE_PMD_HUGE    _AC(0x0100000000000000,UL) /* Huge page            */
 
 /* Advertise support for _PAGE_SPECIAL */
 #define __HAVE_ARCH_PTE_SPECIAL
 #define _PAGE_IE_4U      _AC(0x0800000000000000,UL) /* Invert Endianness    */
 #define _PAGE_SOFT2_4U   _AC(0x07FC000000000000,UL) /* Software bits, set 2 */
 #define _PAGE_SPECIAL_4U  _AC(0x0200000000000000,UL) /* Special page         */
+#define _PAGE_PMD_HUGE_4U _AC(0x0100000000000000,UL) /* Huge page            */
 #define _PAGE_RES1_4U    _AC(0x0002000000000000,UL) /* Reserved             */
 #define _PAGE_SZ32MB_4U          _AC(0x0001000000000000,UL) /* (Panther) 32MB page  */
 #define _PAGE_SZ256MB_4U  _AC(0x2001000000000000,UL) /* (Panther) 256MB page */
 #define _PAGE_READ_4V    _AC(0x0800000000000000,UL) /* Readable SW Bit      */
 #define _PAGE_WRITE_4V   _AC(0x0400000000000000,UL) /* Writable SW Bit      */
 #define _PAGE_SPECIAL_4V  _AC(0x0200000000000000,UL) /* Special page         */
+#define _PAGE_PMD_HUGE_4V _AC(0x0100000000000000,UL) /* Huge page            */
 #define _PAGE_PADDR_4V   _AC(0x00FFFFFFFFFFE000,UL) /* paddr[55:13]         */
 #define _PAGE_IE_4V      _AC(0x0000000000001000,UL) /* Invert Endianness    */
 #define _PAGE_E_4V       _AC(0x0000000000000800,UL) /* side-Effect          */
 #define _PAGE_SZBITS_4U        _PAGE_SZ8K_4U
 #define _PAGE_SZBITS_4V        _PAGE_SZ8K_4V
 
+#if REAL_HPAGE_SHIFT != 22
+#error REAL_HPAGE_SHIFT and _PAGE_SZHUGE_foo must match up
+#endif
+
 #define _PAGE_SZHUGE_4U        _PAGE_SZ4MB_4U
 #define _PAGE_SZHUGE_4V        _PAGE_SZ4MB_4V
 
@@ -239,16 +223,13 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
 #define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot);
-#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
-
-extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
-
-static inline pmd_t pmd_mkhuge(pmd_t pmd)
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 {
-       /* Do nothing, mk_pmd() does this part.  */
-       return pmd;
+       pte_t pte = pfn_pte(page_nr, pgprot);
+
+       return __pmd(pte_val(pte));
 }
+#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
 #endif
 
 /* This one can be done with two shifts.  */
@@ -309,14 +290,25 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot)
        : "=r" (mask), "=r" (tmp)
        : "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U |
               _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U | _PAGE_PRESENT_4U |
-              _PAGE_SPECIAL),
+              _PAGE_SPECIAL | _PAGE_PMD_HUGE | _PAGE_SZALL_4U),
          "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V |
               _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V | _PAGE_PRESENT_4V |
-              _PAGE_SPECIAL));
+              _PAGE_SPECIAL | _PAGE_PMD_HUGE | _PAGE_SZALL_4V));
 
        return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask));
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+       pte_t pte = __pte(pmd_val(pmd));
+
+       pte = pte_modify(pte, newprot);
+
+       return __pmd(pte_val(pte));
+}
+#endif
+
 static inline pte_t pgoff_to_pte(unsigned long off)
 {
        off <<= PAGE_SHIFT;
@@ -357,7 +349,7 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot)
  */
 #define pgprot_noncached pgprot_noncached
 
-#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline pte_t pte_mkhuge(pte_t pte)
 {
        unsigned long mask;
@@ -375,6 +367,17 @@ static inline pte_t pte_mkhuge(pte_t pte)
 
        return __pte(pte_val(pte) | mask);
 }
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+       pte_t pte = __pte(pmd_val(pmd));
+
+       pte = pte_mkhuge(pte);
+       pte_val(pte) |= _PAGE_PMD_HUGE;
+
+       return __pmd(pte_val(pte));
+}
+#endif
 #endif
 
 static inline pte_t pte_mkdirty(pte_t pte)
@@ -626,91 +629,130 @@ static inline unsigned long pte_special(pte_t pte)
        return pte_val(pte) & _PAGE_SPECIAL;
 }
 
-static inline int pmd_large(pmd_t pmd)
+static inline unsigned long pmd_large(pmd_t pmd)
 {
-       return (pmd_val(pmd) & (PMD_ISHUGE | PMD_HUGE_PRESENT)) ==
-               (PMD_ISHUGE | PMD_HUGE_PRESENT);
+       pte_t pte = __pte(pmd_val(pmd));
+
+       return (pte_val(pte) & _PAGE_PMD_HUGE) && pte_present(pte);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_young(pmd_t pmd)
+static inline unsigned long pmd_young(pmd_t pmd)
 {
-       return pmd_val(pmd) & PMD_HUGE_ACCESSED;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       return pte_young(pte);
 }
 
-static inline int pmd_write(pmd_t pmd)
+static inline unsigned long pmd_write(pmd_t pmd)
 {
-       return pmd_val(pmd) & PMD_HUGE_WRITE;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       return pte_write(pte);
 }
 
 static inline unsigned long pmd_pfn(pmd_t pmd)
 {
-       unsigned long val = pmd_val(pmd) & PMD_HUGE_PADDR;
+       pte_t pte = __pte(pmd_val(pmd));
 
-       return val >> (PAGE_SHIFT - PMD_PADDR_SHIFT);
+       return pte_pfn(pte);
 }
 
-static inline int pmd_trans_splitting(pmd_t pmd)
+static inline unsigned long pmd_trans_huge(pmd_t pmd)
 {
-       return (pmd_val(pmd) & (PMD_ISHUGE|PMD_HUGE_SPLITTING)) ==
-               (PMD_ISHUGE|PMD_HUGE_SPLITTING);
+       pte_t pte = __pte(pmd_val(pmd));
+
+       return pte_val(pte) & _PAGE_PMD_HUGE;
 }
 
-static inline int pmd_trans_huge(pmd_t pmd)
+static inline unsigned long pmd_trans_splitting(pmd_t pmd)
 {
-       return pmd_val(pmd) & PMD_ISHUGE;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       return pmd_trans_huge(pmd) && pte_special(pte);
 }
 
 #define has_transparent_hugepage() 1
 
 static inline pmd_t pmd_mkold(pmd_t pmd)
 {
-       pmd_val(pmd) &= ~PMD_HUGE_ACCESSED;
-       return pmd;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       pte = pte_mkold(pte);
+
+       return __pmd(pte_val(pte));
 }
 
 static inline pmd_t pmd_wrprotect(pmd_t pmd)
 {
-       pmd_val(pmd) &= ~PMD_HUGE_WRITE;
-       return pmd;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       pte = pte_wrprotect(pte);
+
+       return __pmd(pte_val(pte));
 }
 
 static inline pmd_t pmd_mkdirty(pmd_t pmd)
 {
-       pmd_val(pmd) |= PMD_HUGE_DIRTY;
-       return pmd;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       pte = pte_mkdirty(pte);
+
+       return __pmd(pte_val(pte));
 }
 
 static inline pmd_t pmd_mkyoung(pmd_t pmd)
 {
-       pmd_val(pmd) |= PMD_HUGE_ACCESSED;
-       return pmd;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       pte = pte_mkyoung(pte);
+
+       return __pmd(pte_val(pte));
 }
 
 static inline pmd_t pmd_mkwrite(pmd_t pmd)
 {
-       pmd_val(pmd) |= PMD_HUGE_WRITE;
-       return pmd;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       pte = pte_mkwrite(pte);
+
+       return __pmd(pte_val(pte));
 }
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-       pmd_val(pmd) &= ~PMD_HUGE_PRESENT;
+       unsigned long mask;
+
+       if (tlb_type == hypervisor)
+               mask = _PAGE_PRESENT_4V;
+       else
+               mask = _PAGE_PRESENT_4U;
+
+       pmd_val(pmd) &= ~mask;
+
        return pmd;
 }
 
 static inline pmd_t pmd_mksplitting(pmd_t pmd)
 {
-       pmd_val(pmd) |= PMD_HUGE_SPLITTING;
-       return pmd;
+       pte_t pte = __pte(pmd_val(pmd));
+
+       pte = pte_mkspecial(pte);
+
+       return __pmd(pte_val(pte));
 }
 
-extern pgprot_t pmd_pgprot(pmd_t entry);
+static inline pgprot_t pmd_pgprot(pmd_t entry)
+{
+       unsigned long val = pmd_val(entry);
+
+       return __pgprot(val);
+}
 #endif
 
 static inline int pmd_present(pmd_t pmd)
 {
-       return pmd_val(pmd) != 0U;
+       return pmd_val(pmd) != 0UL;
 }
 
 #define pmd_none(pmd)                  (!pmd_val(pmd))
@@ -728,33 +770,32 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 
 static inline void pmd_set(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
 {
-       unsigned long val = __pa((unsigned long) (ptep)) >> PMD_PADDR_SHIFT;
+       unsigned long val = __pa((unsigned long) (ptep));
 
        pmd_val(*pmdp) = val;
 }
 
 #define pud_set(pudp, pmdp)    \
-       (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> PGD_PADDR_SHIFT))
+       (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp))))
 static inline unsigned long __pmd_page(pmd_t pmd)
 {
-       unsigned long paddr = (unsigned long) pmd_val(pmd);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       if (pmd_val(pmd) & PMD_ISHUGE)
-               paddr &= PMD_HUGE_PADDR;
-#endif
-       paddr <<= PMD_PADDR_SHIFT;
-       return ((unsigned long) __va(paddr));
+       pte_t pte = __pte(pmd_val(pmd));
+       unsigned long pfn;
+
+       pfn = pte_pfn(pte);
+
+       return ((unsigned long) __va(pfn << PAGE_SHIFT));
 }
 #define pmd_page(pmd)                  virt_to_page((void *)__pmd_page(pmd))
 #define pud_page_vaddr(pud)            \
-       ((unsigned long) __va((((unsigned long)pud_val(pud))<<PGD_PADDR_SHIFT)))
+       ((unsigned long) __va(pud_val(pud)))
 #define pud_page(pud)                  virt_to_page((void *)pud_page_vaddr(pud))
 #define pmd_bad(pmd)                   (0)
-#define pmd_clear(pmdp)                        (pmd_val(*(pmdp)) = 0U)
+#define pmd_clear(pmdp)                        (pmd_val(*(pmdp)) = 0UL)
 #define pud_none(pud)                  (!pud_val(pud))
 #define pud_bad(pud)                   (0)
 #define pud_present(pud)               (pud_val(pud) != 0U)
-#define pud_clear(pudp)                        (pud_val(*(pudp)) = 0U)
+#define pud_clear(pudp)                        (pud_val(*(pudp)) = 0UL)
 
 /* Same in both SUN4V and SUN4U.  */
 #define pte_none(pte)                  (!pte_val(pte))
@@ -789,7 +830,7 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
                                       pmd_t *pmdp)
 {
        pmd_t pmd = *pmdp;
-       set_pmd_at(mm, addr, pmdp, __pmd(0U));
+       set_pmd_at(mm, addr, pmdp, __pmd(0UL));
        return pmd;
 }
 
@@ -837,8 +878,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 })
 #endif
 
-extern pgd_t swapper_pg_dir[2048];
-extern pmd_t swapper_low_pmd_dir[2048];
+extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
+extern pmd_t swapper_low_pmd_dir[PTRS_PER_PMD];
 
 extern void paging_init(void);
 extern unsigned long find_ecache_flush_span(unsigned long size);
index b99d4e4..e5e1752 100644 (file)
@@ -3,9 +3,11 @@
 
 #ifdef __KERNEL__
 
+#include <asm/page.h>
+
 #define SECTION_SIZE_BITS       30
-#define MAX_PHYSADDR_BITS       42
-#define MAX_PHYSMEM_BITS        42
+#define MAX_PHYSADDR_BITS       MAX_PHYS_ADDRESS_BITS
+#define MAX_PHYSMEM_BITS        MAX_PHYS_ADDRESS_BITS
 
 #endif /* !(__KERNEL__) */
 
index d5e5042..5d9292a 100644 (file)
@@ -192,7 +192,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
 #define TIF_UNALIGNED          5       /* allowed to do unaligned accesses */
 /* flag bit 6 is available */
 #define TIF_32BIT              7       /* 32-bit binary */
-/* flag bit 8 is available */
+#define TIF_NOHZ               8       /* in adaptive nohz mode */
 #define TIF_SECCOMP            9       /* secure computing */
 #define TIF_SYSCALL_AUDIT      10      /* syscall auditing active */
 #define TIF_SYSCALL_TRACEPOINT 11      /* syscall tracepoint instrumentation */
@@ -210,6 +210,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
 #define _TIF_NEED_RESCHED      (1<<TIF_NEED_RESCHED)
 #define _TIF_UNALIGNED         (1<<TIF_UNALIGNED)
 #define _TIF_32BIT             (1<<TIF_32BIT)
+#define _TIF_NOHZ              (1<<TIF_NOHZ)
 #define _TIF_SECCOMP           (1<<TIF_SECCOMP)
 #define _TIF_SYSCALL_AUDIT     (1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
index e696432..2230f80 100644 (file)
@@ -142,98 +142,39 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
        or              REG1, %lo(swapper_pg_dir), REG1; \
        sllx            VADDR, 64 - (PGDIR_SHIFT + PGDIR_BITS), REG2; \
        srlx            REG2, 64 - PAGE_SHIFT, REG2; \
-       andn            REG2, 0x3, REG2; \
-       lduw            [REG1 + REG2], REG1; \
+       andn            REG2, 0x7, REG2; \
+       ldx             [REG1 + REG2], REG1; \
        brz,pn          REG1, FAIL_LABEL; \
         sllx           VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
        srlx            REG2, 64 - PAGE_SHIFT, REG2; \
-       sllx            REG1, PGD_PADDR_SHIFT, REG1; \
-       andn            REG2, 0x3, REG2; \
-       lduwa           [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+       andn            REG2, 0x7, REG2; \
+       ldxa            [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
        brz,pn          REG1, FAIL_LABEL; \
         sllx           VADDR, 64 - PMD_SHIFT, REG2; \
-       srlx            REG2, 64 - (PAGE_SHIFT - 1), REG2; \
-       sllx            REG1, PMD_PADDR_SHIFT, REG1; \
+       srlx            REG2, 64 - PAGE_SHIFT, REG2; \
        andn            REG2, 0x7, REG2; \
        add             REG1, REG2, REG1;
 
-       /* These macros exists only to make the PMD translator below
-        * easier to read.  It hides the ELF section switch for the
-        * sun4v code patching.
-        */
-#define OR_PTE_BIT_1INSN(REG, NAME)                    \
-661:   or              REG, _PAGE_##NAME##_4U, REG;    \
-       .section        .sun4v_1insn_patch, "ax";       \
-       .word           661b;                           \
-       or              REG, _PAGE_##NAME##_4V, REG;    \
-       .previous;
-
-#define OR_PTE_BIT_2INSN(REG, TMP, NAME)               \
-661:   sethi           %hi(_PAGE_##NAME##_4U), TMP;    \
-       or              REG, TMP, REG;                  \
-       .section        .sun4v_2insn_patch, "ax";       \
-       .word           661b;                           \
-       mov             -1, TMP;                        \
-       or              REG, _PAGE_##NAME##_4V, REG;    \
-       .previous;
-
-       /* Load into REG the PTE value for VALID, CACHE, and SZHUGE.  */
-#define BUILD_PTE_VALID_SZHUGE_CACHE(REG)                                 \
-661:   sethi           %uhi(_PAGE_VALID|_PAGE_SZHUGE_4U), REG;            \
-       .section        .sun4v_1insn_patch, "ax";                          \
-       .word           661b;                                              \
-       sethi           %uhi(_PAGE_VALID), REG;                            \
-       .previous;                                                         \
-       sllx            REG, 32, REG;                                      \
-661:   or              REG, _PAGE_CP_4U|_PAGE_CV_4U, REG;                 \
-       .section        .sun4v_1insn_patch, "ax";                          \
-       .word           661b;                                              \
-       or              REG, _PAGE_CP_4V|_PAGE_CV_4V|_PAGE_SZHUGE_4V, REG; \
-       .previous;
-
        /* PMD has been loaded into REG1, interpret the value, seeing
         * if it is a HUGE PMD or a normal one.  If it is not valid
         * then jump to FAIL_LABEL.  If it is a HUGE PMD, and it
         * translates to a valid PTE, branch to PTE_LABEL.
         *
-        * We translate the PMD by hand, one bit at a time,
-        * constructing the huge PTE.
-        *
-        * So we construct the PTE in REG2 as follows:
-        *
-        * 1) Extract the PMD PFN from REG1 and place it into REG2.
-        *
-        * 2) Translate PMD protection bits in REG1 into REG2, one bit
-        *    at a time using andcc tests on REG1 and OR's into REG2.
-        *
-        *    Only two bits to be concerned with here, EXEC and WRITE.
-        *    Now REG1 is freed up and we can use it as a temporary.
-        *
-        * 3) Construct the VALID, CACHE, and page size PTE bits in
-        *    REG1, OR with REG2 to form final PTE.
+        * We have to propagate the 4MB bit of the virtual address
+        * because we are fabricating 8MB pages using 4MB hw pages.
         */
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
-       brz,pn          REG1, FAIL_LABEL;                                     \
-        andcc          REG1, PMD_ISHUGE, %g0;                                \
-       be,pt           %xcc, 700f;                                           \
-        and            REG1, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED, REG2;       \
-       cmp             REG2, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED;             \
-       bne,pn          %xcc, FAIL_LABEL;                                     \
-        andn           REG1, PMD_HUGE_PROTBITS, REG2;                        \
-       sllx            REG2, PMD_PADDR_SHIFT, REG2;                          \
-       /* REG2 now holds PFN << PAGE_SHIFT */                                \
-       andcc           REG1, PMD_HUGE_WRITE, %g0;                            \
-       bne,a,pt        %xcc, 1f;                                             \
-        OR_PTE_BIT_1INSN(REG2, W);                                           \
-1:     andcc           REG1, PMD_HUGE_EXEC, %g0;                             \
-       be,pt           %xcc, 1f;                                             \
-        nop;                                                                 \
-       OR_PTE_BIT_2INSN(REG2, REG1, EXEC);                                   \
-       /* REG1 can now be clobbered, build final PTE */                      \
-1:     BUILD_PTE_VALID_SZHUGE_CACHE(REG1);                                   \
-       ba,pt           %xcc, PTE_LABEL;                                      \
-        or             REG1, REG2, REG1;                                     \
+       brz,pn          REG1, FAIL_LABEL;               \
+        sethi          %uhi(_PAGE_PMD_HUGE), REG2;     \
+       sllx            REG2, 32, REG2;                 \
+       andcc           REG1, REG2, %g0;                \
+       be,pt           %xcc, 700f;                     \
+        sethi          %hi(4 * 1024 * 1024), REG2;     \
+       andn            REG1, REG2, REG1;               \
+       and             VADDR, REG2, REG2;              \
+       brlz,pt         REG1, PTE_LABEL;                \
+        or             REG1, REG2, REG1;               \
 700:
 #else
 #define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
@@ -253,18 +194,16 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
 #define USER_PGTABLE_WALK_TL1(VADDR, PHYS_PGD, REG1, REG2, FAIL_LABEL) \
        sllx            VADDR, 64 - (PGDIR_SHIFT + PGDIR_BITS), REG2; \
        srlx            REG2, 64 - PAGE_SHIFT, REG2; \
-       andn            REG2, 0x3, REG2; \
-       lduwa           [PHYS_PGD + REG2] ASI_PHYS_USE_EC, REG1; \
+       andn            REG2, 0x7, REG2; \
+       ldxa            [PHYS_PGD + REG2] ASI_PHYS_USE_EC, REG1; \
        brz,pn          REG1, FAIL_LABEL; \
         sllx           VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
        srlx            REG2, 64 - PAGE_SHIFT, REG2; \
-       sllx            REG1, PGD_PADDR_SHIFT, REG1; \
-       andn            REG2, 0x3, REG2; \
-       lduwa           [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+       andn            REG2, 0x7, REG2; \
+       ldxa            [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
        USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, 800f) \
        sllx            VADDR, 64 - PMD_SHIFT, REG2; \
-       srlx            REG2, 64 - (PAGE_SHIFT - 1), REG2; \
-       sllx            REG1, PMD_PADDR_SHIFT, REG1; \
+       srlx            REG2, 64 - PAGE_SHIFT, REG2; \
        andn            REG2, 0x7, REG2; \
        add             REG1, REG2, REG1; \
        ldxa            [REG1] ASI_PHYS_USE_EC, REG1; \
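
The rewritten USER_PGTABLE_CHECK_PMD_HUGE above shrinks from a bit-by-bit
protection translation to a handful of instructions because huge PMDs are now
stored in valid PTE format. The only work left is steering the 4MB bit of the
virtual address into the PTE, since an 8MB software huge page is backed by two
4MB hardware TLB entries. A C sketch of the same logic (function name
hypothetical; the validity and _PAGE_PMD_HUGE checks done by the asm are
elided):

	static unsigned long huge_pmd_to_pte(unsigned long pmd, unsigned long vaddr)
	{
		const unsigned long fourmb = 4UL * 1024 * 1024;

		pmd &= ~fourmb;			/* drop any stale half-select bit */
		pmd |= vaddr & fourmb;		/* pick the matching 4MB half */
		return pmd;			/* already PTE-formatted */
	}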
index 9c179fb..140966f 100644 (file)
@@ -88,7 +88,6 @@ extern asmlinkage void syscall_trace_leave(struct pt_regs *regs);
 
 extern void bad_trap_tl1(struct pt_regs *regs, long lvl);
 
-extern void do_fpe_common(struct pt_regs *regs);
 extern void do_fpieee(struct pt_regs *regs);
 extern void do_fpother(struct pt_regs *regs);
 extern void do_tof(struct pt_regs *regs);
index 53c0a82..60b19f5 100644 (file)
@@ -159,11 +159,12 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
 
 asmlinkage void kgdb_trap(unsigned long trap_level, struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
        unsigned long flags;
 
        if (user_mode(regs)) {
                bad_trap(regs, trap_level);
-               return;
+               goto out;
        }
 
        flushw_all();
@@ -171,6 +172,8 @@ asmlinkage void kgdb_trap(unsigned long trap_level, struct pt_regs *regs)
        local_irq_save(flags);
        kgdb_handle_exception(0x172, SIGTRAP, 0, regs);
        local_irq_restore(flags);
+out:
+       exception_exit(prev_state);
 }
 
 int kgdb_arch_init(void)
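
kgdb_trap() is the first of many sparc64 handlers in this pull converted to
the same context-tracking discipline: enter with exception_enter(), turn every
early return into a goto to a single exception_exit() at the end. A minimal
sketch of the shape (hypothetical handler; exception_enter()/exception_exit()
are the real <linux/context_tracking.h> API):

	asmlinkage void example_trap(struct pt_regs *regs)
	{
		enum ctx_state prev_state = exception_enter();

		if (user_mode(regs))
			goto out;	/* early returns become gotos */

		/* ... handle the trap ... */
	out:
		exception_exit(prev_state);
	}

The saved prev_state matters because traps can nest: a fault taken while
already in kernel context must not flip the accounting back to user mode on
exit.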
index e722121..5a09fd3 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/slab.h>
+#include <linux/context_tracking.h>
 #include <asm/signal.h>
 #include <asm/cacheflush.h>
 #include <asm/uaccess.h>
@@ -418,12 +419,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 asmlinkage void __kprobes kprobe_trap(unsigned long trap_level,
                                      struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
+
        BUG_ON(trap_level != 0x170 && trap_level != 0x171);
 
        if (user_mode(regs)) {
                local_irq_enable();
                bad_trap(regs, trap_level);
-               return;
+               goto out;
        }
 
        /* trap_level == 0x170 --> ta 0x70
@@ -433,6 +436,8 @@ asmlinkage void __kprobes kprobe_trap(unsigned long trap_level,
                       (trap_level == 0x170) ? "debug" : "debug_2",
                       regs, 0, trap_level, SIGTRAP) != NOTIFY_STOP)
                bad_trap(regs, trap_level);
+out:
+       exception_exit(prev_state);
 }
 
 /* Jprobes support.  */
index fde5a41..542e96a 100644 (file)
@@ -153,12 +153,19 @@ kvmap_dtlb_tsb4m_miss:
        /* Clear the PAGE_OFFSET top virtual bits, shift
         * down to get PFN, and make sure PFN is in range.
         */
-       sllx            %g4, 21, %g5
+661:   sllx            %g4, 0, %g5
+       .section        .page_offset_shift_patch, "ax"
+       .word           661b
+       .previous
 
        /* Check to see if we know about valid memory at the 4MB
         * chunk this physical address will reside within.
         */
-       srlx            %g5, 21 + 41, %g2
+661:   srlx            %g5, MAX_PHYS_ADDRESS_BITS, %g2
+       .section        .page_offset_shift_patch, "ax"
+       .word           661b
+       .previous
+
        brnz,pn         %g2, kvmap_dtlb_longpath
         nop
 
@@ -176,7 +183,11 @@ valid_addr_bitmap_patch:
        or              %g7, %lo(sparc64_valid_addr_bitmap), %g7
        .previous
 
-       srlx            %g5, 21 + 22, %g2
+661:   srlx            %g5, ILOG2_4MB, %g2
+       .section        .page_offset_shift_patch, "ax"
+       .word           661b
+       .previous
+
        srlx            %g2, 6, %g5
        and             %g2, 63, %g2
        sllx            %g5, 3, %g5
@@ -189,9 +200,18 @@ valid_addr_bitmap_patch:
 2:      sethi          %hi(kpte_linear_bitmap), %g2
 
        /* Get the 256MB physical address index. */
-       sllx            %g4, 21, %g5
+661:   sllx            %g4, 0, %g5
+       .section        .page_offset_shift_patch, "ax"
+       .word           661b
+       .previous
+
        or              %g2, %lo(kpte_linear_bitmap), %g2
-       srlx            %g5, 21 + 28, %g5
+
+661:   srlx            %g5, ILOG2_256MB, %g5
+       .section        .page_offset_shift_patch, "ax"
+       .word           661b
+       .previous
+
        and             %g5, (32 - 1), %g7
 
        /* Divide by 32 to get the offset into the bitmask.  */
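
Each 661: label paired with a .word 661b directive records the address of the
tagged instruction, one 32-bit word per site, in the new
.page_offset_shift_patch section, so boot code can rewrite the shift counts
once the machine's physical address width is known. A sketch of consuming
those records, mirroring the loop added to init_64.c further down (the 32-bit
record works because sparc64 kernel text sits in the low 4GB):

	extern unsigned int __page_offset_shift_patch;
	extern unsigned int __page_offset_shift_patch_end;

	static void __init walk_shift_patch_sites(void (*fixup)(unsigned int *insn))
	{
		unsigned int *p = &__page_offset_shift_patch;

		while (p < &__page_offset_shift_patch_end)
			fixup((unsigned int *)(unsigned long)*p++);
	}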
index bc4d3f5..cb02145 100644 (file)
@@ -398,8 +398,8 @@ static void apb_fake_ranges(struct pci_dev *dev,
        apb_calc_first_last(map, &first, &last);
        res = bus->resource[1];
        res->flags = IORESOURCE_MEM;
-       region.start = (first << 21);
-       region.end = (last << 21) + ((1 << 21) - 1);
+       region.start = (first << 29);
+       region.end = (last << 29) + ((1 << 29) - 1);
        pcibios_bus_to_resource(dev, res, &region);
 }
 
index baebab2..32a280e 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/elfcore.h>
 #include <linux/sysrq.h>
 #include <linux/nmi.h>
+#include <linux/context_tracking.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -557,6 +558,7 @@ void fault_in_user_windows(void)
 
 barf:
        set_thread_wsaved(window + 1);
+       user_exit();
        do_exit(SIGILL);
 }
 
index 773c1f2..c13c9f2 100644 (file)
@@ -27,6 +27,7 @@
 #include <trace/syscall.h>
 #include <linux/compat.h>
 #include <linux/elf.h>
+#include <linux/context_tracking.h>
 
 #include <asm/asi.h>
 #include <asm/pgtable.h>
@@ -1066,6 +1067,9 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
        /* do the secure computing check first */
        secure_computing_strict(regs->u_regs[UREG_G1]);
 
+       if (test_thread_flag(TIF_NOHZ))
+               user_exit();
+
        if (test_thread_flag(TIF_SYSCALL_TRACE))
                ret = tracehook_report_syscall_entry(regs);
 
@@ -1086,6 +1090,9 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
+       if (test_thread_flag(TIF_NOHZ))
+               user_exit();
+
        audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
@@ -1093,4 +1100,7 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 
        if (test_thread_flag(TIF_SYSCALL_TRACE))
                tracehook_report_syscall_exit(regs, 0);
+
+       if (test_thread_flag(TIF_NOHZ))
+               user_enter();
 }
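
On the syscall side, user_exit() marks the transition out of userspace on
entry and user_enter() marks the return on exit; both are gated on TIF_NOHZ,
which is presumably set on tasks only when context tracking is compiled in and
active, so the hooks cost nothing otherwise. The round trip, as a sketch
(hook names hypothetical):

	static inline void on_syscall_entry(void)
	{
		if (test_thread_flag(TIF_NOHZ))
			user_exit();	/* CPU back in the kernel: RCU must watch */
	}

	static inline void on_syscall_exit(void)
	{
		if (test_thread_flag(TIF_NOHZ))
			user_enter();	/* returning: CPU may idle RCU in user mode */
	}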
index afa2a9e..a954eb8 100644 (file)
 #define                RTRAP_PSTATE_IRQOFF     (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV)
 #define                RTRAP_PSTATE_AG_IRQOFF  (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG)
 
+#ifdef CONFIG_CONTEXT_TRACKING
+# define SCHEDULE_USER schedule_user
+#else
+# define SCHEDULE_USER schedule
+#endif
+
                .text
                .align                  32
 __handle_preemption:
-               call                    schedule
+               call                    SCHEDULE_USER
                 wrpr                   %g0, RTRAP_PSTATE, %pstate
                ba,pt                   %xcc, __handle_preemption_continue
                 wrpr                   %g0, RTRAP_PSTATE_IRQOFF, %pstate
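
SCHEDULE_USER lets the return-to-user assembly pick the context-tracking-aware
scheduler entry point at build time. Roughly what the generic schedule_user()
does (a sketch; the authoritative body lives in kernel/sched/core.c):

	asmlinkage void schedule_user(void)
	{
		user_exit();	/* we were accounted as in-user when preempted */
		schedule();
		user_enter();	/* resume user-mode accounting */
	}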
index 35923e8..cd91d01 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/bitops.h>
+#include <linux/context_tracking.h>
 
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
@@ -43,6 +44,7 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs)
 {
        struct ucontext __user *ucp = (struct ucontext __user *)
                regs->u_regs[UREG_I0];
+       enum ctx_state prev_state = exception_enter();
        mc_gregset_t __user *grp;
        unsigned long pc, npc, tstate;
        unsigned long fp, i7;
@@ -129,16 +131,19 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs)
        }
        if (err)
                goto do_sigsegv;
-
+out:
+       exception_exit(prev_state);
        return;
 do_sigsegv:
        force_sig(SIGSEGV, current);
+       goto out;
 }
 
 asmlinkage void sparc64_get_context(struct pt_regs *regs)
 {
        struct ucontext __user *ucp = (struct ucontext __user *)
                regs->u_regs[UREG_I0];
+       enum ctx_state prev_state = exception_enter();
        mc_gregset_t __user *grp;
        mcontext_t __user *mcp;
        unsigned long fp, i7;
@@ -220,10 +225,12 @@ asmlinkage void sparc64_get_context(struct pt_regs *regs)
        }
        if (err)
                goto do_sigsegv;
-
+out:
+       exception_exit(prev_state);
        return;
 do_sigsegv:
        force_sig(SIGSEGV, current);
+       goto out;
 }
 
 struct rt_signal_frame {
@@ -528,11 +535,13 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 
 void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags)
 {
+       user_exit();
        if (thread_info_flags & _TIF_SIGPENDING)
                do_signal(regs, orig_i0);
        if (thread_info_flags & _TIF_NOTIFY_RESUME) {
                clear_thread_flag(TIF_NOTIFY_RESUME);
                tracehook_notify_resume(regs);
        }
+       user_enter();
 }
 
index e142545..b66a533 100644 (file)
@@ -1399,8 +1399,13 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
 void smp_send_reschedule(int cpu)
 {
-       xcall_deliver((u64) &xcall_receive_signal, 0, 0,
-                     cpumask_of(cpu));
+       if (cpu == smp_processor_id()) {
+               WARN_ON_ONCE(preemptible());
+               set_softint(1 << PIL_SMP_RECEIVE_SIGNAL);
+       } else {
+               xcall_deliver((u64) &xcall_receive_signal,
+                             0, 0, cpumask_of(cpu));
+       }
 }
 
 void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
index bde867f..e0c09bf 100644 (file)
@@ -182,7 +182,7 @@ sun4v_tsb_miss_common:
        cmp     %g5, -1
        be,pt   %xcc, 80f
         nop
-       COMPUTE_TSB_PTR(%g5, %g4, HPAGE_SHIFT, %g2, %g7)
+       COMPUTE_TSB_PTR(%g5, %g4, REAL_HPAGE_SHIFT, %g2, %g7)
 
        /* That clobbered %g2, reload it.  */
        ldxa    [%g0] ASI_SCRATCHPAD, %g2
index 51561b8..beb0b5a 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/personality.h>
 #include <linux/random.h>
 #include <linux/export.h>
+#include <linux/context_tracking.h>
 
 #include <asm/uaccess.h>
 #include <asm/utrap.h>
@@ -39,9 +40,6 @@ asmlinkage unsigned long sys_getpagesize(void)
        return PAGE_SIZE;
 }
 
-#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL))
-#define VA_EXCLUDE_END   (0xfffff80000000000UL + (1UL << 32UL))
-
 /* Does addr --> addr+len fall within 4GB of the VA-space hole or
  * overflow past the end of the 64-bit address space?
  */
@@ -499,6 +497,7 @@ asmlinkage unsigned long c_sys_nis_syscall(struct pt_regs *regs)
 
 asmlinkage void sparc_breakpoint(struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
        siginfo_t info;
 
        if (test_thread_flag(TIF_32BIT)) {
@@ -517,6 +516,7 @@ asmlinkage void sparc_breakpoint(struct pt_regs *regs)
 #ifdef DEBUG_SPARC_BREAKPOINT
        printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc);
 #endif
+       exception_exit(prev_state);
 }
 
 extern void check_pending(int signum);
index d950197..87729ff 100644 (file)
@@ -52,7 +52,7 @@ sys32_rt_sigreturn:
 #endif
        .align  32
 1:     ldx     [%g6 + TI_FLAGS], %l5
-       andcc   %l5, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0
+       andcc   %l5, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT|_TIF_NOHZ), %g0
        be,pt   %icc, rtrap
         nop
        call    syscall_trace_leave
@@ -184,7 +184,7 @@ linux_sparc_syscall32:
 
        srl     %i3, 0, %o3                             ! IEU0
        srl     %i2, 0, %o2                             ! IEU0  Group
-       andcc   %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0
+       andcc   %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT|_TIF_NOHZ), %g0
        bne,pn  %icc, linux_syscall_trace32             ! CTI
         mov    %i0, %l5                                ! IEU1
 5:     call    %l7                                     ! CTI   Group brk forced
@@ -207,7 +207,7 @@ linux_sparc_syscall:
 
        mov     %i3, %o3                                ! IEU1
        mov     %i4, %o4                                ! IEU0  Group
-       andcc   %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0
+       andcc   %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT|_TIF_NOHZ), %g0
        bne,pn  %icc, linux_syscall_trace               ! CTI   Group
         mov    %i0, %l5                                ! IEU0
 2:     call    %l7                                     ! CTI   Group brk forced
@@ -223,7 +223,7 @@ ret_sys_call:
 
        cmp     %o0, -ERESTART_RESTARTBLOCK
        bgeu,pn %xcc, 1f
-        andcc  %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0
+        andcc  %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT|_TIF_NOHZ), %g0
        ldx     [%sp + PTREGS_OFF + PT_V9_TNPC], %l1 ! pc = npc
 
 2:
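
The only change in these trap-table hunks is _TIF_NOHZ joining each tested
mask. When context tracking is active, tasks carry TIF_NOHZ, so the test
forces the slow path through syscall_trace_enter()/syscall_trace_leave(),
where the user_exit()/user_enter() hooks above get a chance to run. The
equivalent C-level predicate (sketch, helper name hypothetical):

	static inline bool syscall_wants_slow_path(unsigned long ti_flags)
	{
		return ti_flags & (_TIF_SYSCALL_TRACE | _TIF_SECCOMP |
				   _TIF_SYSCALL_AUDIT | _TIF_SYSCALL_TRACEPOINT |
				   _TIF_NOHZ);
	}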
index b3f833a..4ced92f 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/ftrace.h>
 #include <linux/reboot.h>
 #include <linux/gfp.h>
+#include <linux/context_tracking.h>
 
 #include <asm/smp.h>
 #include <asm/delay.h>
@@ -186,11 +187,12 @@ EXPORT_SYMBOL_GPL(unregister_dimm_printer);
 
 void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
 {
+       enum ctx_state prev_state = exception_enter();
        siginfo_t info;
 
        if (notify_die(DIE_TRAP, "instruction access exception", regs,
                       0, 0x8, SIGTRAP) == NOTIFY_STOP)
-               return;
+               goto out;
 
        if (regs->tstate & TSTATE_PRIV) {
                printk("spitfire_insn_access_exception: SFSR[%016lx] "
@@ -207,6 +209,8 @@ void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, un
        info.si_addr = (void __user *)regs->tpc;
        info.si_trapno = 0;
        force_sig_info(SIGSEGV, &info, current);
+out:
+       exception_exit(prev_state);
 }
 
 void spitfire_insn_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
@@ -260,11 +264,12 @@ void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, u
 
 void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
 {
+       enum ctx_state prev_state = exception_enter();
        siginfo_t info;
 
        if (notify_die(DIE_TRAP, "data access exception", regs,
                       0, 0x30, SIGTRAP) == NOTIFY_STOP)
-               return;
+               goto out;
 
        if (regs->tstate & TSTATE_PRIV) {
                /* Test if this comes from uaccess places. */
@@ -280,7 +285,7 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un
 #endif
                        regs->tpc = entry->fixup;
                        regs->tnpc = regs->tpc + 4;
-                       return;
+                       goto out;
                }
                /* Shit... */
                printk("spitfire_data_access_exception: SFSR[%016lx] "
@@ -294,6 +299,8 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un
        info.si_addr = (void __user *)sfar;
        info.si_trapno = 0;
        force_sig_info(SIGSEGV, &info, current);
+out:
+       exception_exit(prev_state);
 }
 
 void spitfire_data_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
@@ -1994,6 +2001,7 @@ static void sun4v_log_error(struct pt_regs *regs, struct sun4v_error_entry *ent,
  */
 void sun4v_resum_error(struct pt_regs *regs, unsigned long offset)
 {
+       enum ctx_state prev_state = exception_enter();
        struct sun4v_error_entry *ent, local_copy;
        struct trap_per_cpu *tb;
        unsigned long paddr;
@@ -2022,12 +2030,14 @@ void sun4v_resum_error(struct pt_regs *regs, unsigned long offset)
                pr_info("Shutdown request, %u seconds...\n",
                        local_copy.err_secs);
                orderly_poweroff(true);
-               return;
+               goto out;
        }
 
        sun4v_log_error(regs, &local_copy, cpu,
                        KERN_ERR "RESUMABLE ERROR",
                        &sun4v_resum_oflow_cnt);
+out:
+       exception_exit(prev_state);
 }
 
 /* If we try to printk() we'll probably make matters worse, by trying
@@ -2152,7 +2162,7 @@ void hypervisor_tlbop_error_xcall(unsigned long err, unsigned long op)
               err, op);
 }
 
-void do_fpe_common(struct pt_regs *regs)
+static void do_fpe_common(struct pt_regs *regs)
 {
        if (regs->tstate & TSTATE_PRIV) {
                regs->tpc = regs->tnpc;
@@ -2188,23 +2198,28 @@ void do_fpe_common(struct pt_regs *regs)
 
 void do_fpieee(struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
+
        if (notify_die(DIE_TRAP, "fpu exception ieee", regs,
                       0, 0x24, SIGFPE) == NOTIFY_STOP)
-               return;
+               goto out;
 
        do_fpe_common(regs);
+out:
+       exception_exit(prev_state);
 }
 
 extern int do_mathemu(struct pt_regs *, struct fpustate *, bool);
 
 void do_fpother(struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
        struct fpustate *f = FPUSTATE;
        int ret = 0;
 
        if (notify_die(DIE_TRAP, "fpu exception other", regs,
                       0, 0x25, SIGFPE) == NOTIFY_STOP)
-               return;
+               goto out;
 
        switch ((current_thread_info()->xfsr[0] & 0x1c000)) {
        case (2 << 14): /* unfinished_FPop */
@@ -2213,17 +2228,20 @@ void do_fpother(struct pt_regs *regs)
                break;
        }
        if (ret)
-               return;
+               goto out;
        do_fpe_common(regs);
+out:
+       exception_exit(prev_state);
 }
 
 void do_tof(struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
        siginfo_t info;
 
        if (notify_die(DIE_TRAP, "tagged arithmetic overflow", regs,
                       0, 0x26, SIGEMT) == NOTIFY_STOP)
-               return;
+               goto out;
 
        if (regs->tstate & TSTATE_PRIV)
                die_if_kernel("Penguin overflow trap from kernel mode", regs);
@@ -2237,15 +2255,18 @@ void do_tof(struct pt_regs *regs)
        info.si_addr = (void __user *)regs->tpc;
        info.si_trapno = 0;
        force_sig_info(SIGEMT, &info, current);
+out:
+       exception_exit(prev_state);
 }
 
 void do_div0(struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
        siginfo_t info;
 
        if (notify_die(DIE_TRAP, "integer division by zero", regs,
                       0, 0x28, SIGFPE) == NOTIFY_STOP)
-               return;
+               goto out;
 
        if (regs->tstate & TSTATE_PRIV)
                die_if_kernel("TL0: Kernel divide by zero.", regs);
@@ -2259,6 +2280,8 @@ void do_div0(struct pt_regs *regs)
        info.si_addr = (void __user *)regs->tpc;
        info.si_trapno = 0;
        force_sig_info(SIGFPE, &info, current);
+out:
+       exception_exit(prev_state);
 }
 
 static void instruction_dump(unsigned int *pc)
@@ -2415,6 +2438,7 @@ extern int handle_ldf_stq(u32 insn, struct pt_regs *regs);
 
 void do_illegal_instruction(struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
        unsigned long pc = regs->tpc;
        unsigned long tstate = regs->tstate;
        u32 insn;
@@ -2422,7 +2446,7 @@ void do_illegal_instruction(struct pt_regs *regs)
 
        if (notify_die(DIE_TRAP, "illegal instruction", regs,
                       0, 0x10, SIGILL) == NOTIFY_STOP)
-               return;
+               goto out;
 
        if (tstate & TSTATE_PRIV)
                die_if_kernel("Kernel illegal instruction", regs);
@@ -2431,14 +2455,14 @@ void do_illegal_instruction(struct pt_regs *regs)
        if (get_user(insn, (u32 __user *) pc) != -EFAULT) {
                if ((insn & 0xc1ffc000) == 0x81700000) /* POPC */ {
                        if (handle_popc(insn, regs))
-                               return;
+                               goto out;
                } else if ((insn & 0xc1580000) == 0xc1100000) /* LDQ/STQ */ {
                        if (handle_ldf_stq(insn, regs))
-                               return;
+                               goto out;
                } else if (tlb_type == hypervisor) {
                        if ((insn & VIS_OPCODE_MASK) == VIS_OPCODE_VAL) {
                                if (!vis_emul(regs, insn))
-                                       return;
+                                       goto out;
                        } else {
                                struct fpustate *f = FPUSTATE;
 
@@ -2448,7 +2472,7 @@ void do_illegal_instruction(struct pt_regs *regs)
                                 * Trap in the %fsr to unimplemented_FPop.
                                 */
                                if (do_mathemu(regs, f, true))
-                                       return;
+                                       goto out;
                        }
                }
        }
@@ -2458,21 +2482,24 @@ void do_illegal_instruction(struct pt_regs *regs)
        info.si_addr = (void __user *)pc;
        info.si_trapno = 0;
        force_sig_info(SIGILL, &info, current);
+out:
+       exception_exit(prev_state);
 }
 
 extern void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn);
 
 void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr)
 {
+       enum ctx_state prev_state = exception_enter();
        siginfo_t info;
 
        if (notify_die(DIE_TRAP, "memory address unaligned", regs,
                       0, 0x34, SIGSEGV) == NOTIFY_STOP)
-               return;
+               goto out;
 
        if (regs->tstate & TSTATE_PRIV) {
                kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc));
-               return;
+               goto out;
        }
        info.si_signo = SIGBUS;
        info.si_errno = 0;
@@ -2480,6 +2507,8 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo
        info.si_addr = (void __user *)sfar;
        info.si_trapno = 0;
        force_sig_info(SIGBUS, &info, current);
+out:
+       exception_exit(prev_state);
 }
 
 void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
@@ -2504,11 +2533,12 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c
 
 void do_privop(struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
        siginfo_t info;
 
        if (notify_die(DIE_TRAP, "privileged operation", regs,
                       0, 0x11, SIGILL) == NOTIFY_STOP)
-               return;
+               goto out;
 
        if (test_thread_flag(TIF_32BIT)) {
                regs->tpc &= 0xffffffff;
@@ -2520,6 +2550,8 @@ void do_privop(struct pt_regs *regs)
        info.si_addr = (void __user *)regs->tpc;
        info.si_trapno = 0;
        force_sig_info(SIGILL, &info, current);
+out:
+       exception_exit(prev_state);
 }
 
 void do_privact(struct pt_regs *regs)
@@ -2530,99 +2562,116 @@ void do_privact(struct pt_regs *regs)
 /* Trap level 1 stuff or other traps we should never see... */
 void do_cee(struct pt_regs *regs)
 {
+       exception_enter();
        die_if_kernel("TL0: Cache Error Exception", regs);
 }
 
 void do_cee_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: Cache Error Exception", regs);
 }
 
 void do_dae_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: Data Access Exception", regs);
 }
 
 void do_iae_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: Instruction Access Exception", regs);
 }
 
 void do_div0_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: DIV0 Exception", regs);
 }
 
 void do_fpdis_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: FPU Disabled", regs);
 }
 
 void do_fpieee_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: FPU IEEE Exception", regs);
 }
 
 void do_fpother_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: FPU Other Exception", regs);
 }
 
 void do_ill_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: Illegal Instruction Exception", regs);
 }
 
 void do_irq_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: IRQ Exception", regs);
 }
 
 void do_lddfmna_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: LDDF Exception", regs);
 }
 
 void do_stdfmna_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: STDF Exception", regs);
 }
 
 void do_paw(struct pt_regs *regs)
 {
+       exception_enter();
        die_if_kernel("TL0: Phys Watchpoint Exception", regs);
 }
 
 void do_paw_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: Phys Watchpoint Exception", regs);
 }
 
 void do_vaw(struct pt_regs *regs)
 {
+       exception_enter();
        die_if_kernel("TL0: Virt Watchpoint Exception", regs);
 }
 
 void do_vaw_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: Virt Watchpoint Exception", regs);
 }
 
 void do_tof_tl1(struct pt_regs *regs)
 {
+       exception_enter();
        dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
        die_if_kernel("TL1: Tag Overflow Exception", regs);
 }
index a313e4a..14158d4 100644 (file)
@@ -75,7 +75,7 @@ tsb_miss_page_table_walk:
        mov             512, %g7
        andn            %g5, 0x7, %g5
        sllx            %g7, %g6, %g7
-       srlx            %g4, HPAGE_SHIFT, %g6
+       srlx            %g4, REAL_HPAGE_SHIFT, %g6
        sub             %g7, 1, %g7
        and             %g6, %g7, %g6
        sllx            %g6, 4, %g6
index 8201c25..3c1a7cb 100644 (file)
 #include <linux/bitops.h>
 #include <linux/perf_event.h>
 #include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
 #include <asm/fpumacro.h>
 #include <asm/cacheflush.h>
 
+#include "entry.h"
+
 enum direction {
        load,    /* ld, ldd, ldh, ldsh */
        store,   /* st, std, sth, stsh */
@@ -418,9 +421,6 @@ int handle_popc(u32 insn, struct pt_regs *regs)
 
 extern void do_fpother(struct pt_regs *regs);
 extern void do_privact(struct pt_regs *regs);
-extern void spitfire_data_access_exception(struct pt_regs *regs,
-                                          unsigned long sfsr,
-                                          unsigned long sfar);
 extern void sun4v_data_access_exception(struct pt_regs *regs,
                                        unsigned long addr,
                                        unsigned long type_ctx);
@@ -578,6 +578,7 @@ void handle_ld_nf(u32 insn, struct pt_regs *regs)
 
 void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr)
 {
+       enum ctx_state prev_state = exception_enter();
        unsigned long pc = regs->tpc;
        unsigned long tstate = regs->tstate;
        u32 insn;
@@ -632,13 +633,16 @@ daex:
                        sun4v_data_access_exception(regs, sfar, sfsr);
                else
                        spitfire_data_access_exception(regs, sfsr, sfar);
-               return;
+               goto out;
        }
        advance(regs);
+out:
+       exception_exit(prev_state);
 }
 
 void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr)
 {
+       enum ctx_state prev_state = exception_enter();
        unsigned long pc = regs->tpc;
        unsigned long tstate = regs->tstate;
        u32 insn;
@@ -680,7 +684,9 @@ daex:
                        sun4v_data_access_exception(regs, sfar, sfsr);
                else
                        spitfire_data_access_exception(regs, sfsr, sfar);
-               return;
+               goto out;
        }
        advance(regs);
+out:
+       exception_exit(prev_state);
 }
index 0bacceb..932ff90 100644 (file)
@@ -122,6 +122,11 @@ SECTIONS
                *(.swapper_4m_tsb_phys_patch)
                __swapper_4m_tsb_phys_patch_end = .;
        }
+       .page_offset_shift_patch : {
+               __page_offset_shift_patch = .;
+               *(.page_offset_shift_patch)
+               __page_offset_shift_patch_end = .;
+       }
        .popc_3insn_patch : {
                __popc_3insn_patch = .;
                *(.popc_3insn_patch)
index 77e531f..46272df 100644 (file)
@@ -37,10 +37,10 @@ _clear_page:                /* %o0=dest */
        .globl          clear_user_page
 clear_user_page:       /* %o0=dest, %o1=vaddr */
        lduw            [%g6 + TI_PRE_COUNT], %o2
-       sethi           %uhi(PAGE_OFFSET), %g2
+       sethi           %hi(PAGE_OFFSET), %g2
        sethi           %hi(PAGE_SIZE), %o4
 
-       sllx            %g2, 32, %g2
+       ldx             [%g2 + %lo(PAGE_OFFSET)], %g2
        sethi           %hi(PAGE_KERNEL_LOCKED), %g3
 
        ldx             [%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3
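
This and the following assembler hunks all make the same substitution:
PAGE_OFFSET used to be a constant that a sethi %uhi / sllx 32 pair could
materialize, but it is now a 64-bit kernel variable chosen at boot from the
CPU's physical address range (see the init_64.c hunks below), so each site
loads it with sethi %hi plus ldx. The C-side equivalent, as a sketch (helper
name hypothetical):

	extern unsigned long PAGE_OFFSET;	/* set in setup_page_offset() */

	static inline void *linear_va(unsigned long paddr)
	{
		return (void *)(paddr + PAGE_OFFSET);	/* __va()-style mapping */
	}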
index 4d2df32..dd16c61 100644 (file)
        .type           copy_user_page,#function
 copy_user_page:                /* %o0=dest, %o1=src, %o2=vaddr */
        lduw            [%g6 + TI_PRE_COUNT], %o4
-       sethi           %uhi(PAGE_OFFSET), %g2
+       sethi           %hi(PAGE_OFFSET), %g2
        sethi           %hi(PAGE_SIZE), %o3
 
-       sllx            %g2, 32, %g2
+       ldx             [%g2 + %lo(PAGE_OFFSET)], %g2
        sethi           %hi(PAGE_KERNEL_LOCKED), %g3
 
        ldx             [%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3
index 2ebec26..69bb818 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 #include <linux/percpu.h>
+#include <linux/context_tracking.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -272,6 +273,7 @@ static void noinline __kprobes bogus_32bit_fault_address(struct pt_regs *regs,
 
 asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 {
+       enum ctx_state prev_state = exception_enter();
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned int insn = 0;
@@ -282,7 +284,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
        fault_code = get_thread_fault_code();
 
        if (notify_page_fault(regs))
-               return;
+               goto exit_exception;
 
        si_code = SEGV_MAPERR;
        address = current_thread_info()->fault_address;
@@ -313,7 +315,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
                        /* Valid, no problems... */
                } else {
                        bad_kernel_pc(regs, address);
-                       return;
+                       goto exit_exception;
                }
        } else
                flags |= FAULT_FLAG_USER;
@@ -430,7 +432,7 @@ good_area:
        fault = handle_mm_fault(mm, vma, address, flags);
 
        if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
-               return;
+               goto exit_exception;
 
        if (unlikely(fault & VM_FAULT_ERROR)) {
                if (fault & VM_FAULT_OOM)
@@ -482,6 +484,8 @@ good_area:
 
        }
 #endif
+exit_exception:
+       exception_exit(prev_state);
        return;
 
        /*
@@ -494,7 +498,7 @@ bad_area:
 
 handle_kernel_fault:
        do_kernel_fault(regs, si_code, fault_code, insn, address);
-       return;
+       goto exit_exception;
 
 /*
  * We ran out of memory, or some other thing happened to us that made
@@ -505,7 +509,7 @@ out_of_memory:
        up_read(&mm->mmap_sem);
        if (!(regs->tstate & TSTATE_PRIV)) {
                pagefault_out_of_memory();
-               return;
+               goto exit_exception;
        }
        goto handle_kernel_fault;
 
index 01ee23d..c4d3da6 100644 (file)
@@ -71,13 +71,12 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                        int *nr)
 {
        struct page *head, *page, *tail;
-       u32 mask;
        int refs;
 
-       mask = PMD_HUGE_PRESENT;
-       if (write)
-               mask |= PMD_HUGE_WRITE;
-       if ((pmd_val(pmd) & mask) != mask)
+       if (!pmd_large(pmd))
+               return 0;
+
+       if (write && !pmd_write(pmd))
                return 0;
 
        refs = 0;
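
gup_huge_pmd() can drop the private PMD_HUGE_PRESENT/PMD_HUGE_WRITE mask test
because huge PMDs now carry ordinary PTE bits, letting the generic
pmd_large()/pmd_write() predicates apply directly. The tlb.c hunk below makes
the same point explicitly by value-punning a PMD into a PTE; as a sketch:

	/* any pte_*() predicate now works on a huge PMD */
	static inline bool huge_pmd_exec(pmd_t pmd)
	{
		return pte_exec(__pte(pmd_val(pmd)));
	}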
index 9639964..3096317 100644 (file)
@@ -21,8 +21,6 @@
 /* Slightly simplified from the non-hugepage variant because by
  * definition we don't have to worry about any page coloring stuff
  */
-#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL))
-#define VA_EXCLUDE_END   (0xfffff80000000000UL + (1UL << 32UL))
 
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
                                                        unsigned long addr,
index ed82eda..6b64379 100644 (file)
@@ -354,7 +354,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
        if (mm->context.huge_pte_count && is_hugetlb_pte(pte))
-               __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT,
+               __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
                                        address, pte_val(pte));
        else
 #endif
@@ -1557,6 +1557,96 @@ unsigned long __init find_ecache_flush_span(unsigned long size)
        return ~0UL;
 }
 
+unsigned long PAGE_OFFSET;
+EXPORT_SYMBOL(PAGE_OFFSET);
+
+static void __init page_offset_shift_patch_one(unsigned int *insn, unsigned long phys_bits)
+{
+       unsigned long final_shift;
+       unsigned int val = *insn;
+       unsigned int cnt;
+
+       /* We are patching in ilog2(max_supported_phys_address), and
+        * we are doing so in a manner similar to a relocation addend.
+        * That is, we are adding the shift value to whatever value
+        * is in the shift instruction count field already.
+        */
+       cnt = (val & 0x3f);
+       val &= ~0x3f;
+
+       /* If we are trying to shift >= 64 bits, clear the destination
+        * register.  This can happen when phys_bits ends up being equal
+        * to MAX_PHYS_ADDRESS_BITS.
+        */
+       final_shift = (cnt + (64 - phys_bits));
+       if (final_shift >= 64) {
+               unsigned int rd = (val >> 25) & 0x1f;
+
+               val = 0x80100000 | (rd << 25);
+       } else {
+               val |= final_shift;
+       }
+       *insn = val;
+
+       __asm__ __volatile__("flush     %0"
+                            : /* no outputs */
+                            : "r" (insn));
+}
+
+static void __init page_offset_shift_patch(unsigned long phys_bits)
+{
+       extern unsigned int __page_offset_shift_patch;
+       extern unsigned int __page_offset_shift_patch_end;
+       unsigned int *p;
+
+       p = &__page_offset_shift_patch;
+       while (p < &__page_offset_shift_patch_end) {
+               unsigned int *insn = (unsigned int *)(unsigned long)*p;
+
+               page_offset_shift_patch_one(insn, phys_bits);
+
+               p++;
+       }
+}
+
+static void __init setup_page_offset(void)
+{
+       unsigned long max_phys_bits = 40;
+
+       if (tlb_type == cheetah || tlb_type == cheetah_plus) {
+               max_phys_bits = 42;
+       } else if (tlb_type == hypervisor) {
+               switch (sun4v_chip_type) {
+               case SUN4V_CHIP_NIAGARA1:
+               case SUN4V_CHIP_NIAGARA2:
+                       max_phys_bits = 39;
+                       break;
+               case SUN4V_CHIP_NIAGARA3:
+                       max_phys_bits = 43;
+                       break;
+               case SUN4V_CHIP_NIAGARA4:
+               case SUN4V_CHIP_NIAGARA5:
+               case SUN4V_CHIP_SPARC64X:
+               default:
+                       max_phys_bits = 47;
+                       break;
+               }
+       }
+
+       if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) {
+               prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n",
+                           max_phys_bits);
+               prom_halt();
+       }
+
+       PAGE_OFFSET = PAGE_OFFSET_BY_BITS(max_phys_bits);
+
+       pr_info("PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n",
+               PAGE_OFFSET, max_phys_bits);
+
+       page_offset_shift_patch(max_phys_bits);
+}
+
 static void __init tsb_phys_patch(void)
 {
        struct tsb_ldquad_phys_patch_entry *pquad;
@@ -1722,7 +1812,7 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 #ifndef CONFIG_DEBUG_PAGEALLOC
        if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
                kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
-                       0xfffff80000000000UL;
+                       PAGE_OFFSET;
                kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
                                           _PAGE_P_4V | _PAGE_W_4V);
        } else {
@@ -1731,7 +1821,7 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 
        if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
                kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
-                       0xfffff80000000000UL;
+                       PAGE_OFFSET;
                kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V |
                                           _PAGE_P_4V | _PAGE_W_4V);
        } else {
@@ -1740,7 +1830,7 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 
        if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
                kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
-                       0xfffff80000000000UL;
+                       PAGE_OFFSET;
                kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V |
                                           _PAGE_P_4V | _PAGE_W_4V);
        } else {
@@ -1752,7 +1842,7 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 /* paging_init() sets up the page tables */
 
 static unsigned long last_valid_pfn;
-pgd_t swapper_pg_dir[2048];
+pgd_t swapper_pg_dir[PTRS_PER_PGD];
 
 static void sun4u_pgprot_init(void);
 static void sun4v_pgprot_init(void);
@@ -1763,6 +1853,8 @@ void __init paging_init(void)
        unsigned long real_end, i;
        int node;
 
+       setup_page_offset();
+
        /* These build time checks make sure that the dcache_dirty_cpu()
         * page->flags usage will work.
         *
@@ -2261,10 +2353,10 @@ static void __init sun4u_pgprot_init(void)
                     __ACCESS_BITS_4U | _PAGE_E_4U);
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-       kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL;
+       kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
 #else
        kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^
-               0xfffff80000000000UL;
+               PAGE_OFFSET;
 #endif
        kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
                                   _PAGE_P_4U | _PAGE_W_4U);
@@ -2308,10 +2400,10 @@ static void __init sun4v_pgprot_init(void)
        _PAGE_CACHE = _PAGE_CACHE_4V;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-       kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL;
+       kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
 #else
        kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
-               0xfffff80000000000UL;
+               PAGE_OFFSET;
 #endif
        kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
                                   _PAGE_P_4V | _PAGE_W_4V);
@@ -2455,53 +2547,13 @@ void __flush_tlb_all(void)
                             : : "r" (pstate));
 }
 
-static pte_t *get_from_cache(struct mm_struct *mm)
-{
-       struct page *page;
-       pte_t *ret;
-
-       spin_lock(&mm->page_table_lock);
-       page = mm->context.pgtable_page;
-       ret = NULL;
-       if (page) {
-               void *p = page_address(page);
-
-               mm->context.pgtable_page = NULL;
-
-               ret = (pte_t *) (p + (PAGE_SIZE / 2));
-       }
-       spin_unlock(&mm->page_table_lock);
-
-       return ret;
-}
-
-static struct page *__alloc_for_cache(struct mm_struct *mm)
-{
-       struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
-                                      __GFP_REPEAT | __GFP_ZERO);
-
-       if (page) {
-               spin_lock(&mm->page_table_lock);
-               if (!mm->context.pgtable_page) {
-                       atomic_set(&page->_count, 2);
-                       mm->context.pgtable_page = page;
-               }
-               spin_unlock(&mm->page_table_lock);
-       }
-       return page;
-}
-
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
                            unsigned long address)
 {
-       struct page *page;
-       pte_t *pte;
-
-       pte = get_from_cache(mm);
-       if (pte)
-               return pte;
+       struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
+                                      __GFP_REPEAT | __GFP_ZERO);
+       pte_t *pte = NULL;
 
-       page = __alloc_for_cache(mm);
        if (page)
                pte = (pte_t *) page_address(page);
 
@@ -2511,36 +2563,30 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 pgtable_t pte_alloc_one(struct mm_struct *mm,
                        unsigned long address)
 {
-       struct page *page;
-       pte_t *pte;
-
-       pte = get_from_cache(mm);
-       if (pte)
-               return pte;
+       struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
+                                      __GFP_REPEAT | __GFP_ZERO);
+       pte_t *pte = NULL;
 
-       page = __alloc_for_cache(mm);
-       if (page) {
-               pgtable_page_ctor(page);
-               pte = (pte_t *) page_address(page);
+       if (!page)
+               return NULL;
+       if (!pgtable_page_ctor(page)) {
+               free_hot_cold_page(page, 0);
+               return NULL;
        }
-
-       return pte;
+       return (pte_t *) page_address(page);
 }
 
 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-       struct page *page = virt_to_page(pte);
-       if (put_page_testzero(page))
-               free_hot_cold_page(page, 0);
+       free_page((unsigned long)pte);
 }
 
 static void __pte_free(pgtable_t pte)
 {
        struct page *page = virt_to_page(pte);
-       if (put_page_testzero(page)) {
-               pgtable_page_dtor(page);
-               free_hot_cold_page(page, 0);
-       }
+
+       pgtable_page_dtor(page);
+       __free_page(page);
 }
 
 void pte_free(struct mm_struct *mm, pgtable_t pte)
@@ -2557,124 +2603,27 @@ void pgtable_free(void *table, bool is_page)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify)
-{
-       if (pgprot_val(pgprot) & _PAGE_VALID)
-               pmd_val(pmd) |= PMD_HUGE_PRESENT;
-       if (tlb_type == hypervisor) {
-               if (pgprot_val(pgprot) & _PAGE_WRITE_4V)
-                       pmd_val(pmd) |= PMD_HUGE_WRITE;
-               if (pgprot_val(pgprot) & _PAGE_EXEC_4V)
-                       pmd_val(pmd) |= PMD_HUGE_EXEC;
-
-               if (!for_modify) {
-                       if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V)
-                               pmd_val(pmd) |= PMD_HUGE_ACCESSED;
-                       if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V)
-                               pmd_val(pmd) |= PMD_HUGE_DIRTY;
-               }
-       } else {
-               if (pgprot_val(pgprot) & _PAGE_WRITE_4U)
-                       pmd_val(pmd) |= PMD_HUGE_WRITE;
-               if (pgprot_val(pgprot) & _PAGE_EXEC_4U)
-                       pmd_val(pmd) |= PMD_HUGE_EXEC;
-
-               if (!for_modify) {
-                       if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U)
-                               pmd_val(pmd) |= PMD_HUGE_ACCESSED;
-                       if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U)
-                               pmd_val(pmd) |= PMD_HUGE_DIRTY;
-               }
-       }
-
-       return pmd;
-}
-
-pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
-{
-       pmd_t pmd;
-
-       pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT)));
-       pmd_val(pmd) |= PMD_ISHUGE;
-       pmd = pmd_set_protbits(pmd, pgprot, false);
-       return pmd;
-}
-
-pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
-       pmd_val(pmd) &= ~(PMD_HUGE_PRESENT |
-                         PMD_HUGE_WRITE |
-                         PMD_HUGE_EXEC);
-       pmd = pmd_set_protbits(pmd, newprot, true);
-       return pmd;
-}
-
-pgprot_t pmd_pgprot(pmd_t entry)
-{
-       unsigned long pte = 0;
-
-       if (pmd_val(entry) & PMD_HUGE_PRESENT)
-               pte |= _PAGE_VALID;
-
-       if (tlb_type == hypervisor) {
-               if (pmd_val(entry) & PMD_HUGE_PRESENT)
-                       pte |= _PAGE_PRESENT_4V;
-               if (pmd_val(entry) & PMD_HUGE_EXEC)
-                       pte |= _PAGE_EXEC_4V;
-               if (pmd_val(entry) & PMD_HUGE_WRITE)
-                       pte |= _PAGE_W_4V;
-               if (pmd_val(entry) & PMD_HUGE_ACCESSED)
-                       pte |= _PAGE_ACCESSED_4V;
-               if (pmd_val(entry) & PMD_HUGE_DIRTY)
-                       pte |= _PAGE_MODIFIED_4V;
-               pte |= _PAGE_CP_4V|_PAGE_CV_4V;
-       } else {
-               if (pmd_val(entry) & PMD_HUGE_PRESENT)
-                       pte |= _PAGE_PRESENT_4U;
-               if (pmd_val(entry) & PMD_HUGE_EXEC)
-                       pte |= _PAGE_EXEC_4U;
-               if (pmd_val(entry) & PMD_HUGE_WRITE)
-                       pte |= _PAGE_W_4U;
-               if (pmd_val(entry) & PMD_HUGE_ACCESSED)
-                       pte |= _PAGE_ACCESSED_4U;
-               if (pmd_val(entry) & PMD_HUGE_DIRTY)
-                       pte |= _PAGE_MODIFIED_4U;
-               pte |= _PAGE_CP_4U|_PAGE_CV_4U;
-       }
-
-       return __pgprot(pte);
-}
-
 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
                          pmd_t *pmd)
 {
        unsigned long pte, flags;
        struct mm_struct *mm;
        pmd_t entry = *pmd;
-       pgprot_t prot;
 
        if (!pmd_large(entry) || !pmd_young(entry))
                return;
 
-       pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS);
-       pte <<= PMD_PADDR_SHIFT;
-       pte |= _PAGE_VALID;
-
-       prot = pmd_pgprot(entry);
-
-       if (tlb_type == hypervisor)
-               pgprot_val(prot) |= _PAGE_SZHUGE_4V;
-       else
-               pgprot_val(prot) |= _PAGE_SZHUGE_4U;
+       pte = pmd_val(entry);
 
-       pte |= pgprot_val(prot);
+       /* We are fabricating 8MB pages using 4MB real hw pages.  */
+       pte |= (addr & (1UL << REAL_HPAGE_SHIFT));
 
        mm = vma->vm_mm;
 
        spin_lock_irqsave(&mm->context.lock, flags);
 
        if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL)
-               __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT,
+               __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
                                        addr, pte);
 
        spin_unlock_irqrestore(&mm->context.lock, flags);
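
page_offset_shift_patch_one() treats the shift count already encoded at a
patch site as an addend, and rewrites the instruction into a clear-destination
when the total reaches 64. A stand-alone model of that arithmetic (the 0x3f
count field and the 0x80100000 rewrite come from the hunk above; everything
else is a simplified sketch, not a full sparc encoder):

	#include <stdio.h>

	static unsigned int patch_shift(unsigned int insn, unsigned long phys_bits)
	{
		unsigned int cnt = insn & 0x3f;		/* existing count = addend */
		unsigned long final = cnt + (64 - phys_bits);

		insn &= ~0x3f;
		if (final >= 64) {			/* shifts everything out */
			unsigned int rd = (insn >> 25) & 0x1f;
			return 0x80100000 | (rd << 25);	/* or %g0, %g0, rd */
		}
		return insn | final;
	}

	int main(void)
	{
		/* "sllx REG, 0" placeholder on a 47-bit part: count becomes 17 */
		printf("%u\n", patch_shift(0, 47) & 0x3f);
		/* "srlx REG, 47" placeholder, 47 physical bits: cleared */
		printf("%#x\n", patch_shift(47, 47));
		return 0;
	}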
index 0661aa6..5d3782d 100644 (file)
@@ -1,11 +1,13 @@
 #ifndef _SPARC64_MM_INIT_H
 #define _SPARC64_MM_INIT_H
 
+#include <asm/page.h>
+
 /* Most of the symbols in this file are defined in init.c and
  * marked non-static so that assembler code can get at them.
  */
 
-#define MAX_PHYS_ADDRESS       (1UL << 41UL)
+#define MAX_PHYS_ADDRESS       (1UL << MAX_PHYS_ADDRESS_BITS)
 #define KPTE_BITMAP_CHUNK_SZ           (256UL * 1024UL * 1024UL)
 #define KPTE_BITMAP_BYTES      \
        ((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 4)
index 5d721df..869023a 100644 (file)
@@ -345,7 +345,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
        if ((pte = (unsigned long)pte_alloc_one_kernel(mm, address)) == 0)
                return NULL;
        page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT);
-       pgtable_page_ctor(page);
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
        return page;
 }
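
This pgtable_page_ctor() error handling recurs in every pte_alloc_one()
touched below: the constructor is now allowed to fail, presumably because the
split page-table lock it sets up may require a dynamic allocation, so each
caller must free the fresh page and bail out. The canonical shape (sketch):

	pgtable_t example_pte_alloc_one(struct mm_struct *mm, unsigned long addr)
	{
		struct page *page = alloc_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);

		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {	/* may now fail */
			__free_page(page);
			return NULL;
		}
		return page;
	}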
 
index 7a91f28..ad3bf4b 100644 (file)
@@ -161,8 +161,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
        if (mm == &init_mm)
                return;
 
-       if ((pmd_val(pmd) ^ pmd_val(orig)) & PMD_ISHUGE) {
-               if (pmd_val(pmd) & PMD_ISHUGE)
+       if ((pmd_val(pmd) ^ pmd_val(orig)) & _PAGE_PMD_HUGE) {
+               if (pmd_val(pmd) & _PAGE_PMD_HUGE)
                        mm->context.huge_pte_count++;
                else
                        mm->context.huge_pte_count--;
@@ -178,13 +178,16 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
        }
 
        if (!pmd_none(orig)) {
-               bool exec = ((pmd_val(orig) & PMD_HUGE_EXEC) != 0);
+               pte_t orig_pte = __pte(pmd_val(orig));
+               bool exec = pte_exec(orig_pte);
 
                addr &= HPAGE_MASK;
-               if (pmd_val(orig) & PMD_ISHUGE)
+               if (pmd_trans_huge(orig)) {
                        tlb_batch_add_one(mm, addr, exec);
-               else
+                       tlb_batch_add_one(mm, addr + REAL_HPAGE_SIZE, exec);
+               } else {
                        tlb_batch_pmd_scan(mm, addr, orig, exec);
+               }
        }
 }
 
@@ -196,11 +199,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
        assert_spin_locked(&mm->page_table_lock);
 
        /* FIFO */
-       if (!mm->pmd_huge_pte)
+       if (!pmd_huge_pte(mm, pmdp))
                INIT_LIST_HEAD(lh);
        else
-               list_add(lh, (struct list_head *) mm->pmd_huge_pte);
-       mm->pmd_huge_pte = pgtable;
+               list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+       pmd_huge_pte(mm, pmdp) = pgtable;
 }
 
 pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
@@ -211,12 +214,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
        assert_spin_locked(&mm->page_table_lock);
 
        /* FIFO */
-       pgtable = mm->pmd_huge_pte;
+       pgtable = pmd_huge_pte(mm, pmdp);
        lh = (struct list_head *) pgtable;
        if (list_empty(lh))
-               mm->pmd_huge_pte = NULL;
+               pmd_huge_pte(mm, pmdp) = NULL;
        else {
-               mm->pmd_huge_pte = (pgtable_t) lh->next;
+               pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
                list_del(lh);
        }
        pte_val(pgtable[0]) = 0;
index 2cc3bce..3b3a360 100644 (file)
@@ -87,7 +87,7 @@ void flush_tsb_user(struct tlb_batch *tb)
                nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
                if (tlb_type == cheetah_plus || tlb_type == hypervisor)
                        base = __pa(base);
-               __flush_tsb_one(tb, HPAGE_SHIFT, base, nentries);
+               __flush_tsb_one(tb, REAL_HPAGE_SHIFT, base, nentries);
        }
 #endif
        spin_unlock_irqrestore(&mm->context.lock, flags);
@@ -111,7 +111,7 @@ void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr)
                nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
                if (tlb_type == cheetah_plus || tlb_type == hypervisor)
                        base = __pa(base);
-               __flush_tsb_one_entry(base, vaddr, HPAGE_SHIFT, nentries);
+               __flush_tsb_one_entry(base, vaddr, REAL_HPAGE_SHIFT, nentries);
        }
 #endif
        spin_unlock_irqrestore(&mm->context.lock, flags);
@@ -472,8 +472,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
        mm->context.huge_pte_count = 0;
 #endif
 
-       mm->context.pgtable_page = NULL;
-
        /* copy_mm() copies over the parent's mm_struct before calling
         * us, so we need to zero out the TSB pointer or else tsb_grow()
         * will be confused and think there is an older TSB to free up.
@@ -512,17 +510,10 @@ static void tsb_destroy_one(struct tsb_config *tp)
 void destroy_context(struct mm_struct *mm)
 {
        unsigned long flags, i;
-       struct page *page;
 
        for (i = 0; i < MM_NUM_TSBS; i++)
                tsb_destroy_one(&mm->context.tsb_block[i]);
 
-       page = mm->context.pgtable_page;
-       if (page && put_page_testzero(page)) {
-               pgtable_page_dtor(page);
-               free_hot_cold_page(page, 0);
-       }
-
        spin_lock_irqsave(&ctx_alloc_lock, flags);
 
        if (CTX_VALID(mm->context)) {
index 432aa0c..b4f4733 100644 (file)
@@ -153,10 +153,10 @@ __spitfire_flush_tlb_mm_slow:
        .globl          __flush_icache_page
 __flush_icache_page:   /* %o0 = phys_page */
        srlx            %o0, PAGE_SHIFT, %o0
-       sethi           %uhi(PAGE_OFFSET), %g1
+       sethi           %hi(PAGE_OFFSET), %g1
        sllx            %o0, PAGE_SHIFT, %o0
        sethi           %hi(PAGE_SIZE), %g2
-       sllx            %g1, 32, %g1
+       ldx             [%g1 + %lo(PAGE_OFFSET)], %g1
        add             %o0, %g1, %o0
 1:     subcc           %g2, 32, %g2
        bne,pt          %icc, 1b
@@ -178,8 +178,8 @@ __flush_icache_page:        /* %o0 = phys_page */
        .align          64
        .globl          __flush_dcache_page
 __flush_dcache_page:   /* %o0=kaddr, %o1=flush_icache */
-       sethi           %uhi(PAGE_OFFSET), %g1
-       sllx            %g1, 32, %g1
+       sethi           %hi(PAGE_OFFSET), %g1
+       ldx             [%g1 + %lo(PAGE_OFFSET)], %g1
        sub             %o0, %g1, %o0                   ! physical address
        srlx            %o0, 11, %o0                    ! make D-cache TAG
        sethi           %hi(1 << 14), %o2               ! D-cache size
@@ -287,8 +287,8 @@ __cheetah_flush_tlb_pending:        /* 27 insns */
 
 #ifdef DCACHE_ALIASING_POSSIBLE
 __cheetah_flush_dcache_page: /* 11 insns */
-       sethi           %uhi(PAGE_OFFSET), %g1
-       sllx            %g1, 32, %g1
+       sethi           %hi(PAGE_OFFSET), %g1
+       ldx             [%g1 + %lo(PAGE_OFFSET)], %g1
        sub             %o0, %g1, %o0
        sethi           %hi(PAGE_SIZE), %o4
 1:     subcc           %o4, (1 << 5), %o4
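
In each of these assembly hunks, a PAGE_OFFSET formerly synthesized as an immediate (sethi %uhi + sllx 32) is now loaded from a memory location, so the kernel can pick the value at boot instead of build time. In C terms the change is from a compile-time constant to a runtime variable; a sketch, with the offset value purely hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Before: a build-time constant, baked into the instruction stream. */
#define PAGE_OFFSET_CONST 0xfffff80000000000ULL

/* After: a variable that early boot code can set according to how much
 * virtual address space the CPU actually implements (value assumed). */
static uint64_t page_offset;

static uint64_t virt_to_phys(uint64_t vaddr) { return vaddr - page_offset; }
static uint64_t phys_to_virt(uint64_t paddr) { return paddr + page_offset; }

int main(void)
{
	page_offset = PAGE_OFFSET_CONST;	/* chosen at "boot" */
	uint64_t v = page_offset + 0x2000;

	printf("virt %#llx -> phys %#llx -> virt %#llx\n",
	       (unsigned long long)v,
	       (unsigned long long)virt_to_phys(v),
	       (unsigned long long)phys_to_virt(virt_to_phys(v)));
	return 0;
}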
index d45a2c4..b3692ce 100644 (file)
@@ -8,7 +8,6 @@ config TILE
        select HAVE_KVM if !TILEGX
        select GENERIC_FIND_FIRST_BIT
        select SYSCTL_EXCEPTION_TRACE
-       select USE_GENERIC_SMP_HELPERS
        select CC_OPTIMIZE_FOR_SIZE
        select HAVE_DEBUG_KMEMLEAK
        select GENERIC_IRQ_PROBE
index 4fd9ec0..5e86eac 100644 (file)
@@ -241,6 +241,11 @@ struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
        if (p == NULL)
                return NULL;
 
+       if (!pgtable_page_ctor(p)) {
+               __free_pages(p, L2_USER_PGTABLE_ORDER);
+               return NULL;
+       }
+
        /*
         * Make every page have a page_count() of one, not just the first.
         * We don't use __GFP_COMP since it doesn't look like it works
@@ -251,7 +256,6 @@ struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
                inc_zone_page_state(p+i, NR_PAGETABLE);
        }
 
-       pgtable_page_ctor(p);
        return p;
 }
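
This is the recurring shape of the series: pgtable_page_ctor() can now fail (it may have to allocate the split page-table lock), so every architecture's page-table allocator grows the same free-and-bail path instead of calling the constructor unconditionally. A userspace analogue of the pattern, with a malloc'd mutex standing in for the ptlock; all names are hypothetical stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy "page" carrying the per-page lock the ctor must set up. */
struct page {
	void *data;
	pthread_mutex_t *ptl;	/* split page-table lock, maybe allocated */
};

static int pgtable_page_ctor(struct page *p)
{
	p->ptl = malloc(sizeof(*p->ptl));
	if (!p->ptl)
		return 0;	/* the ctor can fail now */
	pthread_mutex_init(p->ptl, NULL);
	return 1;
}

static void free_page_(struct page *p)
{
	free(p->data);
	free(p);
}

static struct page *pte_alloc_one(void)
{
	struct page *pte = calloc(1, sizeof(*pte));

	if (!pte)
		return NULL;
	pte->data = calloc(1, 4096);
	if (!pte->data) {
		free(pte);
		return NULL;
	}
	/* The pattern from the hunks above: on ctor failure, free the
	 * page and return NULL rather than hand out a half-built table. */
	if (!pgtable_page_ctor(pte)) {
		free_page_(pte);
		return NULL;
	}
	return pte;
}

int main(void)
{
	struct page *p = pte_alloc_one();

	printf("alloc %s\n", p ? "ok" : "failed");
	if (p) {
		pthread_mutex_destroy(p->ptl);
		free(p->ptl);
		free_page_(p);
	}
	return 0;
}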
 
index 7ddb64b..8636e90 100644 (file)
@@ -279,8 +279,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
        struct page *pte;
 
        pte = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-       if (pte)
-               pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
 
index 0213e37..2e02d13 100644 (file)
@@ -51,12 +51,14 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr)
        struct page *pte;
 
        pte = alloc_pages(PGALLOC_GFP, 0);
-       if (pte) {
-               if (!PageHighMem(pte)) {
-                       void *page = page_address(pte);
-                       clean_dcache_area(page, PTRS_PER_PTE * sizeof(pte_t));
-               }
-               pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       if (!PageHighMem(pte)) {
+               void *page = page_address(pte);
+               clean_dcache_area(page, PTRS_PER_PTE * sizeof(pte_t));
+       }
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
        }

 
        return pte;
index 6e3e1cb..83f521a 100644 (file)
@@ -90,7 +90,6 @@ config X86
        select GENERIC_IRQ_SHOW
        select GENERIC_CLOCKEVENTS_MIN_ADJUST
        select IRQ_FORCED_THREADING
-       select USE_GENERIC_SMP_HELPERS if SMP
        select HAVE_BPF_JIT if X86_64
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select CLKEVT_I8253
@@ -1885,6 +1884,10 @@ config USE_PERCPU_NUMA_NODE_ID
        def_bool y
        depends on NUMA
 
+config ARCH_ENABLE_SPLIT_PMD_PTLOCK
+       def_bool y
+       depends on X86_64 || X86_PAE
+
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
index 15f960c..24ec121 100644 (file)
@@ -274,13 +274,17 @@ struct x86_emulate_ctxt {
 
        bool guest_mode; /* guest running a nested guest */
        bool perm_ok; /* do not check permissions if true */
-       bool only_vendor_specific_insn;
+       bool ud;        /* inject an #UD if host doesn't support insn */
 
        bool have_exception;
        struct x86_exception exception;
 
-       /* decode cache */
-       u8 twobyte;
+       /*
+        * decode cache
+        */
+
+       /* current opcode length in bytes */
+       u8 opcode_len;
        u8 b;
        u8 intercept;
        u8 lock_prefix;
index c76ff74..ae5d783 100644 (file)
 #define KVM_HPAGE_MASK(x)      (~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+       /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+       return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+               (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
 #define SELECTOR_TI_MASK (1 << 2)
 #define SELECTOR_RPL_MASK 0x03
 
@@ -253,7 +260,6 @@ struct kvm_pio_request {
  * mode.
  */
 struct kvm_mmu {
-       void (*new_cr3)(struct kvm_vcpu *vcpu);
        void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
        unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
        u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
@@ -261,7 +267,6 @@ struct kvm_mmu {
                          bool prefault);
        void (*inject_page_fault)(struct kvm_vcpu *vcpu,
                                  struct x86_exception *fault);
-       void (*free)(struct kvm_vcpu *vcpu);
        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
                            struct x86_exception *exception);
        gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
@@ -389,6 +394,8 @@ struct kvm_vcpu_arch {
 
        struct fpu guest_fpu;
        u64 xcr0;
+       u64 guest_supported_xcr0;
+       u32 guest_xstate_size;
 
        struct kvm_pio_request pio;
        void *pio_data;
@@ -557,7 +564,9 @@ struct kvm_arch {
 
        struct list_head assigned_dev_head;
        struct iommu_domain *iommu_domain;
-       int iommu_flags;
+       bool iommu_noncoherent;
+#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
+       atomic_t noncoherent_dma_count;
        struct kvm_pic *vpic;
        struct kvm_ioapic *vioapic;
        struct kvm_pit *vpit;
@@ -780,11 +789,11 @@ void kvm_mmu_module_exit(void);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
@@ -922,13 +931,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
                       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
 
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
-int complete_pio(struct kvm_vcpu *vcpu);
-bool kvm_check_iopl(struct kvm_vcpu *vcpu);
-
 static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 {
        return gpa;
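
gfn_to_index() converts a guest frame number into a slot-relative index at a given huge-page level by shifting both the gfn and the slot base down to that level's granularity before subtracting. A standalone check of the arithmetic, assuming x86's nine index bits per level (4K/2M/1G):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

/* x86-style: 9 bits of page-table index per level (assumed). */
#define KVM_HPAGE_GFN_SHIFT(level)	(((level) - 1) * 9)

static gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
{
	/* KVM_HPAGE_GFN_SHIFT(1) must be 0: level 1 indexes 4K pages 1:1. */
	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	gfn_t base = 0x100000;	/* slot starts at gfn 1M (arbitrary) */

	assert(gfn_to_index(base + 5, base, 1) == 5);		/* 4K granules */
	assert(gfn_to_index(base + 512, base, 2) == 1);		/* one 2M page */
	assert(gfn_to_index(base + 512 * 512, base, 3) == 1);	/* one 1G page */
	puts("gfn_to_index ok");
	return 0;
}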
index b4389a4..c4412e9 100644 (file)
@@ -80,12 +80,21 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 #if PAGETABLE_LEVELS > 2
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-       return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+       struct page *page;
+       page = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0);
+       if (!page)
+               return NULL;
+       if (!pgtable_pmd_page_ctor(page)) {
+               __free_pages(page, 0);
+               return NULL;
+       }
+       return (pmd_t *)page_address(page);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
        BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+       pgtable_pmd_page_dtor(virt_to_page(pmd));
        free_page((unsigned long)pmd);
 }
 
index be8269b..d6b078e 100644 (file)
@@ -14,6 +14,8 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
                            struct timespec *ts);
 void pvclock_resume(void);
 
+void pvclock_touch_watchdogs(void);
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h
new file mode 100644 (file)
index 0000000..7f02fe4
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef _ASM_X86_XEN_PAGE_COHERENT_H
+#define _ASM_X86_XEN_PAGE_COHERENT_H
+
+#include <asm/page.h>
+#include <linux/dma-attrs.h>
+#include <linux/dma-mapping.h>
+
+static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size,
+               dma_addr_t *dma_handle, gfp_t flags,
+               struct dma_attrs *attrs)
+{
+       void *vstart = (void *)__get_free_pages(flags, get_order(size));
+       *dma_handle = virt_to_phys(vstart);
+       return vstart;
+}
+
+static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle,
+               struct dma_attrs *attrs)
+{
+       free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
+            unsigned long offset, size_t size, enum dma_data_direction dir,
+            struct dma_attrs *attrs) { }
+
+static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
+               size_t size, enum dma_data_direction dir,
+               struct dma_attrs *attrs) { }
+
+static inline void xen_dma_sync_single_for_cpu(struct device *hwdev,
+               dma_addr_t handle, size_t size, enum dma_data_direction dir) { }
+
+static inline void xen_dma_sync_single_for_device(struct device *hwdev,
+               dma_addr_t handle, size_t size, enum dma_data_direction dir) { }
+
+#endif /* _ASM_X86_XEN_PAGE_COHERENT_H */
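
On x86 these Xen helpers are deliberately thin: DMA is already cache-coherent, so allocation reduces to grabbing 2^order pages and reporting their physical address, and the map/unmap/sync hooks are no-ops. The only arithmetic is get_order(); a userspace sketch of that size-to-order rounding, assuming 4K pages:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Smallest order such that (PAGE_SIZE << order) >= size. */
static int get_order(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	printf("%d %d %d %d\n", get_order(1), get_order(4096),
	       get_order(4097), get_order(65536));
	/* prints: 0 0 1 4 */
	return 0;
}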
index 5d9a303..d3a8778 100644 (file)
@@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 {
        __u32 padding[3];
 };
 
-#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
-#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
-#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX                BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC           BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT         BIT(2)
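
The flag values are unchanged; BIT(n) just makes the single-bit intent explicit. For reference, the common definition, shown standalone:

#include <assert.h>

#define BIT(nr)	(1UL << (nr))

int main(void)
{
	assert(BIT(0) == 1 && BIT(1) == 2 && BIT(2) == 4);
	return 0;
}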
 
 /* for KVM_SET_CPUID2 */
 struct kvm_cpuid2 {
index bb04650..b93e09a 100644 (file)
 
 /* MSR_IA32_VMX_MISC bits */
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
+#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
 /* AMD-V MSRs */
 
 #define MSR_VM_CR                       0xc0010114
index 1570e07..e604109 100644 (file)
@@ -139,6 +139,7 @@ bool kvm_check_and_clear_guest_paused(void)
        src = &hv_clock[cpu].pvti;
        if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
                src->flags &= ~PVCLOCK_GUEST_STOPPED;
+               pvclock_touch_watchdogs();
                ret = true;
        }
 
index a16bae3..2f355d2 100644 (file)
@@ -43,6 +43,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
        return pv_tsc_khz;
 }
 
+void pvclock_touch_watchdogs(void)
+{
+       touch_softlockup_watchdog_sync();
+       clocksource_touch_watchdog();
+       rcu_cpu_stall_reset();
+       reset_hung_task_detector();
+}
+
 static atomic64_t last_value = ATOMIC64_INIT(0);
 
 void pvclock_resume(void)
@@ -74,6 +82,11 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
                version = __pvclock_read_cycles(src, &ret, &flags);
        } while ((src->version & 1) || version != src->version);
 
+       if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+               src->flags &= ~PVCLOCK_GUEST_STOPPED;
+               pvclock_touch_watchdogs();
+       }
+
        if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
                (flags & PVCLOCK_TSC_STABLE_BIT))
                return ret;
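
The new GUEST_STOPPED handling sits inside pvclock's lock-free read protocol: the host bumps a version counter around updates (odd while an update is in flight), and the guest rereads until it observes a stable even version, then acts on the flags it captured. A single-threaded userspace model of that retry loop; the struct layout and flag value here are illustrative only:

#include <stdint.h>
#include <stdio.h>

struct pvclock_vcpu_time_info {
	volatile uint32_t version;	/* odd => host update in progress */
	uint64_t system_time;
	uint8_t flags;
};

#define PVCLOCK_GUEST_STOPPED 0x02	/* illustrative bit value */

static uint64_t read_cycles(struct pvclock_vcpu_time_info *src, uint8_t *flags)
{
	uint32_t version;
	uint64_t ret;

	do {
		version = src->version;
		/* On real hardware a read barrier belongs here. */
		ret = src->system_time;
		*flags = src->flags;
	} while ((src->version & 1) || version != src->version);

	return ret;
}

int main(void)
{
	struct pvclock_vcpu_time_info ti = {
		.version = 2, .system_time = 123456789,
		.flags = PVCLOCK_GUEST_STOPPED,
	};
	uint8_t flags;
	uint64_t t = read_cycles(&ti, &flags);

	if (flags & PVCLOCK_GUEST_STOPPED) {
		ti.flags &= ~PVCLOCK_GUEST_STOPPED;
		puts("guest was stopped: would touch watchdogs here");
	}
	printf("time %llu\n", (unsigned long long)t);
	return 0;
}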
index a47a3e5..b89c5db 100644 (file)
@@ -38,6 +38,7 @@ config KVM
        select PERF_EVENTS
        select HAVE_KVM_MSI
        select HAVE_KVM_CPU_RELAX_INTERCEPT
+       select KVM_VFIO
        ---help---
          Support hosting fully virtualized guest machines using hardware
          virtualization extensions.  You will need a fairly recent
index bf4fb04..25d22b2 100644 (file)
@@ -9,7 +9,7 @@ KVM := ../../../virt/kvm
 
 kvm-y                  += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
                                $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
-                               $(KVM)/eventfd.o $(KVM)/irqchip.o
+                               $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)    += $(KVM)/assigned-dev.o $(KVM)/iommu.o
 kvm-$(CONFIG_KVM_ASYNC_PF)     += $(KVM)/async_pf.o
 
index b110fe6..c697625 100644 (file)
 #include "mmu.h"
 #include "trace.h"
 
+static u32 xstate_required_size(u64 xstate_bv)
+{
+       int feature_bit = 0;
+       u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+       xstate_bv &= ~XSTATE_FPSSE;
+       while (xstate_bv) {
+               if (xstate_bv & 0x1) {
+                       u32 eax, ebx, ecx, edx;
+                       cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
+                       ret = max(ret, eax + ebx);
+               }
+
+               xstate_bv >>= 1;
+               feature_bit++;
+       }
+
+       return ret;
+}
+
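
xstate_required_size() walks the set bits of the XCR0 mask and asks CPUID leaf 0xD for each feature's offset and size, keeping the largest extent. The same walk runs in userspace with the compiler's cpuid macro; a sketch (x86-only, and the XSAVE header constants are assumptions: a 64-byte header after the 512-byte legacy area):

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

#define XSAVE_HDR_OFFSET 512	/* legacy FP/SSE area size (assumed) */
#define XSAVE_HDR_SIZE   64
#define XSTATE_FPSSE     0x3	/* x87 + SSE live in the legacy area */

static uint32_t xstate_required_size(uint64_t xstate_bv)
{
	int feature_bit = 0;
	uint32_t ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;

	xstate_bv &= ~(uint64_t)XSTATE_FPSSE;
	while (xstate_bv) {
		if (xstate_bv & 1) {
			unsigned int eax, ebx, ecx, edx;

			/* eax = size of this state, ebx = its offset. */
			__cpuid_count(0xD, feature_bit, eax, ebx, ecx, edx);
			if (eax + ebx > ret)
				ret = eax + ebx;
		}
		xstate_bv >>= 1;
		feature_bit++;
	}
	return ret;
}

int main(void)
{
	/* Bit 2 = AVX (YMM state); the result depends on the host CPU. */
	printf("xsave area for FP|SSE|YMM: %u bytes\n",
	       (unsigned)xstate_required_size(1ULL << 2));
	return 0;
}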
 void kvm_update_cpuid(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
@@ -46,6 +66,18 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
                        apic->lapic_timer.timer_mode_mask = 1 << 17;
        }
 
+       best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+       if (!best) {
+               vcpu->arch.guest_supported_xcr0 = 0;
+               vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+       } else {
+               vcpu->arch.guest_supported_xcr0 =
+                       (best->eax | ((u64)best->edx << 32)) &
+                       host_xcr0 & KVM_SUPPORTED_XCR0;
+               vcpu->arch.guest_xstate_size =
+                       xstate_required_size(vcpu->arch.guest_supported_xcr0);
+       }
+
        kvm_pmu_cpuid_update(vcpu);
 }
 
@@ -182,13 +214,35 @@ static bool supported_xcr0_bit(unsigned bit)
 {
        u64 mask = ((u64)1 << bit);
 
-       return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+       return mask & KVM_SUPPORTED_XCR0 & host_xcr0;
 }
 
 #define F(x) bit(X86_FEATURE_##x)
 
-static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-                        u32 index, int *nent, int maxnent)
+static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
+                                  u32 func, u32 index, int *nent, int maxnent)
+{
+       switch (func) {
+       case 0:
+               entry->eax = 1;         /* only one leaf currently */
+               ++*nent;
+               break;
+       case 1:
+               entry->ecx = F(MOVBE);
+               ++*nent;
+               break;
+       default:
+               break;
+       }
+
+       entry->function = func;
+       entry->index = index;
+
+       return 0;
+}
+
+static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                                u32 index, int *nent, int maxnent)
 {
        int r;
        unsigned f_nx = is_efer_nx() ? F(NX) : 0;
@@ -383,6 +437,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        case 0xd: {
                int idx, i;
 
+               entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0;
+               entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32;
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                for (idx = 1, i = 1; idx < 64; ++idx) {
                        if (*nent >= maxnent)
@@ -481,6 +537,15 @@ out:
        return r;
 }
 
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
+                       u32 idx, int *nent, int maxnent, unsigned int type)
+{
+       if (type == KVM_GET_EMULATED_CPUID)
+               return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
+
+       return __do_cpuid_ent(entry, func, idx, nent, maxnent);
+}
+
 #undef F
 
 struct kvm_cpuid_param {
@@ -495,8 +560,36 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
        return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
 }
 
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-                                     struct kvm_cpuid_entry2 __user *entries)
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+                                __u32 num_entries, unsigned int ioctl_type)
+{
+       int i;
+       __u32 pad[3];
+
+       if (ioctl_type != KVM_GET_EMULATED_CPUID)
+               return false;
+
+       /*
+        * We want to make sure that ->padding is being passed clean from
+        * userspace in case we want to use it for something in the future.
+        *
+        * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID, so we
+        * have to settle for enforcing it on the emulated side only. /me
+        * sheds a tear.
+        */
+       for (i = 0; i < num_entries; i++) {
+               if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
+                       return true;
+
+               if (pad[0] || pad[1] || pad[2])
+                       return true;
+       }
+       return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+                           struct kvm_cpuid_entry2 __user *entries,
+                           unsigned int type)
 {
        struct kvm_cpuid_entry2 *cpuid_entries;
        int limit, nent = 0, r = -E2BIG, i;
@@ -513,8 +606,12 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                goto out;
        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
                cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+       if (sanity_check_entries(entries, cpuid->nent, type))
+               return -EINVAL;
+
        r = -ENOMEM;
-       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+       cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
        if (!cpuid_entries)
                goto out;
 
@@ -526,7 +623,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                        continue;
 
                r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
-                               &nent, cpuid->nent);
+                               &nent, cpuid->nent, type);
 
                if (r)
                        goto out_free;
@@ -537,7 +634,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                limit = cpuid_entries[nent - 1].eax;
                for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
                        r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
-                                    &nent, cpuid->nent);
+                                    &nent, cpuid->nent, type);
 
                if (r)
                        goto out_free;
@@ -661,6 +758,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
                *edx = best->edx;
        } else
                *eax = *ebx = *ecx = *edx = 0;
+       trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx);
 }
 EXPORT_SYMBOL_GPL(kvm_cpuid);
 
@@ -676,6 +774,5 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
        kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
        kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
        kvm_x86_ops->skip_emulated_instruction(vcpu);
-       trace_kvm_cpuid(function, eax, ebx, ecx, edx);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
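
sanity_check_entries() above insists that the padding words of every entry arrive zeroed, so the field can be given meaning later without breaking existing userspace. The shape of the check, minus copy_from_user, as plain C; the struct layout is abbreviated and hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cpuid_entry {
	uint32_t function, index, flags;
	uint32_t eax, ebx, ecx, edx;
	uint32_t padding[3];	/* must be zero from userspace */
};

/* Returns true when something is wrong, mirroring the kernel helper. */
static bool sanity_check_entries(const struct cpuid_entry *entries,
				 uint32_t num_entries)
{
	for (uint32_t i = 0; i < num_entries; i++) {
		if (entries[i].padding[0] || entries[i].padding[1] ||
		    entries[i].padding[2])
			return true;
	}
	return false;
}

int main(void)
{
	struct cpuid_entry e[2];

	memset(e, 0, sizeof(e));
	printf("clean: %d\n", sanity_check_entries(e, 2));	/* 0 */
	e[1].padding[2] = 1;
	printf("dirty: %d\n", sanity_check_entries(e, 2));	/* 1 */
	return 0;
}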
index b7fd079..f1e4895 100644 (file)
@@ -6,8 +6,9 @@
 void kvm_update_cpuid(struct kvm_vcpu *vcpu);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index);
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-                                     struct kvm_cpuid_entry2 __user *entries);
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+                           struct kvm_cpuid_entry2 __user *entries,
+                           unsigned int type);
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                             struct kvm_cpuid *cpuid,
                             struct kvm_cpuid_entry __user *entries);
index ddc3f3d..07ffca0 100644 (file)
 #define Mov         (1<<20)
 /* Misc flags */
 #define Prot        (1<<21) /* instruction generates #UD if not in prot-mode */
-#define VendorSpecific (1<<22) /* Vendor specific instruction */
+#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
 #define NoAccess    (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
 #define Op3264      (1<<24) /* Operand is 64b in long mode, 32b otherwise */
 #define Undefined   (1<<25) /* No Such Instruction */
@@ -785,9 +785,10 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
  * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
  */
 static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
-                            int highbyte_regs)
+                            int byteop)
 {
        void *p;
+       int highbyte_regs = (ctxt->rex_prefix == 0) && byteop;
 
        if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
                p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
@@ -1024,7 +1025,6 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
                                    struct operand *op)
 {
        unsigned reg = ctxt->modrm_reg;
-       int highbyte_regs = ctxt->rex_prefix == 0;
 
        if (!(ctxt->d & ModRM))
                reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
@@ -1045,13 +1045,9 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
        }
 
        op->type = OP_REG;
-       if (ctxt->d & ByteOp) {
-               op->addr.reg = decode_register(ctxt, reg, highbyte_regs);
-               op->bytes = 1;
-       } else {
-               op->addr.reg = decode_register(ctxt, reg, 0);
-               op->bytes = ctxt->op_bytes;
-       }
+       op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+       op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
+
        fetch_register_operand(op);
        op->orig_val = op->val;
 }
@@ -1082,12 +1078,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
        ctxt->modrm_seg = VCPU_SREG_DS;
 
        if (ctxt->modrm_mod == 3) {
-               int highbyte_regs = ctxt->rex_prefix == 0;
-
                op->type = OP_REG;
                op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
                op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
-                                              highbyte_regs && (ctxt->d & ByteOp));
+                               ctxt->d & ByteOp);
                if (ctxt->d & Sse) {
                        op->type = OP_XMM;
                        op->bytes = 16;
@@ -2961,6 +2955,46 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
+#define FFL(x) bit(X86_FEATURE_##x)
+
+static int em_movbe(struct x86_emulate_ctxt *ctxt)
+{
+       u32 ebx, ecx, edx, eax = 1;
+       u16 tmp;
+
+       /*
+        * Check that MOVBE is set in the guest-visible CPUID leaf.
+        */
+       ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+       if (!(ecx & FFL(MOVBE)))
+               return emulate_ud(ctxt);
+
+       switch (ctxt->op_bytes) {
+       case 2:
+               /*
+                * From MOVBE definition: "...When the operand size is 16 bits,
+                * the upper word of the destination register remains unchanged
+                * ..."
+                *
+                * Casting either ->valptr or ->val to u16 breaks strict
+                * aliasing rules, so we have to do the operation almost by
+                * hand.
+                */
+               tmp = (u16)ctxt->src.val;
+               ctxt->dst.val &= ~0xffffUL;
+               ctxt->dst.val |= (unsigned long)swab16(tmp);
+               break;
+       case 4:
+               ctxt->dst.val = swab32((u32)ctxt->src.val);
+               break;
+       case 8:
+               ctxt->dst.val = swab64(ctxt->src.val);
+               break;
+       default:
+               return X86EMUL_PROPAGATE_FAULT;
+       }
+       return X86EMUL_CONTINUE;
+}
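
em_movbe() has to honor the 16-bit quirk quoted above: a word-sized MOVBE swaps only the low 16 bits and leaves the destination's upper bits alone, while the 32- and 64-bit forms replace the whole value. A standalone version of that operand-size switch, with the swab helpers written out since the kernel's are not available here:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t swab16(uint16_t x) { return (uint16_t)(x << 8 | x >> 8); }
static uint32_t swab32(uint32_t x) { return __builtin_bswap32(x); }
static uint64_t swab64(uint64_t x) { return __builtin_bswap64(x); }

/* dst is read-modify-write for the 16-bit case, like a GPR. */
static int emulate_movbe(uint64_t *dst, uint64_t src, int op_bytes)
{
	switch (op_bytes) {
	case 2:
		/* Upper bits of the destination register are preserved. */
		*dst = (*dst & ~0xffffULL) | swab16((uint16_t)src);
		break;
	case 4:
		*dst = swab32((uint32_t)src);
		break;
	case 8:
		*dst = swab64(src);
		break;
	default:
		return -1;	/* would be X86EMUL_PROPAGATE_FAULT */
	}
	return 0;
}

int main(void)
{
	uint64_t reg = 0xdeadbeef00000000ULL;

	emulate_movbe(&reg, 0x1122, 2);
	assert(reg == 0xdeadbeef00002211ULL);
	emulate_movbe(&reg, 0x11223344, 4);
	assert(reg == 0x44332211ULL);
	puts("movbe ok");
	return 0;
}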
+
 static int em_cr_write(struct x86_emulate_ctxt *ctxt)
 {
        if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
@@ -3256,6 +3290,18 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
+static int em_sahf(struct x86_emulate_ctxt *ctxt)
+{
+       u32 flags;
+
+       flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF;
+       flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
+
+       ctxt->eflags &= ~0xffUL;
+       ctxt->eflags |= flags | X86_EFLAGS_FIXED;
+       return X86EMUL_CONTINUE;
+}
+
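
em_sahf() moves AH into the low EFLAGS byte, but only through the architecturally writable mask (CF, PF, AF, ZF, SF), and always forces the fixed bit 1. The masking in isolation:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EFLG_CF 0x001
#define EFLG_PF 0x004
#define EFLG_AF 0x010
#define EFLG_ZF 0x040
#define EFLG_SF 0x080
#define X86_EFLAGS_FIXED 0x002		/* bit 1 always reads as 1 */

static uint64_t sahf(uint64_t eflags, uint64_t rax)
{
	uint64_t flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF;

	flags &= rax >> 8;	/* AH supplies the new low byte */
	eflags &= ~0xffULL;	/* clear the low byte ... */
	return eflags | flags | X86_EFLAGS_FIXED;	/* ... keep bit 1 */
}

int main(void)
{
	/* AH = 0xff: only the five writable bits plus bit 1 survive. */
	assert(sahf(0x200, 0xff00) == (0x200 | 0xd5 | 0x2));
	puts("sahf ok");
	return 0;
}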
 static int em_lahf(struct x86_emulate_ctxt *ctxt)
 {
        *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
@@ -3502,7 +3548,7 @@ static const struct opcode group7_rm1[] = {
 
 static const struct opcode group7_rm3[] = {
        DIP(SrcNone | Prot | Priv,              vmrun,          check_svme_pa),
-       II(SrcNone  | Prot | VendorSpecific,    em_vmmcall,     vmmcall),
+       II(SrcNone  | Prot | EmulateOnUD,       em_vmmcall,     vmmcall),
        DIP(SrcNone | Prot | Priv,              vmload,         check_svme_pa),
        DIP(SrcNone | Prot | Priv,              vmsave,         check_svme_pa),
        DIP(SrcNone | Prot | Priv,              stgi,           check_svme),
@@ -3587,7 +3633,7 @@ static const struct group_dual group7 = { {
        II(SrcMem16 | Mov | Priv,               em_lmsw, lmsw),
        II(SrcMem | ByteOp | Priv | NoAccess,   em_invlpg, invlpg),
 }, {
-       I(SrcNone | Priv | VendorSpecific,      em_vmcall),
+       I(SrcNone | Priv | EmulateOnUD, em_vmcall),
        EXT(0, group7_rm1),
        N, EXT(0, group7_rm3),
        II(SrcNone | DstMem | Mov,              em_smsw, smsw), N,
@@ -3750,7 +3796,8 @@ static const struct opcode opcode_table[256] = {
        D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
        I(SrcImmFAddr | No64, em_call_far), N,
        II(ImplicitOps | Stack, em_pushf, pushf),
-       II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
+       II(ImplicitOps | Stack, em_popf, popf),
+       I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
        /* 0xA0 - 0xA7 */
        I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
        I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3810,7 +3857,7 @@ static const struct opcode opcode_table[256] = {
 static const struct opcode twobyte_table[256] = {
        /* 0x00 - 0x0F */
        G(0, group6), GD(0, &group7), N, N,
-       N, I(ImplicitOps | VendorSpecific, em_syscall),
+       N, I(ImplicitOps | EmulateOnUD, em_syscall),
        II(ImplicitOps | Priv, em_clts, clts), N,
        DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
        N, D(ImplicitOps | ModRM), N, N,
@@ -3830,8 +3877,8 @@ static const struct opcode twobyte_table[256] = {
        IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
        II(ImplicitOps | Priv, em_rdmsr, rdmsr),
        IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
-       I(ImplicitOps | VendorSpecific, em_sysenter),
-       I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
+       I(ImplicitOps | EmulateOnUD, em_sysenter),
+       I(ImplicitOps | Priv | EmulateOnUD, em_sysexit),
        N, N,
        N, N, N, N, N, N, N, N,
        /* 0x40 - 0x4F */
@@ -3892,6 +3939,30 @@ static const struct opcode twobyte_table[256] = {
        N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
 };
 
+static const struct gprefix three_byte_0f_38_f0 = {
+       I(DstReg | SrcMem | Mov, em_movbe), N, N, N
+};
+
+static const struct gprefix three_byte_0f_38_f1 = {
+       I(DstMem | SrcReg | Mov, em_movbe), N, N, N
+};
+
+/*
+ * Insns below are selected by their mandatory prefix; the table itself is
+ * indexed by the third opcode byte.
+ */
+static const struct opcode opcode_map_0f_38[256] = {
+       /* 0x00 - 0x7f */
+       X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+       /* 0x80 - 0xef */
+       X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+       /* 0xf0 - 0xf1 */
+       GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0),
+       GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1),
+       /* 0xf2 - 0xff */
+       N, N, X4(N), X8(N)
+};
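
The new map mirrors how the decoder below walks escape bytes: fetch an opcode byte, and if it is an escape (0x0f, then 0x38) switch tables and bump opcode_len. A compact model of that fetch logic, with dispatch and prefix handling omitted:

#include <stdio.h>

/* Decode just the opcode-length/escape structure of an x86 instruction
 * stream: 0x0f escapes to the two-byte map, 0x0f 0x38 to the three-byte
 * map used for MOVBE above. */
static int opcode_len(const unsigned char *insn, unsigned char *opcode)
{
	int len = 1;
	unsigned char b = insn[0];

	if (b == 0x0f) {
		len = 2;
		b = insn[1];
		if (b == 0x38) {	/* 0F 38 opcode map */
			len = 3;
			b = insn[2];
		}
	}
	*opcode = b;
	return len;
}

int main(void)
{
	const unsigned char movbe[] = { 0x0f, 0x38, 0xf0, 0x07 };
	unsigned char op;
	int len = opcode_len(movbe, &op);

	printf("opcode_len=%d final byte=%#x\n", len, op);	/* 3, 0xf0 */
	return 0;
}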
+
 #undef D
 #undef N
 #undef G
@@ -4040,7 +4111,8 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
        case OpMem8:
                ctxt->memop.bytes = 1;
                if (ctxt->memop.type == OP_REG) {
-                       ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1);
+                       ctxt->memop.addr.reg = decode_register(ctxt,
+                                       ctxt->modrm_rm, true);
                        fetch_register_operand(&ctxt->memop);
                }
                goto mem_common;
@@ -4126,6 +4198,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
        ctxt->_eip = ctxt->eip;
        ctxt->fetch.start = ctxt->_eip;
        ctxt->fetch.end = ctxt->fetch.start + insn_len;
+       ctxt->opcode_len = 1;
        if (insn_len > 0)
                memcpy(ctxt->fetch.data, insn, insn_len);
 
@@ -4208,9 +4281,16 @@ done_prefixes:
        opcode = opcode_table[ctxt->b];
        /* Two-byte opcode? */
        if (ctxt->b == 0x0f) {
-               ctxt->twobyte = 1;
+               ctxt->opcode_len = 2;
                ctxt->b = insn_fetch(u8, ctxt);
                opcode = twobyte_table[ctxt->b];
+
+               /* 0F_38 opcode map */
+               if (ctxt->b == 0x38) {
+                       ctxt->opcode_len = 3;
+                       ctxt->b = insn_fetch(u8, ctxt);
+                       opcode = opcode_map_0f_38[ctxt->b];
+               }
        }
        ctxt->d = opcode.flags;
 
@@ -4267,7 +4347,7 @@ done_prefixes:
        if (ctxt->d == 0 || (ctxt->d & NotImpl))
                return EMULATION_FAILED;
 
-       if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+       if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
                return EMULATION_FAILED;
 
        if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
@@ -4540,8 +4620,10 @@ special_insn:
                goto writeback;
        }
 
-       if (ctxt->twobyte)
+       if (ctxt->opcode_len == 2)
                goto twobyte_insn;
+       else if (ctxt->opcode_len == 3)
+               goto threebyte_insn;
 
        switch (ctxt->b) {
        case 0x63:              /* movsxd */
@@ -4726,6 +4808,8 @@ twobyte_insn:
                goto cannot_emulate;
        }
 
+threebyte_insn:
+
        if (rc != X86EMUL_CONTINUE)
                goto done;
 
index dce0df8..40772ef 100644 (file)
@@ -2570,11 +2570,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        kvm_release_pfn_clean(pfn);
 }
 
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-       mmu_free_roots(vcpu);
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
                                     bool no_dirty_log)
 {
@@ -3424,18 +3419,11 @@ out_unlock:
        return 0;
 }
 
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
-       mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu *context)
+static void nonpaging_init_context(struct kvm_vcpu *vcpu,
+                                  struct kvm_mmu *context)
 {
-       context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
-       context->free = nonpaging_free;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
        context->update_pte = nonpaging_update_pte;
@@ -3444,7 +3432,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
        context->root_hpa = INVALID_PAGE;
        context->direct_map = true;
        context->nx = false;
-       return 0;
 }
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -3454,9 +3441,8 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
 {
-       pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
        mmu_free_roots(vcpu);
 }
 
@@ -3471,11 +3457,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
        vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 }
 
-static void paging_free(struct kvm_vcpu *vcpu)
-{
-       nonpaging_free(vcpu);
-}
-
 static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
                           unsigned access, int *nr_present)
 {
@@ -3665,9 +3646,9 @@ static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
        mmu->last_pte_bitmap = map;
 }
 
-static int paging64_init_context_common(struct kvm_vcpu *vcpu,
-                                       struct kvm_mmu *context,
-                                       int level)
+static void paging64_init_context_common(struct kvm_vcpu *vcpu,
+                                        struct kvm_mmu *context,
+                                        int level)
 {
        context->nx = is_nx(vcpu);
        context->root_level = level;
@@ -3677,27 +3658,24 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
        update_last_pte_bitmap(vcpu, context);
 
        ASSERT(is_pae(vcpu));
-       context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->sync_page = paging64_sync_page;
        context->invlpg = paging64_invlpg;
        context->update_pte = paging64_update_pte;
-       context->free = paging_free;
        context->shadow_root_level = level;
        context->root_hpa = INVALID_PAGE;
        context->direct_map = false;
-       return 0;
 }
 
-static int paging64_init_context(struct kvm_vcpu *vcpu,
-                                struct kvm_mmu *context)
+static void paging64_init_context(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu *context)
 {
-       return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
+       paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
 }
 
-static int paging32_init_context(struct kvm_vcpu *vcpu,
-                                struct kvm_mmu *context)
+static void paging32_init_context(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu *context)
 {
        context->nx = false;
        context->root_level = PT32_ROOT_LEVEL;
@@ -3706,33 +3684,28 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
        update_permission_bitmask(vcpu, context, false);
        update_last_pte_bitmap(vcpu, context);
 
-       context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
        context->gva_to_gpa = paging32_gva_to_gpa;
-       context->free = paging_free;
        context->sync_page = paging32_sync_page;
        context->invlpg = paging32_invlpg;
        context->update_pte = paging32_update_pte;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        context->root_hpa = INVALID_PAGE;
        context->direct_map = false;
-       return 0;
 }
 
-static int paging32E_init_context(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu *context)
+static void paging32E_init_context(struct kvm_vcpu *vcpu,
+                                  struct kvm_mmu *context)
 {
-       return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
+       paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 }
 
-static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *context = vcpu->arch.walk_mmu;
 
        context->base_role.word = 0;
-       context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = tdp_page_fault;
-       context->free = nonpaging_free;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = nonpaging_invlpg;
        context->update_pte = nonpaging_update_pte;
@@ -3767,37 +3740,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
        update_permission_bitmask(vcpu, context, false);
        update_last_pte_bitmap(vcpu, context);
-
-       return 0;
 }
 
-int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
-       int r;
        bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
        if (!is_paging(vcpu))
-               r = nonpaging_init_context(vcpu, context);
+               nonpaging_init_context(vcpu, context);
        else if (is_long_mode(vcpu))
-               r = paging64_init_context(vcpu, context);
+               paging64_init_context(vcpu, context);
        else if (is_pae(vcpu))
-               r = paging32E_init_context(vcpu, context);
+               paging32E_init_context(vcpu, context);
        else
-               r = paging32_init_context(vcpu, context);
+               paging32_init_context(vcpu, context);
 
        vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
        vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
        vcpu->arch.mmu.base_role.smep_andnot_wp
                = smep && !is_write_protection(vcpu);
-
-       return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
-int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
                bool execonly)
 {
        ASSERT(vcpu);
@@ -3806,37 +3774,30 @@ int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
        context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 
        context->nx = true;
-       context->new_cr3 = paging_new_cr3;
        context->page_fault = ept_page_fault;
        context->gva_to_gpa = ept_gva_to_gpa;
        context->sync_page = ept_sync_page;
        context->invlpg = ept_invlpg;
        context->update_pte = ept_update_pte;
-       context->free = paging_free;
        context->root_level = context->shadow_root_level;
        context->root_hpa = INVALID_PAGE;
        context->direct_map = false;
 
        update_permission_bitmask(vcpu, context, true);
        reset_rsvds_bits_mask_ept(vcpu, context, execonly);
-
-       return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
-       int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
-
+       kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
        vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
        vcpu->arch.walk_mmu->get_cr3           = get_cr3;
        vcpu->arch.walk_mmu->get_pdptr         = kvm_pdptr_read;
        vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
-       return r;
 }
 
-static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 {
        struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
@@ -3873,11 +3834,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 
        update_permission_bitmask(vcpu, g_context, false);
        update_last_pte_bitmap(vcpu, g_context);
-
-       return 0;
 }
 
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
        if (mmu_is_nested(vcpu))
                return init_kvm_nested_mmu(vcpu);
@@ -3887,18 +3846,12 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
                return init_kvm_softmmu(vcpu);
 }
 
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
-       if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
-               /* mmu.free() should set root_hpa = INVALID_PAGE */
-               vcpu->arch.mmu.free(vcpu);
-}
 
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
-       destroy_kvm_mmu(vcpu);
-       return init_kvm_mmu(vcpu);
+       kvm_mmu_unload(vcpu);
+       init_kvm_mmu(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 
@@ -3923,6 +3876,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
        mmu_free_roots(vcpu);
+       WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
@@ -4281,12 +4235,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
        return alloc_mmu_pages(vcpu);
 }
 
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+void kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-       return init_kvm_mmu(vcpu);
+       init_kvm_mmu(vcpu);
 }
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
@@ -4428,7 +4382,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
        int nr_to_scan = sc->nr_to_scan;
        unsigned long freed = 0;
 
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
 
        list_for_each_entry(kvm, &vm_list, vm_list) {
                int idx;
@@ -4478,9 +4432,8 @@ unlock:
                break;
        }
 
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
        return freed;
-
 }
 
 static unsigned long
@@ -4574,7 +4527,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
        ASSERT(vcpu);
 
-       destroy_kvm_mmu(vcpu);
+       kvm_mmu_unload(vcpu);
        free_mmu_pages(vcpu);
        mmu_free_memory_caches(vcpu);
 }
index 77e044a..2926152 100644 (file)
@@ -70,8 +70,8 @@ enum {
 };
 
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
-int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
                bool execonly);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
index c0bc803..c7168a5 100644 (file)
@@ -1959,11 +1959,9 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
        nested_svm_vmexit(svm);
 }
 
-static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
-       int r;
-
-       r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+       kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
 
        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
@@ -1971,8 +1969,6 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
        vcpu->arch.mmu.shadow_root_level = get_npt_level();
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
-
-       return r;
 }
 
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
index 2b2fce1..b2fe1c2 100644 (file)
@@ -1498,7 +1498,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                        break;
 
        if (i == NR_AUTOLOAD_MSRS) {
-               printk_once(KERN_WARNING"Not enough mst switch entries. "
+               printk_once(KERN_WARNING "Not enough msr switch entries. "
                                "Can't add msr %x\n", msr);
                return;
        } else if (i == m->nr) {
@@ -1898,16 +1898,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
- * This function assumes it is called with the exit reason in vmcs02 being
- * a #PF exception (this is the only case in which KVM injects a #PF when L2
- * is running).
  */
-static int nested_pf_handled(struct kvm_vcpu *vcpu)
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-       if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
+       if (!(vmcs12->exception_bitmap & (1u << nr)))
                return 0;
 
        nested_vmx_vmexit(vcpu);
@@ -1921,8 +1917,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-       if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
-           !vmx->nested.nested_run_pending && nested_pf_handled(vcpu))
+       if (!reinject && is_guest_mode(vcpu) &&
+           nested_vmx_check_exception(vcpu, nr))
                return;
 
        if (has_error_code) {
@@ -2204,9 +2200,15 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+               VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+       if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
+           !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
+               nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+               nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       }
        nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
-                                     VM_EXIT_LOAD_IA32_EFER);
+               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
 
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2226,7 +2228,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
        nested_vmx_procbased_ctls_low = 0;
        nested_vmx_procbased_ctls_high &=
-               CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+               CPU_BASED_VIRTUAL_INTR_PENDING |
+               CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
                CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
                CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
                CPU_BASED_CR3_STORE_EXITING |
@@ -2252,13 +2255,15 @@ static __init void nested_vmx_setup_ctls_msrs(void)
        nested_vmx_secondary_ctls_low = 0;
        nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               SECONDARY_EXEC_UNRESTRICTED_GUEST |
                SECONDARY_EXEC_WBINVD_EXITING;
 
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
                nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
                nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
-                        VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+                        VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
+                        VMX_EPT_INVEPT_BIT;
                nested_vmx_ept_caps &= vmx_capability.ept;
                /*
                 * Since invept is completely emulated we support both global
@@ -3380,8 +3385,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (enable_ept) {
                eptp = construct_eptp(cr3);
                vmcs_write64(EPT_POINTER, eptp);
-               guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
-                       vcpu->kvm->arch.ept_identity_map_addr;
+               if (is_paging(vcpu) || is_guest_mode(vcpu))
+                       guest_cr3 = kvm_read_cr3(vcpu);
+               else
+                       guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
                ept_load_pdptrs(vcpu);
        }
 
@@ -4879,6 +4886,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xc1;
 }
 
+static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
+{
+       unsigned long always_on = VMXON_CR0_ALWAYSON;
+
+       if (nested_vmx_secondary_ctls_high &
+               SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+           nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+               always_on &= ~(X86_CR0_PE | X86_CR0_PG);
+       return (val & always_on) == always_on;
+}
+
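
nested_cr0_valid() relaxes the CR0 always-on mask when both KVM and the L1 hypervisor advertise "unrestricted guest": PE and PG may then legitimately be clear in L2, which is exactly what the removed TODO below was waiting for. The mask logic in isolation, with the CR0 bit values assumed to match the architectural definitions:

#include <stdbool.h>
#include <stdio.h>

#define X86_CR0_PE 0x00000001UL
#define X86_CR0_NE 0x00000020UL
#define X86_CR0_PG 0x80000000UL
#define VMXON_CR0_ALWAYSON (X86_CR0_NE | X86_CR0_PG | X86_CR0_PE)

static bool nested_cr0_valid(bool unrestricted_guest, unsigned long val)
{
	unsigned long always_on = VMXON_CR0_ALWAYSON;

	/* With unrestricted guest, L2 may run unpaged real mode. */
	if (unrestricted_guest)
		always_on &= ~(X86_CR0_PE | X86_CR0_PG);
	return (val & always_on) == always_on;
}

int main(void)
{
	unsigned long realmode_cr0 = X86_CR0_NE;	/* PE=PG=0 */

	printf("strict: %d  unrestricted: %d\n",
	       nested_cr0_valid(false, realmode_cr0),	/* 0 */
	       nested_cr0_valid(true, realmode_cr0));	/* 1 */
	return 0;
}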
 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
@@ -4897,9 +4915,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
                val = (val & ~vmcs12->cr0_guest_host_mask) |
                        (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
 
-               /* TODO: will have to take unrestricted guest mode into
-                * account */
-               if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
+               if (!nested_cr0_valid(vmcs12, val))
                        return 1;
 
                if (kvm_set_cr0(vcpu, val))
@@ -6627,6 +6643,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                        return 0;
                else if (is_page_fault(intr_info))
                        return enable_ept;
+               else if (is_no_device(intr_info) &&
+                        !(nested_read_cr0(vmcs12) & X86_CR0_TS))
+                       return 0;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -6722,6 +6741,27 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
+static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       u64 delta_tsc_l1;
+       u32 preempt_val_l1, preempt_val_l2, preempt_scale;
+
+       if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
+                       PIN_BASED_VMX_PREEMPTION_TIMER))
+               return;
+       preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
+                       MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
+       preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+       delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
+               - vcpu->arch.last_guest_tsc;
+       preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
+       if (preempt_val_l2 <= preempt_val_l1)
+               preempt_val_l2 = 0;
+       else
+               preempt_val_l2 -= preempt_val_l1;
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
+}
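
nested_adjust_preemption_timer() charges the time spent in L0 against L2's VMX preemption timer: the L1 TSC delta is converted into timer ticks via the scale from MSR_IA32_VMX_MISC (the timer counts down once per 2^scale TSC cycles), then subtracted, clamping at zero. The arithmetic alone, with an illustrative scale:

#include <stdint.h>
#include <stdio.h>

static uint32_t adjust_preemption_timer(uint32_t timer_l2,
					uint64_t tsc_now, uint64_t tsc_last,
					unsigned scale)
{
	uint64_t delta_tsc = tsc_now - tsc_last;
	uint32_t elapsed = (uint32_t)(delta_tsc >> scale);

	/* Clamp: an expired timer fires immediately on the next entry. */
	return timer_l2 <= elapsed ? 0 : timer_l2 - elapsed;
}

int main(void)
{
	/* scale 5: one timer tick per 32 TSC cycles; 6400 cycles spent
	 * in L0 burn 200 ticks off L2's timer. */
	printf("%u\n", adjust_preemption_timer(1000, 106400, 100000, 5));
	/* prints 800 */
	return 0;
}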
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -6736,20 +6776,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
 
-       /*
-        * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
-        * we did not inject a still-pending event to L1 now because of
-        * nested_run_pending, we need to re-enable this bit.
-        */
-       if (vmx->nested.nested_run_pending)
-               kvm_make_request(KVM_REQ_EVENT, vcpu);
-
-       if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
-           exit_reason == EXIT_REASON_VMRESUME))
-               vmx->nested.nested_run_pending = 1;
-       else
-               vmx->nested.nested_run_pending = 0;
-
        if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
                nested_vmx_vmexit(vcpu);
                return 1;
@@ -7061,9 +7087,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
        case INTR_TYPE_HARD_EXCEPTION:
                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
                        u32 err = vmcs_read32(error_code_field);
-                       kvm_queue_exception_e(vcpu, vector, err);
+                       kvm_requeue_exception_e(vcpu, vector, err);
                } else
-                       kvm_queue_exception(vcpu, vector);
+                       kvm_requeue_exception(vcpu, vector);
                break;
        case INTR_TYPE_SOFT_INTR:
                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
@@ -7146,6 +7172,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
+       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
+               nested_adjust_preemption_timer(vcpu);
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -7284,6 +7312,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
        trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
 
+       /*
+        * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+        * we did not inject a still-pending event to L1 now because of
+        * nested_run_pending, we need to re-enable this bit.
+        */
+       if (vmx->nested.nested_run_pending)
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+       vmx->nested.nested_run_pending = 0;
+
        vmx_complete_atomic_exit(vmx);
        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);
@@ -7410,8 +7448,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
         */
        if (is_mmio)
                ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
-       else if (vcpu->kvm->arch.iommu_domain &&
-               !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
+       else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))
                ret = kvm_get_guest_memory_type(vcpu, gfn) <<
                      VMX_EPT_MT_EPTE_SHIFT;
        else
@@ -7501,9 +7538,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
        return get_vmcs12(vcpu)->ept_pointer;
 }
 
-static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
-       int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+       kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
                        nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
 
        vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
@@ -7511,8 +7548,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
 
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
-
-       return r;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -7520,6 +7555,20 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 }
 
+static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+               struct x86_exception *fault)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       WARN_ON(!is_guest_mode(vcpu));
+
+       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
+       if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
+               nested_vmx_vmexit(vcpu);
+       else
+               kvm_inject_page_fault(vcpu, fault);
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7533,6 +7582,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control;
+       u32 exit_control;
 
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7706,7 +7756,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits are further modified by vmx_set_efer() below.
         */
-       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+       exit_control = vmcs_config.vmexit_ctrl;
+       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+               exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+       vmcs_write32(VM_EXIT_CONTROLS, exit_control);
 
        /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
         * emulated by vmx_set_efer(), below.
@@ -7773,6 +7826,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        kvm_set_cr3(vcpu, vmcs12->guest_cr3);
        kvm_mmu_reset_context(vcpu);
 
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
+
        /*
         * L1 may access the L2's PDPTR, so save them to construct vmcs12
         */
@@ -7876,7 +7932,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return 1;
        }
 
-       if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+       if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
            ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
                nested_vmx_entry_failure(vcpu, vmcs12,
                        EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -7938,6 +7994,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
        enter_guest_mode(vcpu);
 
+       vmx->nested.nested_run_pending = 1;
+
        vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
 
        cpu = get_cpu();
@@ -8005,7 +8063,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
        u32 idt_vectoring;
        unsigned int nr;
 
-       if (vcpu->arch.exception.pending) {
+       if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {
                nr = vcpu->arch.exception.nr;
                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
 
@@ -8023,7 +8081,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
                }
 
                vmcs12->idt_vectoring_info_field = idt_vectoring;
-       } else if (vcpu->arch.nmi_pending) {
+       } else if (vcpu->arch.nmi_injected) {
                vmcs12->idt_vectoring_info_field =
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
        } else if (vcpu->arch.interrupt.pending) {
@@ -8105,6 +8163,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_pending_dbg_exceptions =
                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+       if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
+           (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+               vmcs12->vmx_preemption_timer_value =
+                       vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+
        /*
         * In some cases (usually, nested EPT), L2 is allowed to change its
         * own CR3 without exiting. If it has changed it, we must keep it.
@@ -8130,6 +8193,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
                vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
+               vmcs12->guest_ia32_efer = vcpu->arch.efer;
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
@@ -8201,7 +8266,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         * fpu_active (which may have changed).
         * Note that vmx_set_cr0 refers to efer set above.
         */
-       kvm_set_cr0(vcpu, vmcs12->host_cr0);
+       vmx_set_cr0(vcpu, vmcs12->host_cr0);
        /*
         * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
         * to apply the same changes to L1's vmcs. We just set cr0 correctly,
@@ -8224,6 +8289,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        kvm_set_cr3(vcpu, vmcs12->host_cr3);
        kvm_mmu_reset_context(vcpu);
 
+       if (!enable_ept)
+               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
        if (enable_vpid) {
                /*
                 * Trivially support vpid by letting L2s share their parent
index e5ca72a..21ef1ba 100644 (file)
@@ -577,6 +577,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
        u64 xcr0;
+       u64 valid_bits;
 
        /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
        if (index != XCR_XFEATURE_ENABLED_MASK)
@@ -586,8 +587,16 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
                return 1;
        if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
                return 1;
-       if (xcr0 & ~host_xcr0)
+
+       /*
+        * Do not allow the guest to set bits that we do not support
+        * saving.  However, xcr0 bit 0 is always set, even if the
+        * emulated CPU does not support XSAVE (see fx_init).
+        */
+       valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
+       if (xcr0 & ~valid_bits)
                return 1;
+
        kvm_put_guest_xcr0(vcpu);
        vcpu->arch.xcr0 = xcr0;
        return 0;
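
The xcr0 validation above is plain mask arithmetic: accept only bits the guest's CPUID advertises, plus bit 0, which is architecturally always set. A minimal standalone sketch of the same policy, with hypothetical XSTATE_* constants standing in for the kernel's definitions:

    #include <stdint.h>

    /* Hypothetical stand-ins for the kernel's xstate feature bits. */
    #define XSTATE_FP  (1ULL << 0)
    #define XSTATE_SSE (1ULL << 1)
    #define XSTATE_YMM (1ULL << 2)

    /* Mirrors the checks in __kvm_set_xcr: bit 0 must be set, YMM
     * requires SSE, and nothing outside the supported set may be set. */
    static int xcr0_is_valid(uint64_t xcr0, uint64_t guest_supported_xcr0)
    {
            uint64_t valid_bits = guest_supported_xcr0 | XSTATE_FP;

            if (!(xcr0 & XSTATE_FP))
                    return 0;
            if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
                    return 0;
            return (xcr0 & ~valid_bits) == 0;
    }
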
@@ -684,7 +693,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
        vcpu->arch.cr3 = cr3;
        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-       vcpu->arch.mmu.new_cr3(vcpu);
+       kvm_mmu_new_cr3(vcpu);
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@ -2564,6 +2573,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
        case KVM_CAP_SET_TSS_ADDR:
        case KVM_CAP_EXT_CPUID:
+       case KVM_CAP_EXT_EMUL_CPUID:
        case KVM_CAP_CLOCKSOURCE:
        case KVM_CAP_PIT:
        case KVM_CAP_NOP_IO_DELAY:
@@ -2673,15 +2683,17 @@ long kvm_arch_dev_ioctl(struct file *filp,
                r = 0;
                break;
        }
-       case KVM_GET_SUPPORTED_CPUID: {
+       case KVM_GET_SUPPORTED_CPUID:
+       case KVM_GET_EMULATED_CPUID: {
                struct kvm_cpuid2 __user *cpuid_arg = argp;
                struct kvm_cpuid2 cpuid;
 
                r = -EFAULT;
                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
                        goto out;
-               r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
-                                                     cpuid_arg->entries);
+
+               r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
+                                           ioctl);
                if (r)
                        goto out;
 
@@ -2715,8 +2727,7 @@ static void wbinvd_ipi(void *garbage)
 
 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
-       return vcpu->kvm->arch.iommu_domain &&
-               !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+       return kvm_arch_has_noncoherent_dma(vcpu->kvm);
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -2984,11 +2995,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
                                         struct kvm_xsave *guest_xsave)
 {
-       if (cpu_has_xsave)
+       if (cpu_has_xsave) {
                memcpy(guest_xsave->region,
                        &vcpu->arch.guest_fpu.state->xsave,
-                       xstate_size);
-       else {
+                       vcpu->arch.guest_xstate_size);
+               *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &=
+                       vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE;
+       } else {
                memcpy(guest_xsave->region,
                        &vcpu->arch.guest_fpu.state->fxsave,
                        sizeof(struct i387_fxsave_struct));
@@ -3003,10 +3016,19 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
        u64 xstate_bv =
                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
 
-       if (cpu_has_xsave)
+       if (cpu_has_xsave) {
+               /*
+                * Here we allow setting states that are not present in
+                * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
+                * with old userspace.
+                */
+               if (xstate_bv & ~KVM_SUPPORTED_XCR0)
+                       return -EINVAL;
+               if (xstate_bv & ~host_xcr0)
+                       return -EINVAL;
                memcpy(&vcpu->arch.guest_fpu.state->xsave,
-                       guest_xsave->region, xstate_size);
-       else {
+                       guest_xsave->region, vcpu->arch.guest_xstate_size);
+       } else {
                if (xstate_bv & ~XSTATE_FPSSE)
                        return -EINVAL;
                memcpy(&vcpu->arch.guest_fpu.state->fxsave,
@@ -3042,9 +3064,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
 
        for (i = 0; i < guest_xcrs->nr_xcrs; i++)
                /* Only support XCR0 currently */
-               if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+               if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
                        r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
-                               guest_xcrs->xcrs[0].value);
+                               guest_xcrs->xcrs[i].value);
                        break;
                }
        if (r)
@@ -4775,8 +4797,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 
 static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
 {
-       memset(&ctxt->twobyte, 0,
-              (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
+       memset(&ctxt->opcode_len, 0,
+              (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
 
        ctxt->fetch.start = 0;
        ctxt->fetch.end = 0;
@@ -5094,8 +5116,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                ctxt->have_exception = false;
                ctxt->perm_ok = false;
 
-               ctxt->only_vendor_specific_insn
-                       = emulation_type & EMULTYPE_TRAP_UD;
+               ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
 
                r = x86_decode_insn(ctxt, insn, insn_len);
 
@@ -5263,7 +5284,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 
        smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        if (vcpu->cpu != freq->cpu)
@@ -5273,7 +5294,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
                                send_ipi = 1;
                }
        }
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
 
        if (freq->old < freq->new && send_ipi) {
                /*
@@ -5426,12 +5447,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
        struct kvm_vcpu *vcpu;
        int i;
 
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                kvm_for_each_vcpu(i, vcpu, kvm)
                        set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
        atomic_set(&kvm_guest_has_master_clock, 0);
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -5945,10 +5966,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        vcpu->mode = IN_GUEST_MODE;
 
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+
        /* We should set ->mode before check ->requests,
         * see the comment in make_all_cpus_request.
         */
-       smp_mb();
+       smp_mb__after_srcu_read_unlock();
 
        local_irq_disable();
 
@@ -5958,12 +5981,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                smp_wmb();
                local_irq_enable();
                preempt_enable();
+               vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
                r = 1;
                goto cancel_injection;
        }
 
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-
        if (req_immediate_exit)
                smp_send_reschedule(vcpu->cpu);
 
@@ -6688,7 +6710,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        if (r)
                return r;
        kvm_vcpu_reset(vcpu);
-       r = kvm_mmu_setup(vcpu);
+       kvm_mmu_setup(vcpu);
        vcpu_put(vcpu);
 
        return r;
@@ -6940,6 +6962,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
        vcpu->arch.ia32_tsc_adjust_msr = 0x0;
        vcpu->arch.pv_time_enabled = false;
+
+       vcpu->arch.guest_supported_xcr0 = 0;
+       vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
        kvm_async_pf_hash_reset(vcpu);
        kvm_pmu_init(vcpu);
 
@@ -6981,6 +7007,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+       atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
@@ -7065,7 +7092,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
 {
        int i;
@@ -7086,7 +7113,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
        }
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+                           unsigned long npages)
 {
        int i;
 
@@ -7283,7 +7311,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
        int r;
 
        if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
-             is_error_page(work->page))
+             work->wakeup_all)
                return;
 
        r = kvm_mmu_reload(vcpu);
@@ -7393,7 +7421,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
        struct x86_exception fault;
 
        trace_kvm_async_pf_ready(work->arch.token, work->gva);
-       if (is_error_page(work->page))
+       if (work->wakeup_all)
                work->arch.token = ~0; /* broadcast wakeup */
        else
                kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
@@ -7420,6 +7448,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
                        kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
+{
+       atomic_inc(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
+
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
+{
+       atomic_dec(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
+
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
+{
+       return atomic_read(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
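
The three exported helpers above are a bare reference count: the device-assignment path is expected to bump it when a non-coherent IOMMU domain is attached and drop it on detach, and kvm_arch_has_noncoherent_dma() then gates the WBINVD emulation and EPT memory-type decisions converted earlier in this diff. A hedged sketch of the intended calling pattern (example_attach/example_detach are illustrative names, not real kernel functions):

    /* Sketch only: bracket a non-coherent attach/detach with the
     * reference count so kvm_arch_has_noncoherent_dma() stays accurate. */
    static int example_attach(struct kvm *kvm, bool coherent)
    {
            if (!coherent)
                    kvm_arch_register_noncoherent_dma(kvm);
            /* ... the actual iommu_attach_device() work ... */
            return 0;
    }

    static void example_detach(struct kvm *kvm, bool coherent)
    {
            /* ... the actual iommu_detach_device() work ... */
            if (!coherent)
                    kvm_arch_unregister_noncoherent_dma(kvm);
    }
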
index e224f7a..587fb9e 100644 (file)
@@ -122,6 +122,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
        gva_t addr, void *val, unsigned int bytes,
        struct x86_exception *exception);
 
+#define KVM_SUPPORTED_XCR0     (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
 extern u64 host_xcr0;
 
 extern struct static_key kvm_no_apic_vcpu;
index dfa537a..a7cccb6 100644 (file)
@@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
        struct page *pte;
 
        pte = alloc_pages(__userpte_alloc_gfp, 0);
-       if (pte)
-               pgtable_page_ctor(pte);
+       if (!pte)
+               return NULL;
+       if (!pgtable_page_ctor(pte)) {
+               __free_page(pte);
+               return NULL;
+       }
        return pte;
 }
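
This allocate/construct/free-on-failure shape recurs across every architecture touched by the split-page-table-lock series, because pgtable_page_ctor() can now fail when the per-page spinlock must be allocated separately. A generic sketch of the pattern each call site converts to (alloc_flags is a placeholder for whatever GFP mask the site already used):

    static struct page *pte_page_alloc(gfp_t alloc_flags)
    {
            struct page *pte = alloc_pages(alloc_flags, 0);

            if (!pte)
                    return NULL;
            if (!pgtable_page_ctor(pte)) { /* may fail: ptlock allocation */
                    __free_page(pte);
                    return NULL;
            }
            return pte;
    }
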
 
@@ -189,8 +193,10 @@ static void free_pmds(pmd_t *pmds[])
        int i;
 
        for(i = 0; i < PREALLOCATED_PMDS; i++)
-               if (pmds[i])
+               if (pmds[i]) {
+                       pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
                        free_page((unsigned long)pmds[i]);
+               }
 }
 
 static int preallocate_pmds(pmd_t *pmds[])
@@ -200,8 +206,13 @@ static int preallocate_pmds(pmd_t *pmds[])
 
        for(i = 0; i < PREALLOCATED_PMDS; i++) {
                pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
-               if (pmd == NULL)
+               if (!pmd)
                        failed = true;
+               if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+                       free_page((unsigned long)pmd);
+                       pmd = NULL;
+                       failed = true;
+               }
                pmds[i] = pmd;
        }
 
index fdc3ba2..ce563be 100644 (file)
@@ -468,8 +468,8 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
  * 3        PCD PWT      UC       UC     UC
  * 4    PAT              WB       WC     WB
  * 5    PAT     PWT      WC       WP     WT
- * 6    PAT PCD          UC-      UC     UC-
- * 7    PAT PCD PWT      UC       UC     UC
+ * 6    PAT PCD          UC-      rsv    UC-
+ * 7    PAT PCD PWT      UC       rsv    UC
  */
 
 void xen_set_pat(u64 pat)
@@ -796,8 +796,8 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 {
        spinlock_t *ptl = NULL;
 
-#if USE_SPLIT_PTLOCKS
-       ptl = __pte_lockptr(page);
+#if USE_SPLIT_PTE_PTLOCKS
+       ptl = ptlock_ptr(page);
        spin_lock_nest_lock(ptl, &mm->page_table_lock);
 #endif
 
@@ -1637,7 +1637,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
 
                        __set_pfn_prot(pfn, PAGE_KERNEL_RO);
 
-                       if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+                       if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
                                __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
 
                        xen_mc_issue(PARAVIRT_LAZY_MMU);
@@ -1671,7 +1671,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
                if (!PageHighMem(page)) {
                        xen_mc_batch();
 
-                       if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+                       if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
                                __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 
                        __set_pfn_prot(pfn, PAGE_KERNEL);
@@ -2328,12 +2328,14 @@ static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
        return success;
 }
 
-int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
-                                unsigned int address_bits)
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+                                unsigned int address_bits,
+                                dma_addr_t *dma_handle)
 {
        unsigned long *in_frames = discontig_frames, out_frame;
        unsigned long  flags;
        int            success;
+       unsigned long vstart = (unsigned long)phys_to_virt(pstart);
 
        /*
         * Currently an auto-translated guest will not perform I/O, nor will
@@ -2368,15 +2370,17 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
 
        spin_unlock_irqrestore(&xen_reservation_lock, flags);
 
+       *dma_handle = virt_to_machine(vstart).maddr;
        return success ? 0 : -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
 
-void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
 {
        unsigned long *out_frames = discontig_frames, in_frame;
        unsigned long  flags;
        int success;
+       unsigned long vstart;
 
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return;
@@ -2384,6 +2388,7 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
        if (unlikely(order > MAX_CONTIG_ORDER))
                return;
 
+       vstart = (unsigned long)phys_to_virt(pstart);
        memset((void *) vstart, 0, PAGE_SIZE << order);
 
        spin_lock_irqsave(&xen_reservation_lock, flags);
index a61c7d5..2ae8699 100644 (file)
@@ -799,10 +799,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
        unsigned topidx, mididx, idx;
 
-       if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+       /* don't track P2M changes in autotranslate guests */
+       if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
                return true;
-       }
+
        if (unlikely(pfn >= MAX_P2M_PFN)) {
                BUG_ON(mfn != INVALID_P2M_ENTRY);
                return true;
index 9695704..0e98e5d 100644 (file)
@@ -75,8 +75,10 @@ void __init pci_xen_swiotlb_init(void)
                xen_swiotlb_init(1, true /* early */);
                dma_ops = &xen_swiotlb_dma_ops;
 
+#ifdef CONFIG_PCI
                /* Make sure ACS will be enabled */
                pci_request_acs();
+#endif
        }
 }
 
@@ -92,8 +94,10 @@ int pci_xen_swiotlb_init_late(void)
                return rc;
 
        dma_ops = &xen_swiotlb_dma_ops;
+#ifdef CONFIG_PCI
        /* Make sure ACS will be enabled */
        pci_request_acs();
+#endif
 
        return 0;
 }
index 09f3059..68c054f 100644 (file)
@@ -556,7 +556,7 @@ void xen_enable_syscall(void)
        }
 #endif /* CONFIG_X86_64 */
 }
-void __cpuinit xen_enable_nmi(void)
+void xen_enable_nmi(void)
 {
 #ifdef CONFIG_X86_64
        if (register_callback(CALLBACKTYPE_nmi, nmi))
index 31d0475..c36b325 100644 (file)
@@ -149,7 +149,7 @@ static int xen_smp_intr_init(unsigned int cpu)
        rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
                                    cpu,
                                    xen_reschedule_interrupt,
-                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                   IRQF_PERCPU|IRQF_NOBALANCING,
                                    resched_name,
                                    NULL);
        if (rc < 0)
@@ -161,7 +161,7 @@ static int xen_smp_intr_init(unsigned int cpu)
        rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
                                    cpu,
                                    xen_call_function_interrupt,
-                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                   IRQF_PERCPU|IRQF_NOBALANCING,
                                    callfunc_name,
                                    NULL);
        if (rc < 0)
@@ -171,7 +171,7 @@ static int xen_smp_intr_init(unsigned int cpu)
 
        debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
        rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
-                                    IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
+                                    IRQF_PERCPU | IRQF_NOBALANCING,
                                     debug_name, NULL);
        if (rc < 0)
                goto fail;
@@ -182,7 +182,7 @@ static int xen_smp_intr_init(unsigned int cpu)
        rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
                                    cpu,
                                    xen_call_function_single_interrupt,
-                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                   IRQF_PERCPU|IRQF_NOBALANCING,
                                    callfunc_name,
                                    NULL);
        if (rc < 0)
@@ -201,7 +201,7 @@ static int xen_smp_intr_init(unsigned int cpu)
        rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
                                    cpu,
                                    xen_irq_work_interrupt,
-                                   IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                   IRQF_PERCPU|IRQF_NOBALANCING,
                                    callfunc_name,
                                    NULL);
        if (rc < 0)
index be6b860..0e36cde 100644 (file)
@@ -234,7 +234,7 @@ void xen_init_lock_cpu(int cpu)
        irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
                                     cpu,
                                     dummy_handler,
-                                    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                    IRQF_PERCPU|IRQF_NOBALANCING,
                                     name,
                                     NULL);
 
index ee36589..12a1ca7 100644 (file)
@@ -443,8 +443,7 @@ void xen_setup_timer(int cpu)
                name = "<timer kasprintf failed>";
 
        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
-                                     IRQF_DISABLED|IRQF_PERCPU|
-                                     IRQF_NOBALANCING|IRQF_TIMER|
+                                     IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
                                      IRQF_FORCE_RESUME,
                                      name, NULL);
 
index cf914c8..d38eb92 100644 (file)
@@ -38,35 +38,46 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
        free_page((unsigned long)pgd);
 }
 
-/* Use a slab cache for the pte pages (see also sparc64 implementation) */
-
-extern struct kmem_cache *pgtable_cache;
-
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
                                         unsigned long address)
 {
-       return kmem_cache_alloc(pgtable_cache, GFP_KERNEL|__GFP_REPEAT);
+       pte_t *ptep;
+       int i;
+
+       ptep = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+       if (!ptep)
+               return NULL;
+       for (i = 0; i < 1024; i++)
+               pte_clear(NULL, 0, ptep + i);
+       return ptep;
 }
 
 static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
                                        unsigned long addr)
 {
+       pte_t *pte;
        struct page *page;
 
-       page = virt_to_page(pte_alloc_one_kernel(mm, addr));
-       pgtable_page_ctor(page);
+       pte = pte_alloc_one_kernel(mm, addr);
+       if (!pte)
+               return NULL;
+       page = virt_to_page(pte);
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
        return page;
 }
 
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-       kmem_cache_free(pgtable_cache, pte);
+       free_page((unsigned long)pte);
 }
 
 static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
 {
        pgtable_page_dtor(pte);
-       kmem_cache_free(pgtable_cache, page_address(pte));
+       __free_page(pte);
 }
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
index 0fdf5d0..2164462 100644 (file)
@@ -220,12 +220,11 @@ extern unsigned long empty_zero_page[1024];
 #ifdef CONFIG_MMU
 extern pgd_t swapper_pg_dir[PAGE_SIZE/sizeof(pgd_t)];
 extern void paging_init(void);
-extern void pgtable_cache_init(void);
 #else
 # define swapper_pg_dir NULL
 static inline void paging_init(void) { }
-static inline void pgtable_cache_init(void) { }
 #endif
+static inline void pgtable_cache_init(void) { }
 
 /*
  * The pmd contains the kernel virtual address of the pte page.
index a107757..c43771c 100644 (file)
@@ -50,23 +50,3 @@ void __init init_mmu(void)
         */
        set_ptevaddr_register(PGTABLE_START);
 }
-
-struct kmem_cache *pgtable_cache __read_mostly;
-
-static void pgd_ctor(void *addr)
-{
-       pte_t *ptep = (pte_t *)addr;
-       int i;
-
-       for (i = 0; i < 1024; i++, ptep++)
-               pte_clear(NULL, 0, ptep);
-
-}
-
-void __init pgtable_cache_init(void)
-{
-       pgtable_cache = kmem_cache_create("pgd",
-                       PAGE_SIZE, PAGE_SIZE,
-                       SLAB_HWCACHE_ALIGN,
-                       pgd_ctor);
-}
index 88d4e86..c661896 100644 (file)
@@ -319,7 +319,7 @@ void __blk_mq_end_io(struct request *rq, int error)
                blk_mq_complete_request(rq, error);
 }
 
-#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#if defined(CONFIG_SMP)
 
 /*
  * Called with interrupts disabled.
@@ -361,7 +361,7 @@ static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
 
        return true;
 }
-#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+#else /* CONFIG_SMP */
 static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
                          struct request *rq, const int error)
 {
index ce4b8bf..57790c1 100644 (file)
@@ -36,7 +36,7 @@ static void blk_done_softirq(struct softirq_action *h)
        }
 }
 
-#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#ifdef CONFIG_SMP
 static void trigger_softirq(void *data)
 {
        struct request *rq = data;
@@ -71,7 +71,7 @@ static int raise_blk_irq(int cpu, struct request *rq)
 
        return 1;
 }
-#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+#else /* CONFIG_SMP */
 static int raise_blk_irq(int cpu, struct request *rq)
 {
        return 1;
index 4f8c4d9..9777952 100644 (file)
@@ -288,7 +288,7 @@ static ssize_t
 queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 {
        ssize_t ret = -EINVAL;
-#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#ifdef CONFIG_SMP
        unsigned long val;
 
        ret = queue_var_store(&val, page, count);
index ac33d5f..966f893 100644 (file)
@@ -434,7 +434,7 @@ int af_alg_wait_for_completion(int err, struct af_alg_completion *completion)
        case -EINPROGRESS:
        case -EBUSY:
                wait_for_completion(&completion->completion);
-               INIT_COMPLETION(completion->completion);
+               reinit_completion(&completion->completion);
                err = completion->err;
                break;
        };
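
Every INIT_COMPLETION() conversion in this and the following hunks is the same mechanical change: the old macro re-armed the completion through a struct lvalue, while the new reinit_completion() helper takes a pointer. A minimal sketch of a reusable completion under the standard <linux/completion.h> API:

    static DECLARE_COMPLETION(op_done);

    static void run_op_again(void)
    {
            /* Old: INIT_COMPLETION(op_done);  New: pass a pointer. */
            reinit_completion(&op_done);
            /* ... start work that eventually calls complete(&op_done) ... */
            wait_for_completion(&op_done);
    }
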
index 25a5934..1ab8258 100644 (file)
@@ -493,7 +493,7 @@ static inline int do_one_ahash_op(struct ahash_request *req, int ret)
                ret = wait_for_completion_interruptible(&tr->completion);
                if (!ret)
                        ret = tr->err;
-               INIT_COMPLETION(tr->completion);
+               reinit_completion(&tr->completion);
        }
        return ret;
 }
@@ -721,7 +721,7 @@ static inline int do_one_acipher_op(struct ablkcipher_request *req, int ret)
                ret = wait_for_completion_interruptible(&tr->completion);
                if (!ret)
                        ret = tr->err;
-               INIT_COMPLETION(tr->completion);
+               reinit_completion(&tr->completion);
        }
 
        return ret;
index e091ef6..432afc0 100644 (file)
@@ -179,7 +179,7 @@ static int do_one_async_hash_op(struct ahash_request *req,
                ret = wait_for_completion_interruptible(&tr->completion);
                if (!ret)
                        ret = tr->err;
-               INIT_COMPLETION(tr->completion);
+               reinit_completion(&tr->completion);
        }
        return ret;
 }
@@ -336,7 +336,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
                                ret = wait_for_completion_interruptible(
                                        &tresult.completion);
                                if (!ret && !(ret = tresult.err)) {
-                                       INIT_COMPLETION(tresult.completion);
+                                       reinit_completion(&tresult.completion);
                                        break;
                                }
                                /* fall through */
@@ -543,7 +543,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
                                ret = wait_for_completion_interruptible(
                                        &result.completion);
                                if (!ret && !(ret = result.err)) {
-                                       INIT_COMPLETION(result.completion);
+                                       reinit_completion(&result.completion);
                                        break;
                                }
                        case -EBADMSG:
@@ -697,7 +697,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
                                ret = wait_for_completion_interruptible(
                                        &result.completion);
                                if (!ret && !(ret = result.err)) {
-                                       INIT_COMPLETION(result.completion);
+                                       reinit_completion(&result.completion);
                                        break;
                                }
                        case -EBADMSG:
@@ -983,7 +983,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
                                ret = wait_for_completion_interruptible(
                                        &result.completion);
                                if (!ret && !((ret = result.err))) {
-                                       INIT_COMPLETION(result.completion);
+                                       reinit_completion(&result.completion);
                                        break;
                                }
                                /* fall through */
@@ -1086,7 +1086,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
                                ret = wait_for_completion_interruptible(
                                        &result.completion);
                                if (!ret && !((ret = result.err))) {
-                                       INIT_COMPLETION(result.completion);
+                                       reinit_completion(&result.completion);
                                        break;
                                }
                                /* fall through */
index 77bbc82..92d7797 100644 (file)
@@ -3017,7 +3017,7 @@ static inline void ata_eh_pull_park_action(struct ata_port *ap)
         * ourselves at the beginning of each pass over the loop.
         *
         * Additionally, all write accesses to &ap->park_req_pending
-        * through INIT_COMPLETION() (see below) or complete_all()
+        * through reinit_completion() (see below) or complete_all()
         * (see ata_scsi_park_store()) are protected by the host lock.
         * As a result we have that park_req_pending.done is zero on
         * exit from this function, i.e. when ATA_EH_PARK actions for
@@ -3031,7 +3031,7 @@ static inline void ata_eh_pull_park_action(struct ata_port *ap)
         */
 
        spin_lock_irqsave(ap->lock, flags);
-       INIT_COMPLETION(ap->park_req_pending);
+       reinit_completion(&ap->park_req_pending);
        ata_for_each_link(link, ap, EDGE) {
                ata_for_each_dev(dev, link, ALL) {
                        struct ata_eh_info *ehi = &link->eh_info;
index ee039af..c12e9b9 100644 (file)
@@ -757,7 +757,7 @@ void dpm_resume(pm_message_t state)
        async_error = 0;
 
        list_for_each_entry(dev, &dpm_suspended_list, power.entry) {
-               INIT_COMPLETION(dev->power.completion);
+               reinit_completion(&dev->power.completion);
                if (is_async(dev)) {
                        get_device(dev);
                        async_schedule(async_resume, dev);
@@ -1237,7 +1237,7 @@ static void async_suspend(void *data, async_cookie_t cookie)
 
 static int device_suspend(struct device *dev)
 {
-       INIT_COMPLETION(dev->power.completion);
+       reinit_completion(&dev->power.completion);
 
        if (pm_async_enabled && dev->power.async_suspend) {
                get_device(dev);
index 4ff85b8..748dea4 100644 (file)
@@ -343,7 +343,7 @@ static int fd_motor_on(int nr)
                unit[nr].motor = 1;
                fd_select(nr);
 
-               INIT_COMPLETION(motor_on_completion);
+               reinit_completion(&motor_on_completion);
                motor_on_timer.data = nr;
                mod_timer(&motor_on_timer, jiffies + HZ/2);
 
index 0c004ac..b35fc4f 100644 (file)
@@ -2808,7 +2808,7 @@ resend_cmd2:
                /* erase the old error information */
                memset(c->err_info, 0, sizeof(ErrorInfo_struct));
                return_status = IO_OK;
-               INIT_COMPLETION(wait);
+               reinit_completion(&wait);
                goto resend_cmd2;
        }
 
@@ -3669,7 +3669,7 @@ static int add_to_scan_list(struct ctlr_info *h)
                }
        }
        if (!found && !h->busy_scanning) {
-               INIT_COMPLETION(h->scan_wait);
+               reinit_completion(&h->scan_wait);
                list_add_tail(&h->scan_list, &scan_q);
                ret = 1;
        }
index 5cdf88b..f3be496 100644 (file)
@@ -292,6 +292,8 @@ static void virtblk_done(struct virtqueue *vq)
                                req_done = true;
                        }
                }
+               if (unlikely(virtqueue_is_broken(vq)))
+                       break;
        } while (!virtqueue_enable_cb(vq));
        /* In case queue is stopped waiting for more buffers. */
        if (req_done)
@@ -456,18 +458,15 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 {
        struct virtio_blk *vblk = bd->bd_disk->private_data;
-       struct virtio_blk_geometry vgeo;
-       int err;
 
        /* see if the host passed in geometry config */
-       err = virtio_config_val(vblk->vdev, VIRTIO_BLK_F_GEOMETRY,
-                               offsetof(struct virtio_blk_config, geometry),
-                               &vgeo);
-
-       if (!err) {
-               geo->heads = vgeo.heads;
-               geo->sectors = vgeo.sectors;
-               geo->cylinders = vgeo.cylinders;
+       if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
+               virtio_cread(vblk->vdev, struct virtio_blk_config,
+                            geometry.cylinders, &geo->cylinders);
+               virtio_cread(vblk->vdev, struct virtio_blk_config,
+                            geometry.heads, &geo->heads);
+               virtio_cread(vblk->vdev, struct virtio_blk_config,
+                            geometry.sectors, &geo->sectors);
        } else {
                /* some standard values, similar to sd */
                geo->heads = 1 << 6;
@@ -529,8 +528,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
                goto done;
 
        /* Host must always specify the capacity. */
-       vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
-                         &capacity, sizeof(capacity));
+       virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
 
        /* If capacity is too big, truncate with warning. */
        if ((sector_t)capacity != capacity) {
@@ -608,9 +606,9 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev)
        u8 writeback;
        int err;
 
-       err = virtio_config_val(vdev, VIRTIO_BLK_F_CONFIG_WCE,
-                               offsetof(struct virtio_blk_config, wce),
-                               &writeback);
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
+                                  struct virtio_blk_config, wce,
+                                  &writeback);
        if (err)
                writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
 
@@ -642,7 +640,6 @@ virtblk_cache_type_store(struct device *dev, struct device_attribute *attr,
        struct virtio_blk *vblk = disk->private_data;
        struct virtio_device *vdev = vblk->vdev;
        int i;
-       u8 writeback;
 
        BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
        for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; )
@@ -652,11 +649,7 @@ virtblk_cache_type_store(struct device *dev, struct device_attribute *attr,
        if (i < 0)
                return -EINVAL;
 
-       writeback = i;
-       vdev->config->set(vdev,
-                         offsetof(struct virtio_blk_config, wce),
-                         &writeback, sizeof(writeback));
-
+       virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
        virtblk_update_cache_mode(vdev);
        return count;
 }
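
The virtio-blk conversions in this file replace open-coded vdev->config->get()/set() offsets and virtio_config_val() with the typed virtio_cread()/virtio_cwrite() accessors, which take the config struct type and field name and infer the access width. A sketch of the read side under the same virtio_blk_config layout (the 512-byte fallback is illustrative, not from this patch):

    static void example_read_config(struct virtio_device *vdev)
    {
            u64 capacity;
            u32 blk_size;

            /* Mandatory field: the host must always supply capacity. */
            virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);

            /* Optional field: returns an error if VIRTIO_BLK_F_BLK_SIZE
             * was not negotiated, so provide a fallback. */
            if (virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
                                     struct virtio_blk_config, blk_size,
                                     &blk_size))
                    blk_size = 512;
    }
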
@@ -699,9 +692,9 @@ static int virtblk_probe(struct virtio_device *vdev)
        index = err;
 
        /* We need to know how many segments before we allocate. */
-       err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX,
-                               offsetof(struct virtio_blk_config, seg_max),
-                               &sg_elems);
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
+                                  struct virtio_blk_config, seg_max,
+                                  &sg_elems);
 
        /* We need at least one SG element, whatever they say. */
        if (err || !sg_elems)
@@ -772,8 +765,7 @@ static int virtblk_probe(struct virtio_device *vdev)
                set_disk_ro(vblk->disk, 1);
 
        /* Host must always specify the capacity. */
-       vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
-                         &cap, sizeof(cap));
+       virtio_cread(vdev, struct virtio_blk_config, capacity, &cap);
 
        /* If capacity is too big, truncate with warning. */
        if ((sector_t)cap != cap) {
@@ -794,46 +786,45 @@ static int virtblk_probe(struct virtio_device *vdev)
 
        /* Host can optionally specify maximum segment size and number of
         * segments. */
-       err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX,
-                               offsetof(struct virtio_blk_config, size_max),
-                               &v);
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
+                                  struct virtio_blk_config, size_max, &v);
        if (!err)
                blk_queue_max_segment_size(q, v);
        else
                blk_queue_max_segment_size(q, -1U);
 
        /* Host can optionally specify the block size of the device */
-       err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE,
-                               offsetof(struct virtio_blk_config, blk_size),
-                               &blk_size);
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
+                                  struct virtio_blk_config, blk_size,
+                                  &blk_size);
        if (!err)
                blk_queue_logical_block_size(q, blk_size);
        else
                blk_size = queue_logical_block_size(q);
 
        /* Use topology information if available */
-       err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
-                       offsetof(struct virtio_blk_config, physical_block_exp),
-                       &physical_block_exp);
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+                                  struct virtio_blk_config, physical_block_exp,
+                                  &physical_block_exp);
        if (!err && physical_block_exp)
                blk_queue_physical_block_size(q,
                                blk_size * (1 << physical_block_exp));
 
-       err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
-                       offsetof(struct virtio_blk_config, alignment_offset),
-                       &alignment_offset);
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+                                  struct virtio_blk_config, alignment_offset,
+                                  &alignment_offset);
        if (!err && alignment_offset)
                blk_queue_alignment_offset(q, blk_size * alignment_offset);
 
-       err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
-                       offsetof(struct virtio_blk_config, min_io_size),
-                       &min_io_size);
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+                                  struct virtio_blk_config, min_io_size,
+                                  &min_io_size);
        if (!err && min_io_size)
                blk_queue_io_min(q, blk_size * min_io_size);
 
-       err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
-                       offsetof(struct virtio_blk_config, opt_io_size),
-                       &opt_io_size);
+       err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+                                  struct virtio_blk_config, opt_io_size,
+                                  &opt_io_size);
        if (!err && opt_io_size)
                blk_queue_io_opt(q, blk_size * opt_io_size);
 
@@ -899,7 +890,7 @@ static void virtblk_remove(struct virtio_device *vdev)
                ida_simple_remove(&vd_index_ida, index);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int virtblk_freeze(struct virtio_device *vdev)
 {
        struct virtio_blk *vblk = vdev->priv;
@@ -959,7 +950,7 @@ static struct virtio_driver virtio_blk = {
        .probe                  = virtblk_probe,
        .remove                 = virtblk_remove,
        .config_changed         = virtblk_config_changed,
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
        .freeze                 = virtblk_freeze,
        .restore                = virtblk_restore,
 #endif
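
The CONFIG_PM → CONFIG_PM_SLEEP switches here and in the other virtio drivers below reflect that .freeze/.restore are only invoked on the suspend/hibernate path, never by runtime PM, so guarding them with the broader CONFIG_PM built dead code. The resulting driver shape, sketched with placeholder callbacks:

    #ifdef CONFIG_PM_SLEEP
    static int example_freeze(struct virtio_device *vdev)
    {
            /* quiesce the device; driver state hangs off vdev->priv */
            return 0;
    }

    static int example_restore(struct virtio_device *vdev)
    {
            /* re-create virtqueues and resume operation */
            return 0;
    }
    #endif

    static struct virtio_driver example_driver = {
            /* ... id_table, probe, remove ... */
    #ifdef CONFIG_PM_SLEEP
            .freeze  = example_freeze,
            .restore = example_restore,
    #endif
    };
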
index d2120ba..73ce739 100644 (file)
@@ -79,7 +79,7 @@ static int timeriomem_rng_data_read(struct hwrng *rng, u32 *data)
        priv->expires = cur + delay;
        priv->present = 0;
 
-       INIT_COMPLETION(priv->completion);
+       reinit_completion(&priv->completion);
        mod_timer(&priv->timer, priv->expires);
 
        return 4;
index ef46a9c..c12398d 100644 (file)
@@ -133,7 +133,7 @@ static void virtrng_remove(struct virtio_device *vdev)
        remove_common(vdev);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int virtrng_freeze(struct virtio_device *vdev)
 {
        remove_common(vdev);
@@ -157,7 +157,7 @@ static struct virtio_driver virtio_rng_driver = {
        .id_table =     id_table,
        .probe =        virtrng_probe,
        .remove =       virtrng_remove,
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
        .freeze =       virtrng_freeze,
        .restore =      virtrng_restore,
 #endif
index b79cf3e..feea87c 100644 (file)
@@ -577,7 +577,8 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
        spin_lock(&portdev->c_ovq_lock);
        if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
                virtqueue_kick(vq);
-               while (!virtqueue_get_buf(vq, &len))
+               while (!virtqueue_get_buf(vq, &len)
+                       && !virtqueue_is_broken(vq))
                        cpu_relax();
        }
        spin_unlock(&portdev->c_ovq_lock);
@@ -650,7 +651,8 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
         * we need to kmalloc a GFP_ATOMIC buffer each time the
         * console driver writes something out.
         */
-       while (!virtqueue_get_buf(out_vq, &len))
+       while (!virtqueue_get_buf(out_vq, &len)
+               && !virtqueue_is_broken(out_vq))
                cpu_relax();
 done:
        spin_unlock_irqrestore(&port->outvq_lock, flags);
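
Both busy-wait loops above gain a virtqueue_is_broken() escape so that a failed or surprise-removed device can no longer pin the CPU in cpu_relax() forever. The hardened polling shape, as a sketch:

    /* Sketch: poll for a used buffer, but give up if the transport
     * reports the queue broken (e.g. after device removal). */
    static void *poll_used_buf(struct virtqueue *vq, unsigned int *len)
    {
            void *buf;

            while (!(buf = virtqueue_get_buf(vq, len))) {
                    if (virtqueue_is_broken(vq))
                            return NULL;
                    cpu_relax();
            }
            return buf;
    }
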
@@ -1837,12 +1839,8 @@ static void config_intr(struct virtio_device *vdev)
                struct port *port;
                u16 rows, cols;
 
-               vdev->config->get(vdev,
-                                 offsetof(struct virtio_console_config, cols),
-                                 &cols, sizeof(u16));
-               vdev->config->get(vdev,
-                                 offsetof(struct virtio_console_config, rows),
-                                 &rows, sizeof(u16));
+               virtio_cread(vdev, struct virtio_console_config, cols, &cols);
+               virtio_cread(vdev, struct virtio_console_config, rows, &rows);
 
                port = find_port_by_id(portdev, 0);
                set_console_size(port, rows, cols);
@@ -2014,10 +2012,9 @@ static int virtcons_probe(struct virtio_device *vdev)
 
        /* Don't test MULTIPORT at all if we're rproc: not a valid feature! */
        if (!is_rproc_serial(vdev) &&
-           virtio_config_val(vdev, VIRTIO_CONSOLE_F_MULTIPORT,
-                                 offsetof(struct virtio_console_config,
-                                          max_nr_ports),
-                                 &portdev->config.max_nr_ports) == 0) {
+           virtio_cread_feature(vdev, VIRTIO_CONSOLE_F_MULTIPORT,
+                                struct virtio_console_config, max_nr_ports,
+                                &portdev->config.max_nr_ports) == 0) {
                multiport = true;
        }
 
@@ -2142,7 +2139,7 @@ static struct virtio_device_id rproc_serial_id_table[] = {
 static unsigned int rproc_serial_features[] = {
 };
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int virtcons_freeze(struct virtio_device *vdev)
 {
        struct ports_device *portdev;
@@ -2220,7 +2217,7 @@ static struct virtio_driver virtio_console = {
        .probe =        virtcons_probe,
        .remove =       virtcons_remove,
        .config_changed = config_intr,
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
        .freeze =       virtcons_freeze,
        .restore =      virtcons_restore,
 #endif
index 2d58da9..fa05e3c 100644 (file)
@@ -268,7 +268,7 @@ static int aes_start_crypt(struct tegra_aes_dev *dd, u32 in_addr, u32 out_addr,
        aes_writel(dd, value, TEGRA_AES_SECURE_INPUT_SELECT);
 
        aes_writel(dd, out_addr, TEGRA_AES_SECURE_DEST_ADDR);
-       INIT_COMPLETION(dd->op_complete);
+       reinit_completion(&dd->op_complete);
 
        for (i = 0; i < AES_HW_MAX_ICQ_LENGTH - 1; i++) {
                do {
index e5af0e3..0e79951 100644 (file)
@@ -477,7 +477,7 @@ void fw_send_phy_config(struct fw_card *card,
        phy_config_packet.header[1] = data;
        phy_config_packet.header[2] = ~data;
        phy_config_packet.generation = generation;
-       INIT_COMPLETION(phy_config_done);
+       reinit_completion(&phy_config_done);
 
        card->driver->send_request(card, &phy_config_packet);
        wait_for_completion_timeout(&phy_config_done, timeout);
index e788882..f9c7fa3 100644 (file)
@@ -34,7 +34,7 @@
  */
 void drm_flip_work_queue(struct drm_flip_work *work, void *val)
 {
-       if (kfifo_put(&work->fifo, (const void **)&val)) {
+       if (kfifo_put(&work->fifo, val)) {
                atomic_inc(&work->pending);
        } else {
                DRM_ERROR("%s fifo full!\n", work->name);
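
kfifo_put() changed from taking a pointer to the element to taking the element by value; this drm_flip_work call and the iio_push_event() conversion later in the diff are the same adjustment. A sketch of the new calling convention for a fifo of pointers:

    static DEFINE_KFIFO(ptr_fifo, void *, 16);

    static bool queue_ptr(void *val)
    {
            /* Old API: kfifo_put(&ptr_fifo, (const void **)&val);
             * New API: pass the element itself; returns 0 if full. */
            return kfifo_put(&ptr_fifo, val) != 0;
    }
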
index 1eb86c7..e281070 100644 (file)
@@ -99,7 +99,7 @@ static int xfer_read(struct i2c_adapter *adap, struct i2c_msg *pmsg)
        i2c_dev->status = I2C_STAT_INIT;
        i2c_dev->msg = pmsg;
        i2c_dev->buf_offset = 0;
-       INIT_COMPLETION(i2c_dev->complete);
+       reinit_completion(&i2c_dev->complete);
 
        /* Enable I2C transaction */
        temp = ((pmsg->len) << 20) | HI2C_EDID_READ | HI2C_ENABLE_TRANSACTION;
index 75db0c4..cfa63b0 100644 (file)
@@ -327,7 +327,7 @@ static inline void wiimote_cmd_acquire_noint(struct wiimote_data *wdata)
 static inline void wiimote_cmd_set(struct wiimote_data *wdata, int cmd,
                                                                __u32 opt)
 {
-       INIT_COMPLETION(wdata->state.ready);
+       reinit_completion(&wdata->state.ready);
        wdata->state.cmd = cmd;
        wdata->state.opt = opt;
 }
index e0d66b9..a183e48 100644 (file)
@@ -66,7 +66,7 @@ static ssize_t jz4740_hwmon_read_adcin(struct device *dev,
 
        mutex_lock(&hwmon->lock);
 
-       INIT_COMPLETION(*completion);
+       reinit_completion(completion);
 
        enable_irq(hwmon->irq);
        hwmon->cell->enable(to_platform_device(dev));
index fd05930..8edba9d 100644 (file)
@@ -371,7 +371,7 @@ static int at91_do_twi_transfer(struct at91_twi_dev *dev)
        dev_dbg(dev->dev, "transfer: %s %d bytes.\n",
                (dev->msg->flags & I2C_M_RD) ? "read" : "write", dev->buf_len);
 
-       INIT_COMPLETION(dev->cmd_complete);
+       reinit_completion(&dev->cmd_complete);
        dev->transfer_status = 0;
 
        if (!dev->buf_len) {
index ea4b08f..d7e8600 100644 (file)
@@ -151,7 +151,7 @@ static int bcm2835_i2c_xfer_msg(struct bcm2835_i2c_dev *i2c_dev,
 
        i2c_dev->msg_buf = msg->buf;
        i2c_dev->msg_buf_remaining = msg->len;
-       INIT_COMPLETION(i2c_dev->completion);
+       reinit_completion(&i2c_dev->completion);
 
        bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_C, BCM2835_I2C_C_CLEAR);
 
index 132369f..960dec6 100644 (file)
@@ -323,7 +323,7 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop)
 
        davinci_i2c_write_reg(dev, DAVINCI_I2C_CNT_REG, dev->buf_len);
 
-       INIT_COMPLETION(dev->cmd_complete);
+       reinit_completion(&dev->cmd_complete);
        dev->cmd_err = 0;
 
        /* Take I2C out of reset and configure it as master */
index 5888fee..e89e3e2 100644 (file)
@@ -613,7 +613,7 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num)
        mutex_lock(&dev->lock);
        pm_runtime_get_sync(dev->dev);
 
-       INIT_COMPLETION(dev->cmd_complete);
+       reinit_completion(&dev->cmd_complete);
        dev->msgs = msgs;
        dev->msgs_num = num;
        dev->cmd_err = 0;
index 1672eff..0043ede 100644 (file)
@@ -541,7 +541,7 @@ static int ismt_access(struct i2c_adapter *adap, u16 addr,
                desc->dptr_high = upper_32_bits(dma_addr);
        }
 
-       INIT_COMPLETION(priv->cmp);
+       reinit_completion(&priv->cmp);
 
        /* Add the descriptor */
        ismt_submit_desc(priv);
index b7c8577..3aedd86 100644 (file)
@@ -505,7 +505,7 @@ static int mxs_i2c_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg,
                                return err;
                }
        } else {
-               INIT_COMPLETION(i2c->cmd_complete);
+               reinit_completion(&i2c->cmd_complete);
                ret = mxs_i2c_dma_setup_xfer(adap, msg, flags);
                if (ret)
                        return ret;
index 9967a6f..a6a891d 100644 (file)
@@ -543,7 +543,7 @@ static int omap_i2c_xfer_msg(struct i2c_adapter *adap,
        w |= OMAP_I2C_BUF_RXFIF_CLR | OMAP_I2C_BUF_TXFIF_CLR;
        omap_i2c_write_reg(dev, OMAP_I2C_BUF_REG, w);
 
-       INIT_COMPLETION(dev->cmd_complete);
+       reinit_completion(&dev->cmd_complete);
        dev->cmd_err = 0;
 
        w = OMAP_I2C_CON_EN | OMAP_I2C_CON_MST | OMAP_I2C_CON_STT;
index c457cb4..e661ede 100644 (file)
@@ -544,7 +544,7 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
        i2c_dev->msg_buf_remaining = msg->len;
        i2c_dev->msg_err = I2C_ERR_NONE;
        i2c_dev->msg_read = (msg->flags & I2C_M_RD);
-       INIT_COMPLETION(i2c_dev->msg_complete);
+       reinit_completion(&i2c_dev->msg_complete);
 
        packet_header = (0 << PACKET_HEADER0_HEADER_SIZE_SHIFT) |
                        PACKET_HEADER0_PROTOCOL_I2C |
index c65da3d..31395fa 100644 (file)
@@ -158,7 +158,7 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg,
                writew(val, i2c_dev->base + REG_CR);
        }
 
-       INIT_COMPLETION(i2c_dev->complete);
+       reinit_completion(&i2c_dev->complete);
 
        if (i2c_dev->mode == I2C_MODE_STANDARD)
                tcr_val = TCR_STANDARD_MODE;
@@ -247,7 +247,7 @@ static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg,
                writew(val, i2c_dev->base + REG_CR);
        }
 
-       INIT_COMPLETION(i2c_dev->complete);
+       reinit_completion(&i2c_dev->complete);
 
        if (i2c_dev->mode == I2C_MODE_STANDARD)
                tcr_val = TCR_STANDARD_MODE;
index e6fbd3e..9a4e0e3 100644 (file)
@@ -188,7 +188,7 @@ static int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta,
 
        spi_bus_lock(sigma_delta->spi->master);
        sigma_delta->bus_locked = true;
-       INIT_COMPLETION(sigma_delta->completion);
+       reinit_completion(&sigma_delta->completion);
 
        ret = ad_sigma_delta_set_mode(sigma_delta, mode);
        if (ret < 0)
@@ -259,7 +259,7 @@ int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev,
 
        spi_bus_lock(sigma_delta->spi->master);
        sigma_delta->bus_locked = true;
-       INIT_COMPLETION(sigma_delta->completion);
+       reinit_completion(&sigma_delta->completion);
 
        ad_sigma_delta_set_mode(sigma_delta, AD_SD_MODE_SINGLE);
 
@@ -343,7 +343,7 @@ static int ad_sd_buffer_postdisable(struct iio_dev *indio_dev)
 {
        struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
 
-       INIT_COMPLETION(sigma_delta->completion);
+       reinit_completion(&sigma_delta->completion);
        wait_for_completion_timeout(&sigma_delta->completion, HZ);
 
        if (!sigma_delta->irq_dis) {
index 54c5bab..e525aa6 100644 (file)
@@ -190,7 +190,7 @@ static int nau7802_read_irq(struct iio_dev *indio_dev,
        struct nau7802_state *st = iio_priv(indio_dev);
        int ret;
 
-       INIT_COMPLETION(st->value_ok);
+       reinit_completion(&st->value_ok);
        enable_irq(st->client->irq);
 
        nau7802_sync(st);
index dac15b9..c10eab6 100644 (file)
@@ -56,7 +56,7 @@ int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp)
                ev.id = ev_code;
                ev.timestamp = timestamp;
 
-               copied = kfifo_put(&ev_int->det_events, &ev);
+               copied = kfifo_put(&ev_int->det_events, ev);
                if (copied != 0)
                        wake_up_locked_poll(&ev_int->wait, POLLIN);
        }
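
This hunk reflects the kfifo API change that landed this cycle: kfifo_put() now takes the element by value instead of by pointer. A minimal sketch of the new calling convention, assuming a small fifo of u32 events:

        #include <linux/kfifo.h>
        #include <linux/printk.h>

        static DEFINE_KFIFO(demo_fifo, u32, 16);        /* 16-element fifo */

        static void demo_push(u32 ev)
        {
                /* kfifo_put() returns 0 when the fifo is full */
                if (!kfifo_put(&demo_fifo, ev))         /* value, no longer &ev */
                        pr_warn("demo: event fifo overrun\n");
        }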
index d53e0b7..4204841 100644 (file)
@@ -242,7 +242,7 @@ static int cyttsp_soft_reset(struct cyttsp *ts)
        int retval;
 
        /* wait for interrupt to set ready completion */
-       INIT_COMPLETION(ts->bl_ready);
+       reinit_completion(&ts->bl_ready);
        ts->state = CY_BL_STATE;
 
        enable_irq(ts->irq);
index c880eba..9fd51e5 100644 (file)
@@ -206,7 +206,7 @@ config SHMOBILE_IPMMU_TLB
 config SHMOBILE_IOMMU
        bool "IOMMU for Renesas IPMMU/IPMMUI"
        default n
-       depends on (ARM && ARCH_SHMOBILE)
+       depends on ARM || COMPILE_TEST
        select IOMMU_API
        select ARM_DMA_USE_IOMMU
        select SHMOBILE_IPMMU
index 14c1f47..5d58bf1 100644 (file)
@@ -1,4 +1,5 @@
 obj-$(CONFIG_IOMMU_API) += iommu.o
+obj-$(CONFIG_IOMMU_API) += iommu-traces.o
 obj-$(CONFIG_OF_IOMMU) += of_iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
 obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
index 181c9ba..1abfb56 100644 (file)
@@ -590,6 +590,9 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
                ret = IRQ_HANDLED;
                resume = RESUME_RETRY;
        } else {
+               dev_err_ratelimited(smmu->dev,
+                   "Unhandled context fault: iova=0x%08lx, fsynr=0x%x, cb=%d\n",
+                   iova, fsynr, root_cfg->cbndx);
                ret = IRQ_NONE;
                resume = RESUME_TERMINATE;
        }
@@ -778,7 +781,7 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain)
 #ifdef __BIG_ENDIAN
        reg |= SCTLR_E;
 #endif
-       writel(reg, cb_base + ARM_SMMU_CB_SCTLR);
+       writel_relaxed(reg, cb_base + ARM_SMMU_CB_SCTLR);
 }
 
 static int arm_smmu_init_domain_context(struct iommu_domain *domain,
@@ -1212,7 +1215,10 @@ static int arm_smmu_alloc_init_pte(struct arm_smmu_device *smmu, pmd_t *pmd,
 
                arm_smmu_flush_pgtable(smmu, page_address(table),
                                       ARM_SMMU_PTE_HWTABLE_SIZE);
-               pgtable_page_ctor(table);
+               if (!pgtable_page_ctor(table)) {
+                       __free_page(table);
+                       return -ENOMEM;
+               }
                pmd_populate(NULL, pmd, table);
                arm_smmu_flush_pgtable(smmu, pmd, sizeof(*pmd));
        }
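
pgtable_page_ctor() can fail as of this cycle (the split page-table lock may need a separate allocation on some configurations), so callers must check its return value and release the page themselves, as the hunk above now does. The pattern, sketched:

        struct page *table = alloc_page(GFP_KERNEL | __GFP_ZERO);

        if (!table)
                return -ENOMEM;
        if (!pgtable_page_ctor(table)) {        /* may fail to allocate the lock */
                __free_page(table);
                return -ENOMEM;
        }
        /* table is now usable; pgtable_page_dtor() must run before freeing */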
@@ -1559,9 +1565,13 @@ static struct iommu_ops arm_smmu_ops = {
 static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
 {
        void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
-       void __iomem *sctlr_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB_SCTLR;
+       void __iomem *cb_base;
        int i = 0;
-       u32 scr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sCR0);
+       u32 reg;
+
+       /* Clear Global FSR */
+       reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR);
+       writel(reg, gr0_base + ARM_SMMU_GR0_sGFSR);
 
        /* Mark all SMRn as invalid and all S2CRn as bypass */
        for (i = 0; i < smmu->num_mapping_groups; ++i) {
@@ -1569,33 +1579,38 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
                writel_relaxed(S2CR_TYPE_BYPASS, gr0_base + ARM_SMMU_GR0_S2CR(i));
        }
 
-       /* Make sure all context banks are disabled */
-       for (i = 0; i < smmu->num_context_banks; ++i)
-               writel_relaxed(0, sctlr_base + ARM_SMMU_CB(smmu, i));
+       /* Make sure all context banks are disabled and clear CB_FSR */
+       for (i = 0; i < smmu->num_context_banks; ++i) {
+               cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, i);
+               writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
+               writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR);
+       }
 
        /* Invalidate the TLB, just in case */
        writel_relaxed(0, gr0_base + ARM_SMMU_GR0_STLBIALL);
        writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLH);
        writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLNSNH);
 
+       reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_sCR0);
+
        /* Enable fault reporting */
-       scr0 |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE);
+       reg |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE);
 
        /* Disable TLB broadcasting. */
-       scr0 |= (sCR0_VMIDPNE | sCR0_PTM);
+       reg |= (sCR0_VMIDPNE | sCR0_PTM);
 
        /* Enable client access, but bypass when no mapping is found */
-       scr0 &= ~(sCR0_CLIENTPD | sCR0_USFCFG);
+       reg &= ~(sCR0_CLIENTPD | sCR0_USFCFG);
 
        /* Disable forced broadcasting */
-       scr0 &= ~sCR0_FB;
+       reg &= ~sCR0_FB;
 
        /* Don't upgrade barriers */
-       scr0 &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT);
+       reg &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT);
 
        /* Push the button */
        arm_smmu_tlb_sync(smmu);
-       writel(scr0, gr0_base + ARM_SMMU_GR0_sCR0);
+       writel_relaxed(reg, gr0_base + ARM_SMMU_GR0_sCR0);
 }
 
 static int arm_smmu_id_size_to_bits(int size)
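
The reworked reset path clears latched fault status with the usual write-one-to-clear idiom (read the register, write the value back), and it moves to the _relaxed MMIO accessors, presumably because the explicit arm_smmu_tlb_sync() already provides the ordering the heavier writel() barrier supplied. A sketch of the W1C idiom, with the register offset assumed purely for illustration:

        #include <linux/io.h>

        #define DEMO_FSR        0x58    /* fault status offset, assumed */

        static void demo_clear_faults(void __iomem *base)
        {
                u32 fsr = readl_relaxed(base + DEMO_FSR);       /* latched bits */
                writel_relaxed(fsr, base + DEMO_FSR);           /* W1C clears them */
        }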
@@ -1700,13 +1715,12 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
        id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID1);
        smmu->pagesize = (id & ID1_PAGESIZE) ? SZ_64K : SZ_4K;
 
-       /* Check that we ioremapped enough */
+       /* Check for size mismatch of SMMU address space from mapped region */
        size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1);
        size *= (smmu->pagesize << 1);
-       if (smmu->size < size)
-               dev_warn(smmu->dev,
-                        "device is 0x%lx bytes but only mapped 0x%lx!\n",
-                        size, smmu->size);
+       if (smmu->size != size)
+               dev_warn(smmu->dev, "SMMU address space size (0x%lx) differs "
+                       "from mapped region size (0x%lx)!\n", size, smmu->size);
 
        smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) &
                                      ID1_NUMS2CB_MASK;
@@ -1781,15 +1795,10 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev)
        smmu->dev = dev;
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (!res) {
-               dev_err(dev, "missing base address/size\n");
-               return -ENODEV;
-       }
-
+       smmu->base = devm_ioremap_resource(dev, res);
+       if (IS_ERR(smmu->base))
+               return PTR_ERR(smmu->base);
        smmu->size = resource_size(res);
-       smmu->base = devm_request_and_ioremap(dev, res);
-       if (!smmu->base)
-               return -EADDRNOTAVAIL;
 
        if (of_property_read_u32(dev->of_node, "#global-interrupts",
                                 &smmu->num_global_irqs)) {
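
devm_ioremap_resource() replaces the deprecated devm_request_and_ioremap() here; it validates the resource itself (a NULL res comes back as ERR_PTR(-EINVAL)), requests the region, maps it, and logs its own error, which is why the explicit !res check could be dropped. The typical shape, sketched:

        #include <linux/platform_device.h>
        #include <linux/io.h>
        #include <linux/err.h>

        static int demo_probe(struct platform_device *pdev)
        {
                struct resource *res;
                void __iomem *base;

                res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
                base = devm_ioremap_resource(&pdev->dev, res);  /* NULL res handled */
                if (IS_ERR(base))
                        return PTR_ERR(base);
                return 0;
        }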
@@ -1804,12 +1813,11 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev)
                        smmu->num_context_irqs++;
        }
 
-       if (num_irqs < smmu->num_global_irqs) {
-               dev_warn(dev, "found %d interrupts but expected at least %d\n",
-                        num_irqs, smmu->num_global_irqs);
-               smmu->num_global_irqs = num_irqs;
+       if (!smmu->num_context_irqs) {
+               dev_err(dev, "found %d interrupts but expected at least %d\n",
+                       num_irqs, smmu->num_global_irqs + 1);
+               return -ENODEV;
        }
-       smmu->num_context_irqs = num_irqs - smmu->num_global_irqs;
 
        smmu->irqs = devm_kzalloc(dev, sizeof(*smmu->irqs) * num_irqs,
                                  GFP_KERNEL);
@@ -1933,7 +1941,7 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
                free_irq(smmu->irqs[i], smmu);
 
        /* Turn the thing off */
-       writel(sCR0_CLIENTPD, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_sCR0);
+       writel_relaxed(sCR0_CLIENTPD, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_sCR0);
        return 0;
 }
 
@@ -1981,7 +1989,7 @@ static void __exit arm_smmu_exit(void)
        return platform_driver_unregister(&arm_smmu_driver);
 }
 
-module_init(arm_smmu_init);
+subsys_initcall(arm_smmu_init);
 module_exit(arm_smmu_exit);
 
 MODULE_DESCRIPTION("IOMMU API for ARM architected SMMU implementations");
index 9009469..8b452c9 100644 (file)
@@ -403,7 +403,7 @@ dmar_find_matched_drhd_unit(struct pci_dev *dev)
 
        dev = pci_physfn(dev);
 
-       list_for_each_entry(dmaru, &dmar_drhd_units, list) {
+       for_each_drhd_unit(dmaru) {
                drhd = container_of(dmaru->hdr,
                                    struct acpi_dmar_hardware_unit,
                                    header);
index 15e9b57..43b9bfe 100644 (file)
@@ -782,7 +782,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
        int offset;
 
        BUG_ON(!domain->pgd);
-       BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
+
+       if (addr_width < BITS_PER_LONG && pfn >> addr_width)
+               /* Address beyond IOMMU's addressing capabilities. */
+               return NULL;
+
        parent = domain->pgd;
 
        while (level > 0) {
@@ -3777,11 +3781,10 @@ static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
 static void domain_remove_one_dev_info(struct dmar_domain *domain,
                                          struct pci_dev *pdev)
 {
-       struct device_domain_info *info;
+       struct device_domain_info *info, *tmp;
        struct intel_iommu *iommu;
        unsigned long flags;
        int found = 0;
-       struct list_head *entry, *tmp;
 
        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
                                pdev->devfn);
@@ -3789,8 +3792,7 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
                return;
 
        spin_lock_irqsave(&device_domain_lock, flags);
-       list_for_each_safe(entry, tmp, &domain->devices) {
-               info = list_entry(entry, struct device_domain_info, link);
+       list_for_each_entry_safe(info, tmp, &domain->devices, link) {
                if (info->segment == pci_domain_nr(pdev->bus) &&
                    info->bus == pdev->bus->number &&
                    info->devfn == pdev->devfn) {
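
The conversion above is mechanical: list_for_each_entry_safe() folds the open-coded list_for_each_safe() plus list_entry() into one macro, caching the next node so the current entry may be unlinked inside the loop. Sketched:

        struct device_domain_info *info, *tmp;

        list_for_each_entry_safe(info, tmp, &domain->devices, link) {
                /* safe: tmp already points at the next node, so this
                 * entry can be deleted and freed inside the body */
                list_del(&info->link);
        }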
index ab86902..bab10b1 100644 (file)
@@ -525,12 +525,13 @@ static int __init intel_irq_remapping_supported(void)
        if (disable_irq_remap)
                return 0;
        if (irq_remap_broken) {
-               WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND,
-                          "This system BIOS has enabled interrupt remapping\n"
-                          "on a chipset that contains an erratum making that\n"
-                          "feature unstable.  To maintain system stability\n"
-                          "interrupt remapping is being disabled.  Please\n"
-                          "contact your BIOS vendor for an update\n");
+               printk(KERN_WARNING
+                       "This system BIOS has enabled interrupt remapping\n"
+                       "on a chipset that contains an erratum making that\n"
+                       "feature unstable.  To maintain system stability\n"
+                       "interrupt remapping is being disabled.  Please\n"
+                       "contact your BIOS vendor for an update\n");
+               add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
                disable_irq_remap = 1;
                return 0;
        }
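
WARN_TAINT() prints a full stack backtrace, which makes a known firmware erratum look like a kernel bug in the logs; the replacement prints the message plainly and applies the taint directly. LOCKDEP_STILL_OK tells the taint code that lockdep state remains trustworthy. The idiom, in brief:

        pr_warn("disabling feature due to a firmware erratum\n");
        add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);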
diff --git a/drivers/iommu/iommu-traces.c b/drivers/iommu/iommu-traces.c
new file mode 100644 (file)
index 0000000..bf3b317
--- /dev/null
@@ -0,0 +1,27 @@
+/*
+ * iommu trace points
+ *
+ * Copyright (C) 2013 Shuah Khan <shuah.kh@samsung.com>
+ *
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/iommu.h>
+
+/* iommu_group_event */
+EXPORT_TRACEPOINT_SYMBOL_GPL(add_device_to_group);
+EXPORT_TRACEPOINT_SYMBOL_GPL(remove_device_from_group);
+
+/* iommu_device_event */
+EXPORT_TRACEPOINT_SYMBOL_GPL(attach_device_to_domain);
+EXPORT_TRACEPOINT_SYMBOL_GPL(detach_device_from_domain);
+
+/* iommu_map_unmap */
+EXPORT_TRACEPOINT_SYMBOL_GPL(map);
+EXPORT_TRACEPOINT_SYMBOL_GPL(unmap);
+
+/* iommu_error */
+EXPORT_TRACEPOINT_SYMBOL_GPL(io_page_fault);
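
Defining CREATE_TRACE_POINTS before including the trace header instantiates the tracepoints declared in <trace/events/iommu.h>, and the EXPORT_TRACEPOINT_SYMBOL_GPL() lines allow modular IOMMU drivers to fire them. A call site (the hooks added to iommu.c below) is nearly free until the event is enabled:

        #include <trace/events/iommu.h>

        /* compiles to a patched-out no-op while the event is disabled */
        trace_map(iova, paddr, size);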
index fbe9ca7..e5555fc 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/idr.h>
 #include <linux/notifier.h>
 #include <linux/err.h>
+#include <trace/events/iommu.h>
 
 static struct kset *iommu_group_kset;
 static struct ida iommu_group_ida;
@@ -363,6 +364,8 @@ rename:
        /* Notify any listeners about change to group. */
        blocking_notifier_call_chain(&group->notifier,
                                     IOMMU_GROUP_NOTIFY_ADD_DEVICE, dev);
+
+       trace_add_device_to_group(group->id, dev);
        return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_group_add_device);
@@ -399,6 +402,8 @@ void iommu_group_remove_device(struct device *dev)
        sysfs_remove_link(group->devices_kobj, device->name);
        sysfs_remove_link(&dev->kobj, "iommu_group");
 
+       trace_remove_device_from_group(group->id, dev);
+
        kfree(device->name);
        kfree(device);
        dev->iommu_group = NULL;
@@ -680,10 +685,14 @@ EXPORT_SYMBOL_GPL(iommu_domain_free);
 
 int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
 {
+       int ret;
        if (unlikely(domain->ops->attach_dev == NULL))
                return -ENODEV;
 
-       return domain->ops->attach_dev(domain, dev);
+       ret = domain->ops->attach_dev(domain, dev);
+       if (!ret)
+               trace_attach_device_to_domain(dev);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device);
 
@@ -693,6 +702,7 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
                return;
 
        domain->ops->detach_dev(domain, dev);
+       trace_detach_device_from_domain(dev);
 }
 EXPORT_SYMBOL_GPL(iommu_detach_device);
 
@@ -807,17 +817,17 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
         * size of the smallest page supported by the hardware
         */
        if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) {
-               pr_err("unaligned: iova 0x%lx pa 0x%pa size 0x%zx min_pagesz 0x%x\n",
+               pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n",
                       iova, &paddr, size, min_pagesz);
                return -EINVAL;
        }
 
-       pr_debug("map: iova 0x%lx pa 0x%pa size 0x%zx\n", iova, &paddr, size);
+       pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size);
 
        while (size) {
                size_t pgsize = iommu_pgsize(domain, iova | paddr, size);
 
-               pr_debug("mapping: iova 0x%lx pa 0x%pa pgsize 0x%zx\n",
+               pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n",
                         iova, &paddr, pgsize);
 
                ret = domain->ops->map(domain, iova, paddr, pgsize, prot);
@@ -832,6 +842,8 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
        /* unroll mapping in case something went wrong */
        if (ret)
                iommu_unmap(domain, orig_iova, orig_size - size);
+       else
+               trace_map(iova, paddr, size);
 
        return ret;
 }
@@ -880,6 +892,7 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
                unmapped += unmapped_page;
        }
 
+       trace_unmap(iova, 0, size);
        return unmapped;
 }
 EXPORT_SYMBOL_GPL(iommu_unmap);
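
The printk changes above drop the literal "0x" in front of %pa: the %pa specifier already prints a 0x prefix, and it takes a pointer to a phys_addr_t, so the output stays correct whether the type is 32 or 64 bits wide (e.g. ARM LPAE), unlike a hard-coded %08x. Sketched:

        phys_addr_t pa = 0x80000000;

        /* note the &pa: %pa dereferences a pointer and prefixes 0x itself */
        pr_info("page at %pa\n", &pa);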
index 108c0e9..dba1a9f 100644 (file)
@@ -252,7 +252,7 @@ static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
        spin_lock_irqsave(&gart->pte_lock, flags);
        pfn = __phys_to_pfn(pa);
        if (!pfn_valid(pfn)) {
-               dev_err(gart->dev, "Invalid page: %08x\n", pa);
+               dev_err(gart->dev, "Invalid page: %pa\n", &pa);
                spin_unlock_irqrestore(&gart->pte_lock, flags);
                return -EINVAL;
        }
@@ -295,8 +295,8 @@ static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain,
 
        pa = (pte & GART_PAGE_MASK);
        if (!pfn_valid(__phys_to_pfn(pa))) {
-               dev_err(gart->dev, "No entry for %08llx:%08x\n",
-                        (unsigned long long)iova, pa);
+               dev_err(gart->dev, "No entry for %08llx:%pa\n",
+                        (unsigned long long)iova, &pa);
                gart_dump_table(gart);
                return -EINVAL;
        }
@@ -351,7 +351,6 @@ static int tegra_gart_probe(struct platform_device *pdev)
        struct gart_device *gart;
        struct resource *res, *res_remap;
        void __iomem *gart_regs;
-       int err;
        struct device *dev = &pdev->dev;
 
        if (gart_handle)
@@ -376,8 +375,7 @@ static int tegra_gart_probe(struct platform_device *pdev)
        gart_regs = devm_ioremap(dev, res->start, resource_size(res));
        if (!gart_regs) {
                dev_err(dev, "failed to remap GART registers\n");
-               err = -ENXIO;
-               goto fail;
+               return -ENXIO;
        }
 
        gart->dev = &pdev->dev;
@@ -391,8 +389,7 @@ static int tegra_gart_probe(struct platform_device *pdev)
        gart->savedata = vmalloc(sizeof(u32) * gart->page_count);
        if (!gart->savedata) {
                dev_err(dev, "failed to allocate context save area\n");
-               err = -ENOMEM;
-               goto fail;
+               return -ENOMEM;
        }
 
        platform_set_drvdata(pdev, gart);
@@ -401,32 +398,20 @@ static int tegra_gart_probe(struct platform_device *pdev)
        gart_handle = gart;
        bus_set_iommu(&platform_bus_type, &gart_iommu_ops);
        return 0;
-
-fail:
-       if (gart_regs)
-               devm_iounmap(dev, gart_regs);
-       if (gart && gart->savedata)
-               vfree(gart->savedata);
-       devm_kfree(dev, gart);
-       return err;
 }
 
 static int tegra_gart_remove(struct platform_device *pdev)
 {
        struct gart_device *gart = platform_get_drvdata(pdev);
-       struct device *dev = gart->dev;
 
        writel(0, gart->regs + GART_CONFIG);
        if (gart->savedata)
                vfree(gart->savedata);
-       if (gart->regs)
-               devm_iounmap(dev, gart->regs);
-       devm_kfree(dev, gart);
        gart_handle = NULL;
        return 0;
 }
 
-const struct dev_pm_ops tegra_gart_pm_ops = {
+static const struct dev_pm_ops tegra_gart_pm_ops = {
        .suspend        = tegra_gart_suspend,
        .resume         = tegra_gart_resume,
 };
index e066560..605b5b4 100644 (file)
@@ -731,7 +731,7 @@ static int smmu_iommu_map(struct iommu_domain *domain, unsigned long iova,
        unsigned long pfn = __phys_to_pfn(pa);
        unsigned long flags;
 
-       dev_dbg(as->smmu->dev, "[%d] %08lx:%08x\n", as->asid, iova, pa);
+       dev_dbg(as->smmu->dev, "[%d] %08lx:%pa\n", as->asid, iova, &pa);
 
        if (!pfn_valid(pfn))
                return -ENOMEM;
@@ -1254,7 +1254,7 @@ static int tegra_smmu_remove(struct platform_device *pdev)
        return 0;
 }
 
-const struct dev_pm_ops tegra_smmu_pm_ops = {
+static const struct dev_pm_ops tegra_smmu_pm_ops = {
        .suspend        = tegra_smmu_suspend,
        .resume         = tegra_smmu_resume,
 };
index b3256ff..d0a1d8a 100644 (file)
@@ -229,7 +229,7 @@ struct lguest_vq_info {
  * make a hypercall.  We hand the physical address of the virtqueue so the Host
  * knows which virtqueue we're talking about.
  */
-static void lg_notify(struct virtqueue *vq)
+static bool lg_notify(struct virtqueue *vq)
 {
        /*
         * We store our virtqueue information in the "priv" pointer of the
@@ -238,6 +238,7 @@ static void lg_notify(struct virtqueue *vq)
        struct lguest_vq_info *lvq = vq->priv;
 
        hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0);
+       return true;
 }
 
 /* An extern declaration inside a C file is bad form.  Don't do it. */
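
This is part of the virtio series that changes the virtqueue notify callback to return bool: a transport that detects a dead host returns false, and the core then marks the queue broken instead of kicking it forever. A transport-side sketch (demo_kick() is hypothetical):

        #include <linux/virtio.h>

        static int demo_kick(unsigned int idx)
        {
                return 0;       /* stand-in for the real hypercall/doorbell */
        }

        static bool demo_notify(struct virtqueue *vq)
        {
                /* returning false makes virtqueue_is_broken(vq) true */
                return demo_kick(vq->index) >= 0;
        }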
index 5169239..922a1ac 100644 (file)
@@ -157,7 +157,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
         * stack, then the address of this call.  This stack layout happens to
         * exactly match the stack layout created by an interrupt...
         */
-       asm volatile("pushf; lcall *lguest_entry"
+       asm volatile("pushf; lcall *%4"
                     /*
                      * This is how we tell GCC that %eax ("a") and %ebx ("b")
                      * are changed by this routine.  The "=" means output.
@@ -169,7 +169,9 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
                      * physical address of the Guest's top-level page
                      * directory.
                      */
-                    : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
+                    : "0"(pages),
+                      "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
+                      "m"(lguest_entry)
                     /*
                      * We tell gcc that all these registers could change,
                      * which means we don't have to save and restore them in
index 50ea7ed..81b0fa6 100644 (file)
@@ -950,7 +950,7 @@ static int crypt_convert(struct crypt_config *cc,
                /* async */
                case -EBUSY:
                        wait_for_completion(&ctx->restart);
-                       INIT_COMPLETION(ctx->restart);
+                       reinit_completion(&ctx->restart);
                        /* fall through*/
                case -EINPROGRESS:
                        this_cc->req = NULL;
index f8b9068..7f0e17a 100644 (file)
@@ -293,20 +293,6 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
                do_release_stripe(conf, sh);
 }
 
-static struct llist_node *llist_reverse_order(struct llist_node *head)
-{
-       struct llist_node *new_head = NULL;
-
-       while (head) {
-               struct llist_node *tmp = head;
-               head = head->next;
-               tmp->next = new_head;
-               new_head = tmp;
-       }
-
-       return new_head;
-}
-
 /* should hold conf->device_lock already */
 static int release_stripe_list(struct r5conf *conf)
 {
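
The private helper above is deleted because llist_reverse_order() was promoted to the generic llist library this cycle and is now declared in <linux/llist.h>. raid5 keeps the same usage: drain the lock-free list, then flip it back into submission order:

        struct llist_node *head = llist_del_all(&conf->released_stripes);

        head = llist_reverse_order(head);       /* LIFO list back to FIFO order */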
index 4c11059..2819165 100644 (file)
@@ -422,7 +422,7 @@ static int bcap_start_streaming(struct vb2_queue *vq, unsigned int count)
                return ret;
        }
 
-       INIT_COMPLETION(bcap_dev->comp);
+       reinit_completion(&bcap_dev->comp);
        bcap_dev->stop = false;
        return 0;
 }
index 97c2c18..9cf6731 100644 (file)
@@ -375,7 +375,7 @@ static int wl1273_fm_set_tx_freq(struct wl1273_device *radio, unsigned int freq)
        if (r)
                return r;
 
-       INIT_COMPLETION(radio->busy);
+       reinit_completion(&radio->busy);
 
        /* wait for the FR IRQ */
        r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(2000));
@@ -389,7 +389,7 @@ static int wl1273_fm_set_tx_freq(struct wl1273_device *radio, unsigned int freq)
        if (r)
                return r;
 
-       INIT_COMPLETION(radio->busy);
+       reinit_completion(&radio->busy);
 
        /* wait for the POWER_ENB IRQ */
        r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(1000));
@@ -444,7 +444,7 @@ static int wl1273_fm_set_rx_freq(struct wl1273_device *radio, unsigned int freq)
                goto err;
        }
 
-       INIT_COMPLETION(radio->busy);
+       reinit_completion(&radio->busy);
 
        r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(2000));
        if (!r) {
@@ -805,7 +805,7 @@ static int wl1273_fm_set_seek(struct wl1273_device *radio,
        if (level < SCHAR_MIN || level > SCHAR_MAX)
                return -EINVAL;
 
-       INIT_COMPLETION(radio->busy);
+       reinit_completion(&radio->busy);
        dev_dbg(radio->dev, "%s: BUSY\n", __func__);
 
        r = core->write(core, WL1273_INT_MASK_SET, radio->irq_flags);
@@ -847,7 +847,7 @@ static int wl1273_fm_set_seek(struct wl1273_device *radio,
        if (r)
                goto out;
 
-       INIT_COMPLETION(radio->busy);
+       reinit_completion(&radio->busy);
        dev_dbg(radio->dev, "%s: BUSY\n", __func__);
 
        r = core->write(core, WL1273_TUNER_MODE_SET, TUNER_MODE_AUTO_SEEK);
index 5c57e5b..0bd2500 100644 (file)
@@ -218,7 +218,7 @@ static int si470x_set_chan(struct si470x_device *radio, unsigned short chan)
                goto done;
 
        /* wait till tune operation has completed */
-       INIT_COMPLETION(radio->completion);
+       reinit_completion(&radio->completion);
        retval = wait_for_completion_timeout(&radio->completion,
                        msecs_to_jiffies(tune_timeout));
        if (!retval)
@@ -341,7 +341,7 @@ static int si470x_set_seek(struct si470x_device *radio,
                return retval;
 
        /* wait till tune operation has completed */
-       INIT_COMPLETION(radio->completion);
+       reinit_completion(&radio->completion);
        retval = wait_for_completion_timeout(&radio->completion,
                        msecs_to_jiffies(seek_timeout));
        if (!retval)
index 19632b1..b53626b 100644 (file)
@@ -207,7 +207,7 @@ static int iguanair_send(struct iguanair *ir, unsigned size)
 {
        int rc;
 
-       INIT_COMPLETION(ir->completion);
+       reinit_completion(&ir->completion);
 
        ir->urb_out->transfer_buffer_length = size;
        rc = usb_submit_urb(ir->urb_out, GFP_KERNEL);
index bbf4aea..a0547db 100644 (file)
@@ -253,7 +253,7 @@ void memstick_new_req(struct memstick_host *host)
 {
        if (host->card) {
                host->retries = cmd_retries;
-               INIT_COMPLETION(host->card->mrq_complete);
+               reinit_completion(&host->card->mrq_complete);
                host->request(host);
        }
 }
index 9188ef5..24f2f84 100644 (file)
@@ -401,10 +401,10 @@ again:
                        sizeof(struct ms_status_register)))
                        return 0;
 
-               msb->state = MSB_RP_RECIVE_STATUS_REG;
+               msb->state = MSB_RP_RECEIVE_STATUS_REG;
                return 0;
 
-       case MSB_RP_RECIVE_STATUS_REG:
+       case MSB_RP_RECEIVE_STATUS_REG:
                msb->regs.status = *(struct ms_status_register *)mrq->data;
                msb->state = MSB_RP_SEND_OOB_READ;
                /* fallthrough */
index 96e6375..c75198d 100644 (file)
@@ -223,7 +223,7 @@ enum msb_readpage_states {
        MSB_RP_RECEIVE_INT_REQ_RESULT,
 
        MSB_RP_SEND_READ_STATUS_REG,
-       MSB_RP_RECIVE_STATUS_REG,
+       MSB_RP_RECEIVE_STATUS_REG,
 
        MSB_RP_SEND_OOB_READ,
        MSB_RP_RECEIVE_OOB_READ,
index 1b6e913..31727bf 100644 (file)
@@ -290,7 +290,7 @@ static int r592_transfer_fifo_dma(struct r592_device *dev)
        dbg_verbose("doing dma transfer");
 
        dev->dma_error = 0;
-       INIT_COMPLETION(dev->dma_done);
+       reinit_completion(&dev->dma_done);
 
        /* TODO: hidden assumption about nents being always 1 */
        sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ?
index 914cc9b..8aa42e7 100644 (file)
@@ -493,7 +493,7 @@ static int mic_remove_device(struct mic_device_desc __iomem *d,
                        ioread8(&dc->config_change), ioread8(&d->type), mvdev);
 
                status = ioread8(&d->status);
-               INIT_COMPLETION(mvdev->reset_done);
+               reinit_completion(&mvdev->reset_done);
                unregister_virtio_device(&mvdev->vdev);
                mic_free_card_irq(mvdev->virtio_cookie, mvdev);
                if (status & VIRTIO_CONFIG_S_DRIVER_OK)
index b079c65..7558d91 100644 (file)
@@ -38,7 +38,7 @@ static void mic_reset(struct mic_device *mdev)
 
 #define MIC_RESET_TO (45)
 
-       INIT_COMPLETION(mdev->reset_wait);
+       reinit_completion(&mdev->reset_wait);
        mdev->ops->reset_fw_ready(mdev);
        mdev->ops->reset(mdev);
 
index 83907c7..96853a0 100644 (file)
@@ -218,7 +218,7 @@ static long read_local_version(struct kim_data_s *kim_gdata, char *bts_scr_name)
 
        pr_debug("%s", __func__);
 
-       INIT_COMPLETION(kim_gdata->kim_rcvd);
+       reinit_completion(&kim_gdata->kim_rcvd);
        if (4 != st_int_write(kim_gdata->core_data, read_ver_cmd, 4)) {
                pr_err("kim: couldn't write 4 bytes");
                return -EIO;
@@ -229,7 +229,7 @@ static long read_local_version(struct kim_data_s *kim_gdata, char *bts_scr_name)
                pr_err(" waiting for ver info- timed out ");
                return -ETIMEDOUT;
        }
-       INIT_COMPLETION(kim_gdata->kim_rcvd);
+       reinit_completion(&kim_gdata->kim_rcvd);
        /* the positions 12 & 13 in the response buffer provide with the
         * chip, major & minor numbers
         */
@@ -362,7 +362,7 @@ static long download_firmware(struct kim_data_s *kim_gdata)
                        /* reinit completion before sending for the
                         * relevant wait
                         */
-                       INIT_COMPLETION(kim_gdata->kim_rcvd);
+                       reinit_completion(&kim_gdata->kim_rcvd);
 
                        /*
                         * Free space found in uart buffer, call st_int_write
@@ -398,7 +398,7 @@ static long download_firmware(struct kim_data_s *kim_gdata)
                                release_firmware(kim_gdata->fw_entry);
                                return -ETIMEDOUT;
                        }
-                       INIT_COMPLETION(kim_gdata->kim_rcvd);
+                       reinit_completion(&kim_gdata->kim_rcvd);
                        break;
                case ACTION_DELAY:      /* sleep */
                        pr_info("sleep command in scr");
@@ -474,7 +474,7 @@ long st_kim_start(void *kim_data)
                gpio_set_value(kim_gdata->nshutdown, GPIO_HIGH);
                mdelay(100);
                /* re-initialize the completion */
-               INIT_COMPLETION(kim_gdata->ldisc_installed);
+               reinit_completion(&kim_gdata->ldisc_installed);
                /* send notification to UIM */
                kim_gdata->ldisc_install = 1;
                pr_info("ldisc_install = 1");
@@ -525,7 +525,7 @@ long st_kim_stop(void *kim_data)
                kim_gdata->kim_pdev->dev.platform_data;
        struct tty_struct       *tty = kim_gdata->core_data->tty;
 
-       INIT_COMPLETION(kim_gdata->ldisc_installed);
+       reinit_completion(&kim_gdata->ldisc_installed);
 
        if (tty) {      /* can be called before ldisc is installed */
                /* Flush any pending characters in the driver and discipline. */
index 4edea7f..9dfdb06 100644 (file)
@@ -396,7 +396,7 @@ static void wait_op_done(struct mxc_nand_host *host, int useirq)
 
        if (useirq) {
                if (!host->devtype_data->check_int(host)) {
-                       INIT_COMPLETION(host->op_completion);
+                       reinit_completion(&host->op_completion);
                        irq_control(host, 1);
                        wait_for_completion(&host->op_completion);
                }
index 9dcf02d..325930d 100644 (file)
@@ -181,7 +181,7 @@ static void r852_do_dma(struct r852_device *dev, uint8_t *buf, int do_read)
        /* Set dma direction */
        dev->dma_dir = do_read;
        dev->dma_stage = 1;
-       INIT_COMPLETION(dev->dma_done);
+       reinit_completion(&dev->dma_done);
 
        dbg_verbose("doing dma %s ", do_read ? "read" : "write");
 
index 2362909..6547c84 100644 (file)
@@ -159,7 +159,7 @@ static int omap2_onenand_wait(struct mtd_info *mtd, int state)
                                syscfg = read_reg(c, ONENAND_REG_SYS_CFG1);
                }
 
-               INIT_COMPLETION(c->irq_done);
+               reinit_completion(&c->irq_done);
                if (c->gpio_irq) {
                        result = gpio_get_value(c->gpio_irq);
                        if (result == -1) {
@@ -349,7 +349,7 @@ static int omap3_onenand_read_bufferram(struct mtd_info *mtd, int area,
        omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
                                 dma_dst, 0, 0);
 
-       INIT_COMPLETION(c->dma_done);
+       reinit_completion(&c->dma_done);
        omap_start_dma(c->dma_channel);
 
        timeout = jiffies + msecs_to_jiffies(20);
@@ -420,7 +420,7 @@ static int omap3_onenand_write_bufferram(struct mtd_info *mtd, int area,
        omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
                                 dma_dst, 0, 0);
 
-       INIT_COMPLETION(c->dma_done);
+       reinit_completion(&c->dma_done);
        omap_start_dma(c->dma_channel);
 
        timeout = jiffies + msecs_to_jiffies(20);
@@ -499,7 +499,7 @@ static int omap2_onenand_read_bufferram(struct mtd_info *mtd, int area,
        omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
                                 dma_dst, 0, 0);
 
-       INIT_COMPLETION(c->dma_done);
+       reinit_completion(&c->dma_done);
        omap_start_dma(c->dma_channel);
        wait_for_completion(&c->dma_done);
 
@@ -544,7 +544,7 @@ static int omap2_onenand_write_bufferram(struct mtd_info *mtd, int area,
        omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
                                 dma_dst, 0, 0);
 
-       INIT_COMPLETION(c->dma_done);
+       reinit_completion(&c->dma_done);
        omap_start_dma(c->dma_channel);
        wait_for_completion(&c->dma_done);
 
index b9ed128..9856086 100644 (file)
@@ -686,18 +686,19 @@ static int cfv_probe(struct virtio_device *vdev)
                goto err;
 
        /* Get the CAIF configuration from virtio config space, if available */
-#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \
-       ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \
-                          &_var, \
-                          FIELD_SIZEOF(struct virtio_caif_transf_config, _f)))
-
        if (vdev->config->get) {
-               GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom);
-               GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom);
-               GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom);
-               GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom);
-               GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu);
-               GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu);
+               virtio_cread(vdev, struct virtio_caif_transf_config, headroom,
+                            &cfv->tx_hr);
+               virtio_cread(vdev, struct virtio_caif_transf_config, headroom,
+                            &cfv->rx_hr);
+               virtio_cread(vdev, struct virtio_caif_transf_config, tailroom,
+                            &cfv->tx_tr);
+               virtio_cread(vdev, struct virtio_caif_transf_config, tailroom,
+                            &cfv->rx_tr);
+               virtio_cread(vdev, struct virtio_caif_transf_config, mtu,
+                            &cfv->mtu);
+               virtio_cread(vdev, struct virtio_caif_transf_config, mtu,
+                            &cfv->mru);
        } else {
                cfv->tx_hr = CFV_DEF_HEADROOM;
                cfv->rx_hr = CFV_DEF_HEADROOM;
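
virtio_cread() is the new type-checked config-space accessor: it derives the offset and size from the structure and field via offsetof()/sizeof(), so the driver-local GET_VIRTIO_CONFIG_OPS wrapper became redundant. One field, sketched:

        u16 mtu;

        /* expands to vdev->config->get(vdev, offsetof(...), &mtu, sizeof(mtu)) */
        virtio_cread(vdev, struct virtio_caif_transf_config, mtu, &mtu);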
index 09810dd..a01a6a7 100644 (file)
@@ -3537,7 +3537,7 @@ int qlcnic_83xx_resume(struct qlcnic_adapter *adapter)
 
 void qlcnic_83xx_reinit_mbx_work(struct qlcnic_mailbox *mbx)
 {
-       INIT_COMPLETION(mbx->completion);
+       reinit_completion(&mbx->completion);
        set_bit(QLC_83XX_MBX_READY, &mbx->status);
 }
 
index 6f10b49..2cbe1c2 100644 (file)
@@ -561,7 +561,7 @@ at86rf230_xmit(struct ieee802154_dev *dev, struct sk_buff *skb)
 
        spin_lock_irqsave(&lp->lock, flags);
        lp->is_tx = 1;
-       INIT_COMPLETION(lp->tx_complete);
+       reinit_completion(&lp->tx_complete);
        spin_unlock_irqrestore(&lp->lock, flags);
 
        rc = at86rf230_write_fbuf(lp, skb->data, skb->len);
index 0632d34..c6e46d6 100644 (file)
@@ -343,7 +343,7 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
        if (ret)
                goto err;
 
-       INIT_COMPLETION(devrec->tx_complete);
+       reinit_completion(&devrec->tx_complete);
 
        /* Set TXNTRIG bit of TXNCON to send packet */
        ret = read_short_reg(devrec, REG_TXNCON, &val);
index bf7c734..cdc7c90 100644 (file)
@@ -591,7 +591,8 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
        } while (rq->vq->num_free);
        if (unlikely(rq->num > rq->max))
                rq->max = rq->num;
-       virtqueue_kick(rq->vq);
+       if (unlikely(!virtqueue_kick(rq->vq)))
+               return false;
        return !oom;
 }
 
@@ -797,7 +798,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
        err = xmit_skb(sq, skb);
 
        /* This should not happen! */
-       if (unlikely(err)) {
+       if (unlikely(err) || unlikely(!virtqueue_kick(sq->vq))) {
                dev->stats.tx_fifo_errors++;
                if (net_ratelimit())
                        dev_warn(&dev->dev,
@@ -806,7 +807,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
                kfree_skb(skb);
                return NETDEV_TX_OK;
        }
-       virtqueue_kick(sq->vq);
 
        /* Don't wait up for transmitted skbs to be freed. */
        skb_orphan(skb);
@@ -865,12 +865,14 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
        BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC)
               < 0);
 
-       virtqueue_kick(vi->cvq);
+       if (unlikely(!virtqueue_kick(vi->cvq)))
+               return status == VIRTIO_NET_OK;
 
        /* Spin for a response, the kick causes an ioport write, trapping
         * into the hypervisor, so the request should be handled immediately.
         */
-       while (!virtqueue_get_buf(vi->cvq, &tmp))
+       while (!virtqueue_get_buf(vi->cvq, &tmp) &&
+              !virtqueue_is_broken(vi->cvq))
                cpu_relax();
 
        return status == VIRTIO_NET_OK;
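
Two related hardening changes meet here: virtqueue_kick() now returns bool, and virtqueue_is_broken() bounds the busy-wait, so a surprise-removed or wedged device can no longer hang the control path. The general shape of the new loop, sketched:

        if (!virtqueue_kick(vq))
                return -EIO;            /* transport reported a broken device */

        while (!virtqueue_get_buf(vq, &len) && !virtqueue_is_broken(vq))
                cpu_relax();            /* poll for the host's response */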
@@ -898,8 +900,13 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
                        return -EINVAL;
                }
        } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
-               vdev->config->set(vdev, offsetof(struct virtio_net_config, mac),
-                                 addr->sa_data, dev->addr_len);
+               unsigned int i;
+
+               /* Naturally, this has an atomicity problem. */
+               for (i = 0; i < dev->addr_len; i++)
+                       virtio_cwrite8(vdev,
+                                      offsetof(struct virtio_net_config, mac) +
+                                      i, addr->sa_data[i]);
        }
 
        eth_commit_mac_addr_change(dev, p);
@@ -1281,9 +1288,8 @@ static void virtnet_config_changed_work(struct work_struct *work)
        if (!vi->config_enable)
                goto done;
 
-       if (virtio_config_val(vi->vdev, VIRTIO_NET_F_STATUS,
-                             offsetof(struct virtio_net_config, status),
-                             &v) < 0)
+       if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
+                                struct virtio_net_config, status, &v) < 0)
                goto done;
 
        if (v & VIRTIO_NET_S_ANNOUNCE) {
@@ -1507,9 +1513,9 @@ static int virtnet_probe(struct virtio_device *vdev)
        u16 max_queue_pairs;
 
        /* Find if host supports multiqueue virtio_net device */
-       err = virtio_config_val(vdev, VIRTIO_NET_F_MQ,
-                               offsetof(struct virtio_net_config,
-                               max_virtqueue_pairs), &max_queue_pairs);
+       err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
+                                  struct virtio_net_config,
+                                  max_virtqueue_pairs, &max_queue_pairs);
 
        /* We need at least 2 queue's */
        if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
@@ -1561,9 +1567,11 @@ static int virtnet_probe(struct virtio_device *vdev)
        dev->vlan_features = dev->features;
 
        /* Configuration may specify what MAC to use.  Otherwise random. */
-       if (virtio_config_val_len(vdev, VIRTIO_NET_F_MAC,
-                                 offsetof(struct virtio_net_config, mac),
-                                 dev->dev_addr, dev->addr_len) < 0)
+       if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
+               virtio_cread_bytes(vdev,
+                                  offsetof(struct virtio_net_config, mac),
+                                  dev->dev_addr, dev->addr_len);
+       else
                eth_hw_addr_random(dev);
 
        /* Set up our device-specific information */
@@ -1704,7 +1712,7 @@ static void virtnet_remove(struct virtio_device *vdev)
        free_netdev(vi->dev);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int virtnet_freeze(struct virtio_device *vdev)
 {
        struct virtnet_info *vi = vdev->priv;
@@ -1795,7 +1803,7 @@ static struct virtio_driver virtio_net_driver = {
        .probe =        virtnet_probe,
        .remove =       virtnet_remove,
        .config_changed = virtnet_config_changed,
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
        .freeze =       virtnet_freeze,
        .restore =      virtnet_restore,
 #endif
index 3118d75..edae50b 100644 (file)
@@ -534,7 +534,7 @@ int ath10k_htc_wait_target(struct ath10k_htc *htc)
        u16 credit_count;
        u16 credit_size;
 
-       INIT_COMPLETION(htc->ctl_resp);
+       reinit_completion(&htc->ctl_resp);
 
        status = ath10k_hif_start(htc->ar);
        if (status) {
@@ -669,7 +669,7 @@ int ath10k_htc_connect_service(struct ath10k_htc *htc,
        req_msg->flags = __cpu_to_le16(flags);
        req_msg->service_id = __cpu_to_le16(conn_req->service_id);
 
-       INIT_COMPLETION(htc->ctl_resp);
+       reinit_completion(&htc->ctl_resp);
 
        status = ath10k_htc_send(htc, ATH10K_HTC_EP_0, skb);
        if (status) {
index 0b1cc51..97ac8c8 100644 (file)
@@ -92,7 +92,7 @@ static int ath10k_install_key(struct ath10k_vif *arvif,
 
        lockdep_assert_held(&ar->conf_mutex);
 
-       INIT_COMPLETION(ar->install_key_done);
+       reinit_completion(&ar->install_key_done);
 
        ret = ath10k_send_key(arvif, key, cmd, macaddr);
        if (ret)
@@ -438,7 +438,7 @@ static int ath10k_vdev_start(struct ath10k_vif *arvif)
 
        lockdep_assert_held(&ar->conf_mutex);
 
-       INIT_COMPLETION(ar->vdev_setup_done);
+       reinit_completion(&ar->vdev_setup_done);
 
        arg.vdev_id = arvif->vdev_id;
        arg.dtim_period = arvif->dtim_period;
@@ -491,7 +491,7 @@ static int ath10k_vdev_stop(struct ath10k_vif *arvif)
 
        lockdep_assert_held(&ar->conf_mutex);
 
-       INIT_COMPLETION(ar->vdev_setup_done);
+       reinit_completion(&ar->vdev_setup_done);
 
        ret = ath10k_wmi_vdev_stop(ar, arvif->vdev_id);
        if (ret) {
@@ -1666,7 +1666,7 @@ void ath10k_offchan_tx_work(struct work_struct *work)
                }
 
                spin_lock_bh(&ar->data_lock);
-               INIT_COMPLETION(ar->offchan_tx_completed);
+               reinit_completion(&ar->offchan_tx_completed);
                ar->offchan_tx_skb = skb;
                spin_unlock_bh(&ar->data_lock);
 
@@ -2476,8 +2476,8 @@ static int ath10k_hw_scan(struct ieee80211_hw *hw,
                goto exit;
        }
 
-       INIT_COMPLETION(ar->scan.started);
-       INIT_COMPLETION(ar->scan.completed);
+       reinit_completion(&ar->scan.started);
+       reinit_completion(&ar->scan.completed);
        ar->scan.in_progress = true;
        ar->scan.aborting = false;
        ar->scan.is_roc = false;
@@ -2832,9 +2832,9 @@ static int ath10k_remain_on_channel(struct ieee80211_hw *hw,
                goto exit;
        }
 
-       INIT_COMPLETION(ar->scan.started);
-       INIT_COMPLETION(ar->scan.completed);
-       INIT_COMPLETION(ar->scan.on_channel);
+       reinit_completion(&ar->scan.started);
+       reinit_completion(&ar->scan.completed);
+       reinit_completion(&ar->scan.on_channel);
        ar->scan.in_progress = true;
        ar->scan.aborting = false;
        ar->scan.is_roc = true;
index 307bc0d..ca115f3 100644 (file)
@@ -773,7 +773,7 @@ void carl9170_usb_stop(struct ar9170 *ar)
        complete_all(&ar->cmd_wait);
 
        /* This is required to prevent an early completion on _start */
-       INIT_COMPLETION(ar->cmd_wait);
+       reinit_completion(&ar->cmd_wait);
 
        /*
         * Note:
index 0a2844c..fd30cdd 100644 (file)
@@ -250,7 +250,7 @@ int wil_reset(struct wil6210_priv *wil)
 
        /* init after reset */
        wil->pending_connect_cid = -1;
-       INIT_COMPLETION(wil->wmi_ready);
+       reinit_completion(&wil->wmi_ready);
 
        /* TODO: release MAC reset */
        wil6210_enable_irq(wil);
index d7a9745..5b5b952 100644 (file)
@@ -1148,7 +1148,7 @@ static s32 brcmf_p2p_af_searching_channel(struct brcmf_p2p_info *p2p)
 
        pri_vif = p2p->bss_idx[P2PAPI_BSSCFG_PRIMARY].vif;
 
-       INIT_COMPLETION(afx_hdl->act_frm_scan);
+       reinit_completion(&afx_hdl->act_frm_scan);
        set_bit(BRCMF_P2P_STATUS_FINDING_COMMON_CHANNEL, &p2p->status);
        afx_hdl->is_active = true;
        afx_hdl->peer_chan = P2P_INVALID_CHANNEL;
@@ -1501,7 +1501,7 @@ static s32 brcmf_p2p_tx_action_frame(struct brcmf_p2p_info *p2p,
 
        brcmf_dbg(TRACE, "Enter\n");
 
-       INIT_COMPLETION(p2p->send_af_done);
+       reinit_completion(&p2p->send_af_done);
        clear_bit(BRCMF_P2P_STATUS_ACTION_TX_COMPLETED, &p2p->status);
        clear_bit(BRCMF_P2P_STATUS_ACTION_TX_NOACK, &p2p->status);
 
index ae15228..a8cc736 100644 (file)
@@ -446,7 +446,7 @@ static void rt2800mmio_txstatus_interrupt(struct rt2x00_dev *rt2x00dev)
                if (!rt2x00_get_field32(status, TX_STA_FIFO_VALID))
                        break;
 
-               if (!kfifo_put(&rt2x00dev->txstatus_fifo, &status)) {
+               if (!kfifo_put(&rt2x00dev->txstatus_fifo, status)) {
                        rt2x00_warn(rt2x00dev, "TX status FIFO overrun, drop tx status report\n");
                        break;
                }
index 997df03..a81ceb6 100644 (file)
@@ -164,7 +164,7 @@ static bool rt2800usb_tx_sta_fifo_read_completed(struct rt2x00_dev *rt2x00dev,
 
        valid = rt2x00_get_field32(tx_status, TX_STA_FIFO_VALID);
        if (valid) {
-               if (!kfifo_put(&rt2x00dev->txstatus_fifo, &tx_status))
+               if (!kfifo_put(&rt2x00dev->txstatus_fifo, tx_status))
                        rt2x00_warn(rt2x00dev, "TX status FIFO overrun\n");
 
                queue_work(rt2x00dev->workqueue, &rt2x00dev->txdone_work);
index 7ef0b4a..84d94f5 100644 (file)
@@ -1619,7 +1619,7 @@ static void prepare_read_regs_int(struct zd_usb *usb,
        atomic_set(&intr->read_regs_enabled, 1);
        intr->read_regs.req = req;
        intr->read_regs.req_count = count;
-       INIT_COMPLETION(intr->read_regs.completion);
+       reinit_completion(&intr->read_regs.completion);
        spin_unlock_irq(&intr->lock);
 }
 
index d471627..c864f82 100644 (file)
@@ -1331,7 +1331,7 @@ static unsigned int parport_ip32_fwp_wait_interrupt(struct parport *p)
                        break;
 
                /* Initialize mutex used to take interrupts into account */
-               INIT_COMPLETION(priv->irq_complete);
+               reinit_completion(&priv->irq_complete);
 
                /* Enable serviceIntr */
                parport_ip32_frob_econtrol(p, ECR_SERVINTR, 0);
@@ -1446,7 +1446,7 @@ static size_t parport_ip32_fifo_write_block_dma(struct parport *p,
        priv->irq_mode = PARPORT_IP32_IRQ_HERE;
 
        parport_ip32_dma_start(DMA_TO_DEVICE, (void *)buf, len);
-       INIT_COMPLETION(priv->irq_complete);
+       reinit_completion(&priv->irq_complete);
        parport_ip32_frob_econtrol(p, ECR_DMAEN | ECR_SERVINTR, ECR_DMAEN);
 
        nfault_timeout = min((unsigned long)physport->cad->timeout,
index 85ca36f..6b3a958 100644 (file)
@@ -574,7 +574,7 @@ void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
        };
 
        spin_lock_irqsave(&aer_recover_ring_lock, flags);
-       if (kfifo_put(&aer_recover_ring, &entry))
+       if (kfifo_put(&aer_recover_ring, entry))
                schedule_work(&aer_recover_work);
        else
                pr_err("AER recover: Buffer overflow when recovering AER for %04x:%02x:%02x:%x\n",
index 8eea2ef..605a9be 100644 (file)
@@ -289,7 +289,7 @@ static int gmux_switchto(enum vga_switcheroo_client_id id)
 static int gmux_set_discrete_state(struct apple_gmux_data *gmux_data,
                                   enum vga_switcheroo_state state)
 {
-       INIT_COMPLETION(gmux_data->powerchange_done);
+       reinit_completion(&gmux_data->powerchange_done);
 
        if (state == VGA_SWITCHEROO_ON) {
                gmux_write8(gmux_data, GMUX_PORT_DISCRETE_POWER, 1);
index 7549707..3cb4178 100644 (file)
@@ -574,8 +574,8 @@ int ab8500_fg_inst_curr_start(struct ab8500_fg *di)
        }
 
        /* Return and WFI */
-       INIT_COMPLETION(di->ab8500_fg_started);
-       INIT_COMPLETION(di->ab8500_fg_complete);
+       reinit_completion(&di->ab8500_fg_started);
+       reinit_completion(&di->ab8500_fg_complete);
        enable_irq(di->irq);
 
        /* Note: cc_lock is still locked */
index d9686aa..6c8931d 100644 (file)
@@ -73,7 +73,7 @@ static long jz_battery_read_voltage(struct jz_battery *battery)
 
        mutex_lock(&battery->lock);
 
-       INIT_COMPLETION(battery->read_completion);
+       reinit_completion(&battery->read_completion);
 
        enable_irq(battery->irq);
        battery->cell->enable(battery->pdev);
index b09c75c..a34b506 100644 (file)
@@ -30,7 +30,7 @@
 #include "remoteproc_internal.h"
 
 /* kick the remote processor, and let it know which virtqueue to poke at */
-static void rproc_virtio_notify(struct virtqueue *vq)
+static bool rproc_virtio_notify(struct virtqueue *vq)
 {
        struct rproc_vring *rvring = vq->priv;
        struct rproc *rproc = rvring->rvdev->rproc;
@@ -39,6 +39,7 @@ static void rproc_virtio_notify(struct virtqueue *vq)
        dev_dbg(&rproc->dev, "kicking vq index: %d\n", notifyid);
 
        rproc->ops->kick(rproc, notifyid);
+       return true;
 }
 
 /**
index 45560ff..965a9da 100644 (file)
@@ -209,7 +209,7 @@ static int hid_rtc_read_time(struct device *dev, struct rtc_time *tm)
                platform_get_drvdata(to_platform_device(dev));
        int ret;
 
-       INIT_COMPLETION(time_state->comp_last_time);
+       reinit_completion(&time_state->comp_last_time);
        /* get a report with all values through requesting one value */
        sensor_hub_input_attr_get_raw_value(time_state->common_attributes.hsdev,
                        HID_USAGE_SENSOR_TIME, hid_time_addresses[0],
@@ -236,7 +236,7 @@ static const struct rtc_class_ops hid_time_rtc_ops = {
 static int hid_time_probe(struct platform_device *pdev)
 {
        int ret = 0;
-       struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data;
+       struct hid_sensor_hub_device *hsdev = dev_get_platdata(&pdev->dev);
        struct hid_time_state *time_state = devm_kzalloc(&pdev->dev,
                sizeof(struct hid_time_state), GFP_KERNEL);
 
@@ -281,11 +281,18 @@ static int hid_time_probe(struct platform_device *pdev)
                goto err_open;
        }
 
+       /*
+        * Enable HID input processing early in order to be able to read the
+        * clock already in devm_rtc_device_register().
+        */
+       hid_device_io_start(hsdev->hdev);
+
        time_state->rtc = devm_rtc_device_register(&pdev->dev,
                                        "hid-sensor-time", &hid_time_rtc_ops,
                                        THIS_MODULE);
 
        if (IS_ERR_OR_NULL(time_state->rtc)) {
+               hid_device_io_stop(hsdev->hdev);
                ret = time_state->rtc ? PTR_ERR(time_state->rtc) : -ENODEV;
                time_state->rtc = NULL;
                dev_err(&pdev->dev, "rtc device register failed!\n");
@@ -303,7 +310,7 @@ err_open:
 
 static int hid_time_remove(struct platform_device *pdev)
 {
-       struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data;
+       struct hid_sensor_hub_device *hsdev = dev_get_platdata(&pdev->dev);
 
        sensor_hub_device_close(hsdev);
        sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TIME);
index af2166f..1abd0db 100644 (file)
@@ -166,11 +166,15 @@ static void kvm_reset(struct virtio_device *vdev)
  * make a hypercall.  We hand the address  of the virtqueue so the Host
  * knows which virtqueue we're talking about.
  */
-static void kvm_notify(struct virtqueue *vq)
+static bool kvm_notify(struct virtqueue *vq)
 {
+       long rc;
        struct kvm_vqconfig *config = vq->priv;
 
-       kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, config->address);
+       rc = kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, config->address);
+       if (rc < 0)
+               return false;
+       return true;
 }
 
 /*
index 779dc51..d629717 100644 (file)
@@ -162,7 +162,7 @@ static inline long do_kvm_notify(struct subchannel_id schid,
        return __rc;
 }
 
-static void virtio_ccw_kvm_notify(struct virtqueue *vq)
+static bool virtio_ccw_kvm_notify(struct virtqueue *vq)
 {
        struct virtio_ccw_vq_info *info = vq->priv;
        struct virtio_ccw_device *vcdev;
@@ -171,6 +171,9 @@ static void virtio_ccw_kvm_notify(struct virtqueue *vq)
        vcdev = to_vc_device(info->vq->vdev);
        ccw_device_get_schid(vcdev->cdev, &schid);
        info->cookie = do_kvm_notify(schid, vq->index, info->cookie);
+       if (info->cookie < 0)
+               return false;
+       return true;
 }
 
 static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev,
index 74b88ef..c3173dc 100644 (file)
@@ -224,6 +224,9 @@ static void virtscsi_vq_done(struct virtio_scsi *vscsi,
                virtqueue_disable_cb(vq);
                while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
                        fn(vscsi, buf);
+
+               if (unlikely(virtqueue_is_broken(vq)))
+                       break;
        } while (!virtqueue_enable_cb(vq));
        spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
 }
@@ -710,19 +713,15 @@ static struct scsi_host_template virtscsi_host_template_multi = {
 #define virtscsi_config_get(vdev, fld) \
        ({ \
                typeof(((struct virtio_scsi_config *)0)->fld) __val; \
-               vdev->config->get(vdev, \
-                                 offsetof(struct virtio_scsi_config, fld), \
-                                 &__val, sizeof(__val)); \
+               virtio_cread(vdev, struct virtio_scsi_config, fld, &__val); \
                __val; \
        })
 
 #define virtscsi_config_set(vdev, fld, val) \
-       (void)({ \
+       do { \
                typeof(((struct virtio_scsi_config *)0)->fld) __val = (val); \
-               vdev->config->set(vdev, \
-                                 offsetof(struct virtio_scsi_config, fld), \
-                                 &__val, sizeof(__val)); \
-       })
+               virtio_cwrite(vdev, struct virtio_scsi_config, fld, &__val); \
+       } while (0)
 
 static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
 {
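
Replacing the (void)({ ... }) GNU statement expression with do { ... } while (0) is the conventional way to make a multi-statement macro act as a single statement, so it composes safely with if/else. A minimal example of the idiom:

        #define demo_set(p, v)                          \
                do {                                    \
                        typeof(*(p)) __v = (v);         \
                        *(p) = __v;                     \
                } while (0)

        /* "if (cond) demo_set(&x, 1); else ..." now parses as intended */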
@@ -954,7 +953,7 @@ static void virtscsi_remove(struct virtio_device *vdev)
        scsi_host_put(shost);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int virtscsi_freeze(struct virtio_device *vdev)
 {
        virtscsi_remove_vqs(vdev);
@@ -988,7 +987,7 @@ static struct virtio_driver virtio_scsi_driver = {
        .id_table = id_table,
        .probe = virtscsi_probe,
        .scan = virtscsi_scan,
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
        .freeze = virtscsi_freeze,
        .restore = virtscsi_restore,
 #endif
index 4c33214..3ed666f 100644 (file)
@@ -217,7 +217,7 @@ static int bcm2835_spi_start_transfer(struct spi_device *spi,
                cs |= spi->chip_select;
        }
 
-       INIT_COMPLETION(bs->done);
+       reinit_completion(&bs->done);
        bs->tx_buf = tfr->tx_buf;
        bs->rx_buf = tfr->rx_buf;
        bs->len = tfr->len;
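
Note: this and the following spi/iio/staging/usb/video hunks are one tree-wide rename: the INIT_COMPLETION() macro becomes the reinit_completion() function, which takes a pointer and resets only the done counter (init_completion() remains the one-time initializer that also sets up the wait queue). A hedged sketch of the usual reuse pattern; struct my_dev and my_start_hw() are invented names:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

struct my_dev {
	struct completion done;		/* init_completion() once, at probe */
};

static void my_start_hw(struct my_dev *dev)
{
	/* Hypothetical: program the hardware; the IRQ handler is assumed
	 * to call complete(&dev->done) when the transfer finishes. */
}

static int my_do_transfer(struct my_dev *dev)
{
	/* Reset just the counter before reuse. The conversion turns
	 * INIT_COMPLETION(x) into reinit_completion(&x): function, pointer. */
	reinit_completion(&dev->done);

	my_start_hw(dev);

	if (!wait_for_completion_timeout(&dev->done, msecs_to_jiffies(100)))
		return -ETIMEDOUT;
	return 0;
}
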
index e2a5a42..6f03d7e 100644 (file)
@@ -105,7 +105,7 @@ static int spi_clps711x_transfer_one_message(struct spi_master *master,
 
                gpio_set_value(cs, !!(msg->spi->mode & SPI_CS_HIGH));
 
-               INIT_COMPLETION(hw->done);
+               reinit_completion(&hw->done);
 
                hw->count = 0;
                hw->len = xfer->len;
index dd72445..50b2d88 100644 (file)
@@ -554,7 +554,7 @@ static int davinci_spi_bufs(struct spi_device *spi, struct spi_transfer *t)
        clear_io_bits(dspi->base + SPIGCR1, SPIGCR1_POWERDOWN_MASK);
        set_io_bits(dspi->base + SPIGCR1, SPIGCR1_SPIENA_MASK);
 
-       INIT_COMPLETION(dspi->done);
+       reinit_completion(&dspi->done);
 
        if (spicfg->io_type == SPI_IO_TYPE_INTR)
                set_io_bits(dspi->base + SPIINT, SPIINT_MASKINT);
index 32200d4..80d8f40 100644 (file)
@@ -232,7 +232,7 @@ static int fsl_espi_bufs(struct spi_device *spi, struct spi_transfer *t)
        mpc8xxx_spi->tx = t->tx_buf;
        mpc8xxx_spi->rx = t->rx_buf;
 
-       INIT_COMPLETION(mpc8xxx_spi->done);
+       reinit_completion(&mpc8xxx_spi->done);
 
        /* Set SPCOM[CS] and SPCOM[TRANLEN] field */
        if ((t->len - 1) > SPCOM_TRANLEN_MAX) {
index 2129fcd..119f7af 100644 (file)
@@ -339,7 +339,7 @@ static int fsl_spi_bufs(struct spi_device *spi, struct spi_transfer *t,
        mpc8xxx_spi->tx = t->tx_buf;
        mpc8xxx_spi->rx = t->rx_buf;
 
-       INIT_COMPLETION(mpc8xxx_spi->done);
+       reinit_completion(&mpc8xxx_spi->done);
 
        if (mpc8xxx_spi->flags & SPI_CPM_MODE)
                ret = fsl_spi_cpm_bufs(mpc8xxx_spi, t, is_dma_mapped);
index 58d5ee0..9602bbd 100644 (file)
@@ -167,7 +167,7 @@ static int mpc512x_psc_spi_transfer_rxtx(struct spi_device *spi,
                        }
 
                        /* have the ISR trigger when the TX FIFO is empty */
-                       INIT_COMPLETION(mps->txisrdone);
+                       reinit_completion(&mps->txisrdone);
                        out_be32(&fifo->txisr, MPC512x_PSC_FIFO_EMPTY);
                        out_be32(&fifo->tximr, MPC512x_PSC_FIFO_EMPTY);
                        wait_for_completion(&mps->txisrdone);
index de33305..73afb56 100644 (file)
@@ -202,7 +202,7 @@ static int mxs_spi_txrx_dma(struct mxs_spi *spi,
        if (!dma_xfer)
                return -ENOMEM;
 
-       INIT_COMPLETION(spi->c);
+       reinit_completion(&spi->c);
 
        /* Chip select was already programmed into CTRL0 */
        ctrl0 = readl(ssp->base + HW_SSP_CTRL0);
index 9e2020d..4c4b0a1 100644 (file)
@@ -890,7 +890,7 @@ static int s3c64xx_spi_transfer_one(struct spi_master *master,
        unsigned long flags;
        int use_dma;
 
-       INIT_COMPLETION(sdd->xfer_completion);
+       reinit_completion(&sdd->xfer_completion);
 
        /* Only BPW and Speed may change across transfers */
        bpw = xfer->bits_per_word;
index 2a95435..c74298c 100644 (file)
@@ -465,7 +465,7 @@ static int sh_msiof_spi_txrx_once(struct sh_msiof_spi_priv *p,
        ret = ret ? ret : sh_msiof_modify_ctr_wait(p, 0, CTR_TXE);
 
        /* start by setting frame bit */
-       INIT_COMPLETION(p->done);
+       reinit_completion(&p->done);
        ret = ret ? ret : sh_msiof_modify_ctr_wait(p, 0, CTR_TFSE);
        if (ret) {
                dev_err(&p->pdev->dev, "failed to start hardware\n");
index 592b4af..ed5e501 100644 (file)
@@ -305,8 +305,8 @@ static int spi_sirfsoc_transfer(struct spi_device *spi, struct spi_transfer *t)
        sspi->tx = t->tx_buf ? t->tx_buf : sspi->dummypage;
        sspi->rx = t->rx_buf ? t->rx_buf : sspi->dummypage;
        sspi->left_tx_word = sspi->left_rx_word = t->len / sspi->word_width;
-       INIT_COMPLETION(sspi->rx_done);
-       INIT_COMPLETION(sspi->tx_done);
+       reinit_completion(&sspi->rx_done);
+       reinit_completion(&sspi->tx_done);
 
        writel(SIRFSOC_SPI_INT_MASK_ALL, sspi->base + SIRFSOC_SPI_INT_STATUS);
 
index 9146bb3..aaecfb3 100644 (file)
@@ -451,7 +451,7 @@ static void tegra_spi_dma_complete(void *args)
 
 static int tegra_spi_start_tx_dma(struct tegra_spi_data *tspi, int len)
 {
-       INIT_COMPLETION(tspi->tx_dma_complete);
+       reinit_completion(&tspi->tx_dma_complete);
        tspi->tx_dma_desc = dmaengine_prep_slave_single(tspi->tx_dma_chan,
                                tspi->tx_dma_phys, len, DMA_MEM_TO_DEV,
                                DMA_PREP_INTERRUPT |  DMA_CTRL_ACK);
@@ -470,7 +470,7 @@ static int tegra_spi_start_tx_dma(struct tegra_spi_data *tspi, int len)
 
 static int tegra_spi_start_rx_dma(struct tegra_spi_data *tspi, int len)
 {
-       INIT_COMPLETION(tspi->rx_dma_complete);
+       reinit_completion(&tspi->rx_dma_complete);
        tspi->rx_dma_desc = dmaengine_prep_slave_single(tspi->rx_dma_chan,
                                tspi->rx_dma_phys, len, DMA_DEV_TO_MEM,
                                DMA_PREP_INTERRUPT |  DMA_CTRL_ACK);
@@ -844,7 +844,7 @@ static int tegra_spi_transfer_one_message(struct spi_master *master,
        list_for_each_entry(xfer, &msg->transfers, transfer_list) {
                unsigned long cmd1;
 
-               INIT_COMPLETION(tspi->xfer_completion);
+               reinit_completion(&tspi->xfer_completion);
 
                cmd1 = tegra_spi_setup_transfer_one(spi, xfer, is_first_msg);
 
index 79be8ce..4dc8e81 100644 (file)
@@ -339,7 +339,7 @@ static int tegra_sflash_transfer_one_message(struct spi_master *master,
        msg->actual_length = 0;
        single_xfer = list_is_singular(&msg->transfers);
        list_for_each_entry(xfer, &msg->transfers, transfer_list) {
-               INIT_COMPLETION(tsd->xfer_completion);
+               reinit_completion(&tsd->xfer_completion);
                ret = tegra_sflash_start_transfer_one(spi, xfer,
                                        is_first_msg, single_xfer);
                if (ret < 0) {
index af0a678..e66715b 100644 (file)
@@ -462,7 +462,7 @@ static void tegra_slink_dma_complete(void *args)
 
 static int tegra_slink_start_tx_dma(struct tegra_slink_data *tspi, int len)
 {
-       INIT_COMPLETION(tspi->tx_dma_complete);
+       reinit_completion(&tspi->tx_dma_complete);
        tspi->tx_dma_desc = dmaengine_prep_slave_single(tspi->tx_dma_chan,
                                tspi->tx_dma_phys, len, DMA_MEM_TO_DEV,
                                DMA_PREP_INTERRUPT |  DMA_CTRL_ACK);
@@ -481,7 +481,7 @@ static int tegra_slink_start_tx_dma(struct tegra_slink_data *tspi, int len)
 
 static int tegra_slink_start_rx_dma(struct tegra_slink_data *tspi, int len)
 {
-       INIT_COMPLETION(tspi->rx_dma_complete);
+       reinit_completion(&tspi->rx_dma_complete);
        tspi->rx_dma_desc = dmaengine_prep_slave_single(tspi->rx_dma_chan,
                                tspi->rx_dma_phys, len, DMA_DEV_TO_MEM,
                                DMA_PREP_INTERRUPT |  DMA_CTRL_ACK);
@@ -836,7 +836,7 @@ static int tegra_slink_transfer_one(struct spi_master *master,
        struct tegra_slink_data *tspi = spi_master_get_devdata(master);
        int ret;
 
-       INIT_COMPLETION(tspi->xfer_completion);
+       reinit_completion(&tspi->xfer_completion);
        ret = tegra_slink_start_transfer_one(spi, xfer);
        if (ret < 0) {
                dev_err(tspi->dev,
index ec3a83f..6d4ce46 100644 (file)
@@ -258,7 +258,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
        xspi->tx_ptr = t->tx_buf;
        xspi->rx_ptr = t->rx_buf;
        xspi->remaining_bytes = t->len;
-       INIT_COMPLETION(xspi->done);
+       reinit_completion(&xspi->done);
 
 
        /* Enable the transmit empty interrupt, which we use to determine
index 927998a..8d85ddc 100644 (file)
@@ -571,7 +571,7 @@ static int spi_transfer_one_message(struct spi_master *master,
        list_for_each_entry(xfer, &msg->transfers, transfer_list) {
                trace_spi_transfer_start(msg, xfer);
 
-               INIT_COMPLETION(master->xfer_completion);
+               reinit_completion(&master->xfer_completion);
 
                ret = master->transfer_one(master, msg->spi, xfer);
                if (ret < 0) {
index aeae76b..e2dd783 100644 (file)
@@ -783,7 +783,7 @@ static int mxs_lradc_read_raw(struct iio_dev *iio_dev,
        if (!ret)
                return -EBUSY;
 
-       INIT_COMPLETION(lradc->completion);
+       reinit_completion(&lradc->completion);
 
        /*
         * No buffered operation in progress, map the channel and trigger it.
index 3335941..7f2f247 100644 (file)
@@ -87,7 +87,7 @@ int solo_p2m_dma_desc(struct solo_dev *solo_dev,
        if (mutex_lock_interruptible(&p2m_dev->mutex))
                return -EINTR;
 
-       INIT_COMPLETION(p2m_dev->completion);
+       reinit_completion(&p2m_dev->completion);
        p2m_dev->error = 0;
 
        if (desc_cnt > 1 && solo_dev->type != SOLO_DEV_6110 && desc_mode) {
index 7bb550a..743ff09 100644 (file)
@@ -72,7 +72,7 @@ int sync_wait_on_multiple_events(struct sync_object **events,
        spin_lock_bh(&sync_lock);
        for (i = 0; i < count; i++) {
                if (completion_done(&events[i]->comp)) {
-                       INIT_COMPLETION(events[i]->comp);
+                       reinit_completion(&events[i]->comp);
                        *index = i;
                        spin_unlock_bh(&sync_lock);
                        status = 0;
@@ -92,7 +92,7 @@ int sync_wait_on_multiple_events(struct sync_object **events,
        spin_lock_bh(&sync_lock);
        for (i = 0; i < count; i++) {
                if (completion_done(&events[i]->comp)) {
-                       INIT_COMPLETION(events[i]->comp);
+                       reinit_completion(&events[i]->comp);
                        *index = i;
                        status = 0;
                }
index 58a0d5c..fc19b97 100644 (file)
@@ -59,7 +59,7 @@ static inline void sync_init_event(struct sync_object *event)
 
 static inline void sync_reset_event(struct sync_object *event)
 {
-       INIT_COMPLETION(event->comp);
+       reinit_completion(&event->comp);
        event->multi_comp = NULL;
 }
 
index 6d04eb4..1aa4a3f 100644 (file)
@@ -332,7 +332,7 @@ static void bridge_recover(struct work_struct *work)
        struct dev_object *dev;
        struct cfg_devnode *dev_node;
        if (atomic_read(&bridge_cref)) {
-               INIT_COMPLETION(bridge_comp);
+               reinit_completion(&bridge_comp);
                while (!wait_for_completion_timeout(&bridge_comp,
                                                msecs_to_jiffies(REC_TIMEOUT)))
                        pr_info("%s:%d handle(s) still opened\n",
@@ -348,7 +348,7 @@ static void bridge_recover(struct work_struct *work)
 
 void bridge_recover_schedule(void)
 {
-       INIT_COMPLETION(bridge_open_comp);
+       reinit_completion(&bridge_open_comp);
        recover = true;
        queue_work(bridge_rec_queue, &bridge_recovery_work);
 }
@@ -389,7 +389,7 @@ static int omap3_bridge_startup(struct platform_device *pdev)
 #ifdef CONFIG_TIDSPBRIDGE_RECOVERY
        bridge_rec_queue = create_workqueue("bridge_rec_queue");
        INIT_WORK(&bridge_recovery_work, bridge_recover);
-       INIT_COMPLETION(bridge_comp);
+       reinit_completion(&bridge_comp);
 #endif
 
 #ifdef CONFIG_PM
index c193af6..636c9ba 100644 (file)
@@ -183,7 +183,7 @@ static int dom0_write_console(uint32_t vtermno, const char *str, int len)
 {
        int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
        if (rc < 0)
-               return 0;
+               return rc;
 
        return len;
 }
@@ -642,7 +642,22 @@ struct console xenboot_console = {
 
 void xen_raw_console_write(const char *str)
 {
-       dom0_write_console(0, str, strlen(str));
+       ssize_t len = strlen(str);
+       int rc = 0;
+
+       if (xen_domain()) {
+               rc = dom0_write_console(0, str, len);
+#ifdef CONFIG_X86
+               if (rc == -ENOSYS && xen_hvm_domain())
+                       goto outb_print;
+
+       } else if (xen_cpuid_base()) {
+               int i;
+outb_print:
+               for (i = 0; i < len; i++)
+                       outb(str[i], 0xe9);
+#endif
+       }
 }
 
 void xen_raw_printk(const char *fmt, ...)
index 0e88862..7332e2c 100644 (file)
@@ -495,7 +495,7 @@ static int dashtty_write(struct tty_struct *tty, const unsigned char *buf,
        count = dport->xmit_cnt;
        /* xmit buffer no longer empty? */
        if (count)
-               INIT_COMPLETION(dport->xmit_empty);
+               reinit_completion(&dport->xmit_empty);
        mutex_unlock(&dport->xmit_lock);
 
        if (total) {
index aa49162..892cc96 100644 (file)
@@ -344,7 +344,7 @@ void c67x00_endpoint_disable(struct usb_hcd *hcd, struct usb_host_endpoint *ep)
                /* it could happen that we reinitialize this completion, while
                 * somebody was waiting for that completion.  The timeout and
                 * while loop handle such cases, but this might be improved */
-               INIT_COMPLETION(c67x00->endpoint_disable);
+               reinit_completion(&c67x00->endpoint_disable);
                c67x00_sched_kick(c67x00);
                wait_for_completion_timeout(&c67x00->endpoint_disable, 1 * HZ);
 
index 44cf775..774e8b8 100644 (file)
@@ -373,7 +373,7 @@ static int __ffs_ep0_queue_wait(struct ffs_data *ffs, char *data, size_t len)
        if (req->buf == NULL)
                req->buf = (void *)0xDEADBABE;
 
-       INIT_COMPLETION(ffs->ep0req_completion);
+       reinit_completion(&ffs->ep0req_completion);
 
        ret = usb_ep_queue(ffs->gadget->ep0, req, GFP_ATOMIC);
        if (unlikely(ret < 0))
index 84657e0..439c951 100644 (file)
@@ -455,7 +455,7 @@ static int parport_prologue(struct parport *pp)
                return -1;
        }
        mos_parport->msg_pending = true;   /* synch usb call pending */
-       INIT_COMPLETION(mos_parport->syncmsg_compl);
+       reinit_completion(&mos_parport->syncmsg_compl);
        spin_unlock(&release_lock);
 
        mutex_lock(&mos_parport->serial->disc_mutex);
index 7eed957..85edabf 100644 (file)
@@ -220,7 +220,7 @@ int exynos_mipi_dsi_wr_data(struct mipi_dsim_device *dsim, unsigned int data_id,
        case MIPI_DSI_DCS_LONG_WRITE:
        {
                unsigned int size, payload = 0;
-               INIT_COMPLETION(dsim_wr_comp);
+               reinit_completion(&dsim_wr_comp);
 
                size = data_size * 4;
 
@@ -356,7 +356,7 @@ int exynos_mipi_dsi_rd_data(struct mipi_dsim_device *dsim, unsigned int data_id,
        msleep(20);
 
        mutex_lock(&dsim->lock);
-       INIT_COMPLETION(dsim_rd_comp);
+       reinit_completion(&dsim_rd_comp);
        exynos_mipi_dsi_rd_tx_header(dsim,
                MIPI_DSI_SET_MAXIMUM_RETURN_PACKET_SIZE, req_size);
 
index 798ef20..d5c936c 100644 (file)
@@ -69,7 +69,7 @@ static int tpd_connect(struct omap_dss_device *dssdev,
        dst->src = dssdev;
        dssdev->dst = dst;
 
-       INIT_COMPLETION(ddata->hpd_completion);
+       reinit_completion(&ddata->hpd_completion);
 
        gpio_set_value_cansleep(ddata->ct_cp_hpd_gpio, 1);
        /* DC-DC converter needs at max 300us to get to 90% of 5V */
index 1f572c0..c444654 100644 (file)
@@ -275,9 +275,8 @@ static inline s64 towards_target(struct virtio_balloon *vb)
        __le32 v;
        s64 target;
 
-       vb->vdev->config->get(vb->vdev,
-                             offsetof(struct virtio_balloon_config, num_pages),
-                             &v, sizeof(v));
+       virtio_cread(vb->vdev, struct virtio_balloon_config, num_pages, &v);
+
        target = le32_to_cpu(v);
        return target - vb->num_pages;
 }
@@ -286,9 +285,8 @@ static void update_balloon_size(struct virtio_balloon *vb)
 {
        __le32 actual = cpu_to_le32(vb->num_pages);
 
-       vb->vdev->config->set(vb->vdev,
-                             offsetof(struct virtio_balloon_config, actual),
-                             &actual, sizeof(actual));
+       virtio_cwrite(vb->vdev, struct virtio_balloon_config, actual,
+                     &actual);
 }
 
 static int balloon(void *_vballoon)
@@ -513,7 +511,7 @@ static void virtballoon_remove(struct virtio_device *vdev)
        kfree(vb);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int virtballoon_freeze(struct virtio_device *vdev)
 {
        struct virtio_balloon *vb = vdev->priv;
@@ -556,7 +554,7 @@ static struct virtio_driver virtio_balloon_driver = {
        .probe =        virtballoon_probe,
        .remove =       virtballoon_remove,
        .config_changed = virtballoon_changed,
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
        .freeze =       virtballoon_freeze,
        .restore =      virtballoon_restore,
 #endif
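
Note: the freeze/restore hooks here (and in virtio_scsi earlier) move from CONFIG_PM to CONFIG_PM_SLEEP because they serve system suspend/hibernate only; CONFIG_PM can be enabled for runtime PM alone, which would build the hooks with no user. The resulting driver shape, sketched with an invented my_virtio driver:

#include <linux/virtio.h>

#ifdef CONFIG_PM_SLEEP
static int my_virtio_freeze(struct virtio_device *vdev)
{
	/* quiesce the device and tear down its virtqueues */
	return 0;
}

static int my_virtio_restore(struct virtio_device *vdev)
{
	/* re-create the virtqueues and resume operation */
	return 0;
}
#endif

static struct virtio_driver my_virtio_driver = {
	/* .driver.name, .id_table, .probe, .remove elided */
#ifdef CONFIG_PM_SLEEP
	.freeze	 = my_virtio_freeze,
	.restore = my_virtio_restore,
#endif
};
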
index 1ba0d68..c600ccf 100644 (file)
@@ -219,13 +219,14 @@ static void vm_reset(struct virtio_device *vdev)
 /* Transport interface */
 
 /* the notify function used when creating a virt queue */
-static void vm_notify(struct virtqueue *vq)
+static bool vm_notify(struct virtqueue *vq)
 {
        struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
 
        /* We write the queue's selector into the notification register to
         * signal the other end */
        writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
+       return true;
 }
 
 /* Notify all virtqueues on an interrupt. */
@@ -470,7 +471,7 @@ static int virtio_mmio_probe(struct platform_device *pdev)
 
        /* Check magic value */
        magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE);
-       if (memcmp(&magic, "virt", 4) != 0) {
+       if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) {
                dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic);
                return -ENODEV;
        }
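
Note: the memcmp() against the string "virt" becomes an integer comparison of the readl() result with the same four bytes packed LSB-first, which is what a little-endian register load of "virt" yields (0x74726976). A standalone userspace check of that composition; a little-endian host is assumed for the memcpy round-trip:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint32_t magic = 'v' | 'i' << 8 | 'r' << 16 | 't' << 24;
	char bytes[5] = "";

	assert(magic == 0x74726976);
	memcpy(bytes, &magic, 4);	/* little-endian host assumed */
	printf("0x%08x = \"%s\"\n", magic, bytes);
	return 0;
}
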
index 98917fc..a37c699 100644 (file)
@@ -197,13 +197,14 @@ static void vp_reset(struct virtio_device *vdev)
 }
 
 /* the notify function used when creating a virt queue */
-static void vp_notify(struct virtqueue *vq)
+static bool vp_notify(struct virtqueue *vq)
 {
        struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
 
        /* we write the queue's selector into the notification register to
         * signal the other end */
        iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+       return true;
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
index 6b4a4db..28b5338 100644 (file)
@@ -81,7 +81,7 @@ struct vring_virtqueue
        u16 last_used_idx;
 
        /* How to notify other side. FIXME: commonalize hcalls! */
-       void (*notify)(struct virtqueue *vq);
+       bool (*notify)(struct virtqueue *vq);
 
 #ifdef DEBUG
        /* They're supposed to lock for us. */
@@ -173,6 +173,8 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
        head = vq->free_head;
        vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
        vq->vring.desc[head].addr = virt_to_phys(desc);
+       /* kmemleak gives a false positive, as it's hidden by virt_to_phys */
+       kmemleak_ignore(desc);
        vq->vring.desc[head].len = i * sizeof(struct vring_desc);
 
        /* Update free pointer */
@@ -428,13 +430,22 @@ EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
  * @vq: the struct virtqueue
  *
  * This does not need to be serialized.
+ *
+ * Returns false if host notify failed or queue is broken, otherwise true.
  */
-void virtqueue_notify(struct virtqueue *_vq)
+bool virtqueue_notify(struct virtqueue *_vq)
 {
        struct vring_virtqueue *vq = to_vvq(_vq);
 
+       if (unlikely(vq->broken))
+               return false;
+
        /* Prod other side to tell it about changes. */
-       vq->notify(_vq);
+       if (!vq->notify(_vq)) {
+               vq->broken = true;
+               return false;
+       }
+       return true;
 }
 EXPORT_SYMBOL_GPL(virtqueue_notify);
 
@@ -447,11 +458,14 @@ EXPORT_SYMBOL_GPL(virtqueue_notify);
  *
  * Caller must ensure we don't call this with other virtqueue
  * operations at the same time (except where noted).
+ *
+ * Returns false if kick failed, otherwise true.
  */
-void virtqueue_kick(struct virtqueue *vq)
+bool virtqueue_kick(struct virtqueue *vq)
 {
        if (virtqueue_kick_prepare(vq))
-               virtqueue_notify(vq);
+               return virtqueue_notify(vq);
+       return true;
 }
 EXPORT_SYMBOL_GPL(virtqueue_kick);
 
@@ -742,7 +756,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
                                      struct virtio_device *vdev,
                                      bool weak_barriers,
                                      void *pages,
-                                     void (*notify)(struct virtqueue *),
+                                     bool (*notify)(struct virtqueue *),
                                      void (*callback)(struct virtqueue *),
                                      const char *name)
 {
@@ -837,4 +851,12 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
+bool virtqueue_is_broken(struct virtqueue *_vq)
+{
+       struct vring_virtqueue *vq = to_vvq(_vq);
+
+       return vq->broken;
+}
+EXPORT_SYMBOL_GPL(virtqueue_is_broken);
+
 MODULE_LICENSE("GPL");
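
Note: the virtio_ring hunks above change the whole notify chain (the transport notify callbacks, virtqueue_notify(), virtqueue_kick()) from void to bool, so a failed host notification marks the queue broken instead of being lost, and export virtqueue_is_broken() so drivers can stop polling a dead device, as the virtio_scsi hunk earlier does. A hedged sketch of the resulting calling convention; my_submit(), my_drain() and my_complete() are invented:

#include <linux/errno.h>
#include <linux/scatterlist.h>
#include <linux/virtio.h>

static void my_complete(void *buf)
{
	/* hypothetical per-buffer completion handler */
}

static int my_submit(struct virtqueue *vq, struct scatterlist *sg, void *data)
{
	int err = virtqueue_add_outbuf(vq, sg, 1, data, GFP_ATOMIC);

	if (err)
		return err;

	/* virtqueue_kick() now reports failure: false means the notify
	 * write/hypercall failed and the queue has been marked broken. */
	if (!virtqueue_kick(vq))
		return -EIO;
	return 0;
}

static void my_drain(struct virtqueue *vq)
{
	unsigned int len;
	void *buf;

	do {
		virtqueue_disable_cb(vq);
		while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
			my_complete(buf);
		/* Bail out rather than spin forever on a dead device. */
		if (unlikely(virtqueue_is_broken(vq)))
			break;
	} while (!virtqueue_enable_cb(vq));
}
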
index 264ad1c..e36b18b 100644 (file)
@@ -56,7 +56,7 @@ MODULE_DEVICE_TABLE(of, w1_gpio_dt_ids);
 
 static int w1_gpio_probe_dt(struct platform_device *pdev)
 {
-       struct w1_gpio_platform_data *pdata = pdev->dev.platform_data;
+       struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev);
        struct device_node *np = pdev->dev.of_node;
        int gpio;
 
@@ -92,7 +92,7 @@ static int w1_gpio_probe(struct platform_device *pdev)
                }
        }
 
-       pdata = pdev->dev.platform_data;
+       pdata = dev_get_platdata(&pdev->dev);
 
        if (!pdata) {
                dev_err(&pdev->dev, "No configuration data\n");
@@ -154,7 +154,7 @@ static int w1_gpio_probe(struct platform_device *pdev)
 static int w1_gpio_remove(struct platform_device *pdev)
 {
        struct w1_bus_master *master = platform_get_drvdata(pdev);
-       struct w1_gpio_platform_data *pdata = pdev->dev.platform_data;
+       struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev);
 
        if (pdata->enable_external_pullup)
                pdata->enable_external_pullup(0);
@@ -171,7 +171,7 @@ static int w1_gpio_remove(struct platform_device *pdev)
 
 static int w1_gpio_suspend(struct platform_device *pdev, pm_message_t state)
 {
-       struct w1_gpio_platform_data *pdata = pdev->dev.platform_data;
+       struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev);
 
        if (pdata->enable_external_pullup)
                pdata->enable_external_pullup(0);
@@ -181,7 +181,7 @@ static int w1_gpio_suspend(struct platform_device *pdev, pm_message_t state)
 
 static int w1_gpio_resume(struct platform_device *pdev)
 {
-       struct w1_gpio_platform_data *pdata = pdev->dev.platform_data;
+       struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev);
 
        if (pdata->enable_external_pullup)
                pdata->enable_external_pullup(1);
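
Note: the w1_gpio hunks replace direct pdev->dev.platform_data dereferences with dev_get_platdata(), the accessor from linux/device.h. The pattern, sketched; struct my_platform_data and my_probe() are invented names:

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/platform_device.h>

struct my_platform_data {
	int enable_gpio;	/* made-up field, stands in for driver data */
};

static int my_probe(struct platform_device *pdev)
{
	/* dev_get_platdata() is a trivial inline returning
	 * dev->platform_data, keeping drivers out of struct device guts. */
	struct my_platform_data *pdata = dev_get_platdata(&pdev->dev);

	if (!pdata)
		return -ENODEV;
	return 0;
}
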
index 23eae5c..c794ea1 100644 (file)
@@ -140,7 +140,6 @@ config XEN_GRANT_DEV_ALLOC
 
 config SWIOTLB_XEN
        def_bool y
-       depends on PCI && X86
        select SWIOTLB
 
 config XEN_TMEM
index b232908..55ea73f 100644 (file)
@@ -596,7 +596,7 @@ static void __init balloon_add_region(unsigned long start_pfn,
        }
 }
 
-static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
+static int balloon_cpu_notify(struct notifier_block *self,
                                    unsigned long action, void *hcpu)
 {
        int cpu = (long)hcpu;
@@ -616,7 +616,7 @@ static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block balloon_cpu_notifier __cpuinitdata = {
+static struct notifier_block balloon_cpu_notifier = {
        .notifier_call  = balloon_cpu_notify,
 };
 
@@ -641,7 +641,7 @@ static int __init balloon_init(void)
 
        balloon_stats.current_pages = xen_pv_domain()
                ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn)
-               : max_pfn;
+               : get_num_physpages();
        balloon_stats.target_pages  = balloon_stats.current_pages;
        balloon_stats.balloon_low   = 0;
        balloon_stats.balloon_high  = 0;
index 8b3a69a..5de2063 100644 (file)
@@ -305,7 +305,7 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
        if (rc < 0)
                goto err;
 
-       rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+       rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0,
                                       u->name, evtchn);
        if (rc < 0)
                goto err;
index c4d2298..62ccf54 100644 (file)
@@ -49,6 +49,7 @@
 #include <xen/grant_table.h>
 #include <xen/interface/memory.h>
 #include <xen/hvc-console.h>
+#include <xen/swiotlb-xen.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/interface.h>
 
@@ -898,8 +899,16 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
                        gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i,
                                                &map_ops[i].status, __func__);
 
-       if (xen_feature(XENFEAT_auto_translated_physmap))
+       /* this is basically a nop on x86 */
+       if (xen_feature(XENFEAT_auto_translated_physmap)) {
+               for (i = 0; i < count; i++) {
+                       if (map_ops[i].status)
+                               continue;
+                       set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT,
+                                       map_ops[i].dev_bus_addr >> PAGE_SHIFT);
+               }
                return ret;
+       }
 
        if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
                arch_enter_lazy_mmu_mode();
@@ -942,8 +951,14 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
        if (ret)
                return ret;
 
-       if (xen_feature(XENFEAT_auto_translated_physmap))
+       /* this is basically a nop on x86 */
+       if (xen_feature(XENFEAT_auto_translated_physmap)) {
+               for (i = 0; i < count; i++) {
+                       set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT,
+                                       INVALID_P2M_ENTRY);
+               }
                return ret;
+       }
 
        if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
                arch_enter_lazy_mmu_mode();
index 18fff88..d15f6e8 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 #include "../pci/pci.h"
+#include <asm/pci_x86.h>
 
 static bool __read_mostly pci_seg_supported = true;
 
@@ -192,3 +193,49 @@ static int __init register_xen_pci_notifier(void)
 }
 
 arch_initcall(register_xen_pci_notifier);
+
+#ifdef CONFIG_PCI_MMCONFIG
+static int __init xen_mcfg_late(void)
+{
+       struct pci_mmcfg_region *cfg;
+       int rc;
+
+       if (!xen_initial_domain())
+               return 0;
+
+       if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+               return 0;
+
+       if (list_empty(&pci_mmcfg_list))
+               return 0;
+
+       /* Check whether they are in the right area. */
+       list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+               struct physdev_pci_mmcfg_reserved r;
+
+               r.address = cfg->address;
+               r.segment = cfg->segment;
+               r.start_bus = cfg->start_bus;
+               r.end_bus = cfg->end_bus;
+               r.flags = XEN_PCI_MMCFG_RESERVED;
+
+               rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
+               switch (rc) {
+               case 0:
+               case -ENOSYS:
+                       continue;
+
+               default:
+                       pr_warn("Failed to report MMCONFIG reservation"
+                               " state for %s to hypervisor"
+                               " (%d)\n",
+                               cfg->name, rc);
+               }
+       }
+       return 0;
+}
+/*
+ * Needs to be done after acpi_init, which is a subsys_initcall.
+ */
+subsys_initcall_sync(xen_mcfg_late);
+#endif
index 99db9e1..2f3528e 100644 (file)
@@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
 static int xen_allocate_irq(struct pci_dev *pdev)
 {
        return request_irq(pdev->irq, do_hvm_evtchn_intr,
-                       IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
+                       IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
                        "xen-platform-pci", pdev);
 }
 
index 1b2277c..a224bc7 100644 (file)
 #include <xen/page.h>
 #include <xen/xen-ops.h>
 #include <xen/hvc-console.h>
+
+#include <asm/dma-mapping.h>
+#include <asm/xen/page-coherent.h>
+
+#include <trace/events/swiotlb.h>
 /*
  * Used to do a quick range check in swiotlb_tbl_unmap_single and
  * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
  * API.
  */
 
+#ifndef CONFIG_X86
+static unsigned long dma_alloc_coherent_mask(struct device *dev,
+                                           gfp_t gfp)
+{
+       unsigned long dma_mask = 0;
+
+       dma_mask = dev->coherent_dma_mask;
+       if (!dma_mask)
+               dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32);
+
+       return dma_mask;
+}
+#endif
+
 static char *xen_io_tlb_start, *xen_io_tlb_end;
 static unsigned long xen_io_tlb_nslabs;
 /*
@@ -56,17 +75,17 @@ static unsigned long xen_io_tlb_nslabs;
 
 static u64 start_dma_addr;
 
-static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
+static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
 {
        return phys_to_machine(XPADDR(paddr)).maddr;
 }
 
-static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
+static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
 {
        return machine_to_phys(XMADDR(baddr)).paddr;
 }
 
-static dma_addr_t xen_virt_to_bus(void *address)
+static inline dma_addr_t xen_virt_to_bus(void *address)
 {
        return xen_phys_to_bus(virt_to_phys(address));
 }
@@ -89,7 +108,7 @@ static int check_pages_physically_contiguous(unsigned long pfn,
        return 1;
 }
 
-static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
 {
        unsigned long pfn = PFN_DOWN(p);
        unsigned int offset = p & ~PAGE_MASK;
@@ -126,6 +145,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
 {
        int i, rc;
        int dma_bits;
+       dma_addr_t dma_handle;
+       phys_addr_t p = virt_to_phys(buf);
 
        dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
 
@@ -135,9 +156,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
 
                do {
                        rc = xen_create_contiguous_region(
-                               (unsigned long)buf + (i << IO_TLB_SHIFT),
+                               p + (i << IO_TLB_SHIFT),
                                get_order(slabs << IO_TLB_SHIFT),
-                               dma_bits);
+                               dma_bits, &dma_handle);
                } while (rc && dma_bits++ < max_dma_bits);
                if (rc)
                        return rc;
@@ -263,7 +284,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        void *ret;
        int order = get_order(size);
        u64 dma_mask = DMA_BIT_MASK(32);
-       unsigned long vstart;
        phys_addr_t phys;
        dma_addr_t dev_addr;
 
@@ -278,8 +298,12 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
                return ret;
 
-       vstart = __get_free_pages(flags, order);
-       ret = (void *)vstart;
+       /* On ARM this function returns an ioremap'ped virtual address for
+        * which virt_to_phys doesn't return the corresponding physical
+        * address. In fact on ARM virt_to_phys only works for kernel direct
+        * mapped RAM memory. Also see comment below.
+        */
+       ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs);
 
        if (!ret)
                return ret;
@@ -287,18 +311,21 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        if (hwdev && hwdev->coherent_dma_mask)
                dma_mask = dma_alloc_coherent_mask(hwdev, flags);
 
-       phys = virt_to_phys(ret);
+       /* At this point dma_handle is the physical address, next we are
+        * going to set it to the machine address.
+        * Do not use virt_to_phys(ret) because on ARM it doesn't correspond
+        * to *dma_handle. */
+       phys = *dma_handle;
        dev_addr = xen_phys_to_bus(phys);
        if (((dev_addr + size - 1 <= dma_mask)) &&
            !range_straddles_page_boundary(phys, size))
                *dma_handle = dev_addr;
        else {
-               if (xen_create_contiguous_region(vstart, order,
-                                                fls64(dma_mask)) != 0) {
-                       free_pages(vstart, order);
+               if (xen_create_contiguous_region(phys, order,
+                                                fls64(dma_mask), dma_handle) != 0) {
+                       xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);
                        return NULL;
                }
-               *dma_handle = virt_to_machine(ret).maddr;
        }
        memset(ret, 0, size);
        return ret;
@@ -319,13 +346,15 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
        if (hwdev && hwdev->coherent_dma_mask)
                dma_mask = hwdev->coherent_dma_mask;
 
-       phys = virt_to_phys(vaddr);
+       /* do not use virt_to_phys because on ARM it doesn't return you the
+        * physical address */
+       phys = xen_bus_to_phys(dev_addr);
 
        if (((dev_addr + size - 1 > dma_mask)) ||
            range_straddles_page_boundary(phys, size))
-               xen_destroy_contiguous_region((unsigned long)vaddr, order);
+               xen_destroy_contiguous_region(phys, order);
 
-       free_pages((unsigned long)vaddr, order);
+       xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent);
 
@@ -352,16 +381,25 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
         * buffering it.
         */
        if (dma_capable(dev, dev_addr, size) &&
-           !range_straddles_page_boundary(phys, size) && !swiotlb_force)
+           !range_straddles_page_boundary(phys, size) && !swiotlb_force) {
+               /* we are not interested in the dma_addr returned by
+                * xen_dma_map_page, only in the potential cache flushes executed
+                * by the function. */
+               xen_dma_map_page(dev, page, offset, size, dir, attrs);
                return dev_addr;
+       }
 
        /*
         * Oh well, have to allocate and map a bounce buffer.
         */
+       trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
+
        map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
        if (map == SWIOTLB_MAP_ERROR)
                return DMA_ERROR_CODE;
 
+       xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT),
+                                       map & ~PAGE_MASK, size, dir, attrs);
        dev_addr = xen_phys_to_bus(map);
 
        /*
@@ -384,12 +422,15 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
  * whatever the device wrote there.
  */
 static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
-                            size_t size, enum dma_data_direction dir)
+                            size_t size, enum dma_data_direction dir,
+                                struct dma_attrs *attrs)
 {
        phys_addr_t paddr = xen_bus_to_phys(dev_addr);
 
        BUG_ON(dir == DMA_NONE);
 
+       xen_dma_unmap_page(hwdev, paddr, size, dir, attrs);
+
        /* NOTE: We use dev_addr here, not paddr! */
        if (is_xen_swiotlb_buffer(dev_addr)) {
                swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);
@@ -412,7 +453,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
                            size_t size, enum dma_data_direction dir,
                            struct dma_attrs *attrs)
 {
-       xen_unmap_single(hwdev, dev_addr, size, dir);
+       xen_unmap_single(hwdev, dev_addr, size, dir, attrs);
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
 
@@ -435,11 +476,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 
        BUG_ON(dir == DMA_NONE);
 
+       if (target == SYNC_FOR_CPU)
+               xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir);
+
        /* NOTE: We use dev_addr here, not paddr! */
-       if (is_xen_swiotlb_buffer(dev_addr)) {
+       if (is_xen_swiotlb_buffer(dev_addr))
                swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
-               return;
-       }
+
+       if (target == SYNC_FOR_DEVICE)
+               xen_dma_sync_single_for_device(hwdev, paddr, size, dir);
 
        if (dir != DMA_FROM_DEVICE)
                return;
@@ -502,16 +547,26 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
                                                                 sg->length,
                                                                 dir);
                        if (map == SWIOTLB_MAP_ERROR) {
+                               dev_warn(hwdev, "swiotlb buffer is full\n");
                                /* Don't panic here, we expect map_sg users
                                   to do proper error handling. */
                                xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
                                                           attrs);
                                sg_dma_len(sgl) = 0;
-                               return DMA_ERROR_CODE;
+                               return 0;
                        }
                        sg->dma_address = xen_phys_to_bus(map);
-               } else
+               } else {
+                       /* we are not interested in the dma_addr returned by
+                        * xen_dma_map_page, only in the potential cache flushes executed
+                        * by the function. */
+                       xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT),
+                                               paddr & ~PAGE_MASK,
+                                               sg->length,
+                                               dir,
+                                               attrs);
                        sg->dma_address = dev_addr;
+               }
                sg_dma_len(sg) = sg->length;
        }
        return nelems;
@@ -533,7 +588,7 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
        BUG_ON(dir == DMA_NONE);
 
        for_each_sg(sgl, sg, nelems, i)
-               xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir);
+               xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs);
 
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs);
@@ -593,3 +648,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
        return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported);
+
+int
+xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask)
+{
+       if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask))
+               return -EIO;
+
+       *dev->dma_mask = dma_mask;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask);
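
Note: the new xen_swiotlb_set_dma_mask() export rejects any mask the Xen bounce buffer cannot reach before storing it in the device; in-tree it is reached through the architecture's dma_map_ops rather than called directly. A hedged sketch of the validate-then-fall-back idea, with an invented my_setup_dma():

#include <linux/dma-mapping.h>
#include <xen/swiotlb-xen.h>

static int my_setup_dma(struct device *dev)
{
	/* Try the wide mask first; fall back if the swiotlb window
	 * (checked via xen_swiotlb_dma_supported) cannot cover it. */
	int rc = xen_swiotlb_set_dma_mask(dev, DMA_BIT_MASK(64));

	if (rc)
		rc = xen_swiotlb_set_dma_mask(dev, DMA_BIT_MASK(32));
	return rc;
}
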
index a91a6a3..1a44e42 100644 (file)
@@ -14,4 +14,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
-btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
+       tests/extent-buffer-tests.o tests/btrfs-tests.o \
+       tests/extent-io-tests.o tests/inode-tests.o
index e15d2b0..0890c83 100644 (file)
@@ -229,7 +229,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
                if (ret > 0) {
                        /* we need an acl */
                        ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
-               } else {
+               } else if (ret < 0) {
                        cache_no_acl(inode);
                }
        } else {
index 08cc08f..8aec751 100644 (file)
@@ -262,7 +262,7 @@ static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
        struct btrfs_work *work = NULL;
        struct list_head *cur = NULL;
 
-       if(!list_empty(prio_head))
+       if (!list_empty(prio_head))
                cur = prio_head->next;
 
        smp_mb();
index 0552a59..3775947 100644 (file)
@@ -185,6 +185,9 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 {
        struct __prelim_ref *ref;
 
+       if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
+               return 0;
+
        ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
        if (!ref)
                return -ENOMEM;
@@ -323,8 +326,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 
        eb = path->nodes[level];
        while (!eb) {
-               if (!level) {
-                       WARN_ON(1);
+               if (WARN_ON(!level)) {
                        ret = 1;
                        goto out;
                }
@@ -1619,7 +1621,7 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
                btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                btrfs_release_path(path);
 
-               item = btrfs_item_nr(eb, slot);
+               item = btrfs_item_nr(slot);
                iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
 
                for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
index 71f074e..ac0b39d 100644 (file)
@@ -19,6 +19,7 @@
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
+#include <linux/hash.h>
 #include "extent_map.h"
 #include "extent_io.h"
 #include "ordered-data.h"
@@ -179,6 +180,25 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
        return container_of(inode, struct btrfs_inode, vfs_inode);
 }
 
+static inline unsigned long btrfs_inode_hash(u64 objectid,
+                                            const struct btrfs_root *root)
+{
+       u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+
+#if BITS_PER_LONG == 32
+       h = (h >> 32) ^ (h & 0xffffffff);
+#endif
+
+       return (unsigned long)h;
+}
+
+static inline void btrfs_insert_inode_hash(struct inode *inode)
+{
+       unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
+
+       __insert_inode_hash(inode, h);
+}
+
 static inline u64 btrfs_ino(struct inode *inode)
 {
        u64 ino = BTRFS_I(inode)->location.objectid;
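
Note: btrfs_inode_hash() above mixes the inode objectid with the owning root's objectid via the golden-ratio multiplicative constant, so equal inode numbers from different subvolumes land in different hash-table buckets, and folds the result on 32-bit builds. A standalone userspace rendition; the 64-bit GOLDEN_RATIO_PRIME value is taken as an assumption from the linux/hash.h of this era:

#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001ULL	/* assumed 64-bit value */

static unsigned long inode_hash(uint64_t objectid, uint64_t root_objectid)
{
	uint64_t h = objectid ^ (root_objectid * GOLDEN_RATIO_PRIME);

#if __SIZEOF_LONG__ == 4
	h = (h >> 32) ^ (h & 0xffffffff);	/* fold to fit unsigned long */
#endif
	return (unsigned long)h;
}

int main(void)
{
	/* Same inode number in two subvolume roots hashes differently. */
	printf("%lx %lx\n", inode_hash(257, 5), inode_hash(257, 256));
	return 0;
}
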
index 1c47be1..e0aab44 100644 (file)
@@ -1038,7 +1038,7 @@ leaf_item_out_of_bounce_error:
                                                     disk_item_offset,
                                                     sizeof(struct btrfs_item));
                        item_offset = btrfs_stack_item_offset(&disk_item);
-                       item_size = btrfs_stack_item_offset(&disk_item);
+                       item_size = btrfs_stack_item_size(&disk_item);
                        disk_key = &disk_item.key;
                        type = btrfs_disk_key_type(disk_key);
 
@@ -1900,7 +1900,9 @@ again:
                                                               dev_state,
                                                               dev_bytenr);
                        }
-                       if (block->logical_bytenr != bytenr) {
+                       if (block->logical_bytenr != bytenr &&
+                           !(!block->is_metadata &&
+                             block->logical_bytenr == 0))
                                printk(KERN_INFO
                                       "Written block @%llu (%s/%llu/%d)"
                                       " found in hash table, %c,"
@@ -1910,15 +1912,14 @@ again:
                                       block->mirror_num,
                                       btrfsic_get_block_type(state, block),
                                       block->logical_bytenr);
-                               block->logical_bytenr = bytenr;
-                       } else if (state->print_mask &
-                                  BTRFSIC_PRINT_MASK_VERBOSE)
+                       else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                printk(KERN_INFO
                                       "Written block @%llu (%s/%llu/%d)"
                                       " found in hash table, %c.\n",
                                       bytenr, dev_state->name, dev_bytenr,
                                       block->mirror_num,
                                       btrfsic_get_block_type(state, block));
+                       block->logical_bytenr = bytenr;
                } else {
                        if (num_pages * PAGE_CACHE_SIZE <
                            state->datablock_size) {
@@ -2463,10 +2464,8 @@ static int btrfsic_process_written_superblock(
                }
        }
 
-       if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
-               WARN_ON(1);
+       if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
                btrfsic_dump_tree(state);
-       }
 
        return 0;
 }
@@ -2906,7 +2905,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                btrfsic_release_block_ctx(&block_ctx);
        }
 
-       if (!match) {
+       if (WARN_ON(!match)) {
                printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
                       " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
                       " phys_bytenr=%llu)!\n",
@@ -2923,7 +2922,6 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                               bytenr, block_ctx.dev->name,
                               block_ctx.dev_bytenr, mirror_num);
                }
-               WARN_ON(1);
        }
 }
 
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
deleted file mode 100644 (file)
index 7c4503e..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _COMPAT_H_
-#define _COMPAT_H_
-
-#define btrfs_drop_nlink(inode) drop_nlink(inode)
-#define btrfs_inc_nlink(inode) inc_nlink(inode)
-
-#endif /* _COMPAT_H_ */
index 6aad98c..1499b27 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
 #include <linux/slab.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -360,7 +359,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
        bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
-       if(!bio) {
+       if (!bio) {
                kfree(cb);
                return -ENOMEM;
        }
index 61b5bcd..316136b 100644 (file)
@@ -274,7 +274,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        else
                btrfs_set_header_owner(cow, new_root_objectid);
 
-       write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
+       write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);
 
        WARN_ON(btrfs_header_generation(buf) > trans->transid);
@@ -996,7 +996,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);
 
-       write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
+       write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);
 
        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
@@ -1285,11 +1285,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
                free_extent_buffer(eb_root);
                blocksize = btrfs_level_size(root, old_root->level);
                old = read_tree_block(root, logical, blocksize, 0);
-               if (!old || !extent_buffer_uptodate(old)) {
+               if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
                        free_extent_buffer(old);
                        pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
                                logical);
-                       WARN_ON(1);
                } else {
                        eb = btrfs_clone_extent_buffer(old);
                        free_extent_buffer(old);
@@ -2758,7 +2757,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
        int level;
        int lowest_unlock = 1;
        u8 lowest_level = 0;
-       int prev_cmp;
+       int prev_cmp = -1;
 
        lowest_level = p->lowest_level;
        WARN_ON(p->nodes[0] != NULL);
@@ -2769,7 +2768,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
        }
 
 again:
-       prev_cmp = -1;
        b = get_old_root(root, time_seq);
        level = btrfs_header_level(b);
        p->locks[level] = BTRFS_READ_LOCK;
@@ -2787,6 +2785,11 @@ again:
                 */
                btrfs_unlock_up_safe(p, level + 1);
 
+               /*
+                * Since we can unwind eb's we want to do a real search every
+                * time.
+                */
+               prev_cmp = -1;
                ret = key_search(b, key, level, &prev_cmp, &slot);
 
                if (level != 0) {
@@ -3148,7 +3151,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(c, root->root_key.objectid);
 
-       write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c),
+       write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);
 
        write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
@@ -3287,7 +3290,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
        btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(split, root->root_key.objectid);
        write_extent_buffer(split, root->fs_info->fsid,
-                           btrfs_header_fsid(split), BTRFS_FSID_SIZE);
+                           btrfs_header_fsid(), BTRFS_FSID_SIZE);
        write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
                            btrfs_header_chunk_tree_uuid(split),
                            BTRFS_UUID_SIZE);
@@ -3337,8 +3340,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
        if (!nr)
                return 0;
        btrfs_init_map_token(&token);
-       start_item = btrfs_item_nr(l, start);
-       end_item = btrfs_item_nr(l, end);
+       start_item = btrfs_item_nr(start);
+       end_item = btrfs_item_nr(end);
        data_len = btrfs_token_item_offset(l, start_item, &token) +
                btrfs_token_item_size(l, start_item, &token);
        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
@@ -3406,7 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        slot = path->slots[1];
        i = left_nritems - 1;
        while (i >= nr) {
-               item = btrfs_item_nr(left, i);
+               item = btrfs_item_nr(i);
 
                if (!empty && push_items > 0) {
                        if (path->slots[0] > i)
@@ -3470,7 +3473,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(root);
        for (i = 0; i < right_nritems; i++) {
-               item = btrfs_item_nr(right, i);
+               item = btrfs_item_nr(i);
                push_space -= btrfs_token_item_size(right, item, &token);
                btrfs_set_token_item_offset(right, item, push_space, &token);
        }
@@ -3612,7 +3615,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
                nr = min(right_nritems - 1, max_slot);
 
        for (i = 0; i < nr; i++) {
-               item = btrfs_item_nr(right, i);
+               item = btrfs_item_nr(i);
 
                if (!empty && push_items > 0) {
                        if (path->slots[0] < i)
@@ -3639,8 +3642,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
                ret = 1;
                goto out;
        }
-       if (!empty && push_items == btrfs_header_nritems(right))
-               WARN_ON(1);
+       WARN_ON(!empty && push_items == btrfs_header_nritems(right));
 
        /* push data from right to left */
        copy_extent_buffer(left, right,
@@ -3663,7 +3665,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
                u32 ioff;
 
-               item = btrfs_item_nr(left, i);
+               item = btrfs_item_nr(i);
 
                ioff = btrfs_token_item_offset(left, item, &token);
                btrfs_set_token_item_offset(left, item,
@@ -3694,7 +3696,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(root);
        for (i = 0; i < right_nritems; i++) {
-               item = btrfs_item_nr(right, i);
+               item = btrfs_item_nr(i);
 
                push_space = push_space - btrfs_token_item_size(right,
                                                                item, &token);
@@ -3835,7 +3837,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
                      btrfs_item_end_nr(l, mid);
 
        for (i = 0; i < nritems; i++) {
-               struct btrfs_item *item = btrfs_item_nr(right, i);
+               struct btrfs_item *item = btrfs_item_nr(i);
                u32 ioff;
 
                ioff = btrfs_token_item_offset(right, item, &token);
@@ -4016,7 +4018,7 @@ again:
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
                                        if (data_size && !tried_avoid_double)
                                                goto push_for_double;
-                                       split = 2 ;
+                                       split = 2;
                                }
                        }
                }
@@ -4042,7 +4044,7 @@ again:
        btrfs_set_header_owner(right, root->root_key.objectid);
        btrfs_set_header_level(right, 0);
        write_extent_buffer(right, root->fs_info->fsid,
-                           btrfs_header_fsid(right), BTRFS_FSID_SIZE);
+                           btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
        write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
                            btrfs_header_chunk_tree_uuid(right),
@@ -4177,7 +4179,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
 
        btrfs_set_path_blocking(path);
 
-       item = btrfs_item_nr(leaf, path->slots[0]);
+       item = btrfs_item_nr(path->slots[0]);
        orig_offset = btrfs_item_offset(leaf, item);
        item_size = btrfs_item_size(leaf, item);
 
@@ -4200,7 +4202,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(leaf, &disk_key, slot);
 
-       new_item = btrfs_item_nr(leaf, slot);
+       new_item = btrfs_item_nr(slot);
 
        btrfs_set_item_offset(leaf, new_item, orig_offset);
        btrfs_set_item_size(leaf, new_item, item_size - split_offset);
@@ -4339,7 +4341,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
-               item = btrfs_item_nr(leaf, i);
+               item = btrfs_item_nr(i);
 
                ioff = btrfs_token_item_offset(leaf, item, &token);
                btrfs_set_token_item_offset(leaf, item,
@@ -4387,7 +4389,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
                        fixup_low_keys(root, path, &disk_key, 1);
        }
 
-       item = btrfs_item_nr(leaf, slot);
+       item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);
 
@@ -4441,7 +4443,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
-               item = btrfs_item_nr(leaf, i);
+               item = btrfs_item_nr(i);
 
                ioff = btrfs_token_item_offset(leaf, item, &token);
                btrfs_set_token_item_offset(leaf, item,
@@ -4455,7 +4457,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 
        data_end = old_data;
        old_size = btrfs_item_size_nr(leaf, slot);
-       item = btrfs_item_nr(leaf, slot);
+       item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);
 
@@ -4514,7 +4516,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
                for (i = slot; i < nritems; i++) {
                        u32 ioff;
 
-                       item = btrfs_item_nr(leaf, i);
+                       item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(leaf, item, &token);
                        btrfs_set_token_item_offset(leaf, item,
                                                    ioff - total_data, &token);
@@ -4535,7 +4537,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
        for (i = 0; i < nr; i++) {
                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
-               item = btrfs_item_nr(leaf, slot + i);
+               item = btrfs_item_nr(slot + i);
                btrfs_set_token_item_offset(leaf, item,
                                            data_end - data_size[i], &token);
                data_end -= data_size[i];
@@ -4730,7 +4732,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                for (i = slot + nr; i < nritems; i++) {
                        u32 ioff;
 
-                       item = btrfs_item_nr(leaf, i);
+                       item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(leaf, item, &token);
                        btrfs_set_token_item_offset(leaf, item,
                                                    ioff + dsize, &token);
@@ -4823,14 +4825,18 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
 
-       if (key.offset > 0)
+       if (key.offset > 0) {
                key.offset--;
-       else if (key.type > 0)
+       } else if (key.type > 0) {
                key.type--;
-       else if (key.objectid > 0)
+               key.offset = (u64)-1;
+       } else if (key.objectid > 0) {
                key.objectid--;
-       else
+               key.type = (u8)-1;
+               key.offset = (u64)-1;
+       } else {
                return 1;
+       }
 
        btrfs_release_path(path);
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
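
The fix above matters because btrfs keys sort lexicographically on (objectid, type, offset): when a higher-order field is decremented, the lower-order fields must saturate to their maxima, or the search lands before the true predecessor. A minimal userspace sketch of the corrected logic (key layout assumed from the btrfs disk key):

    #include <stdint.h>

    struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

    /* Compute the key immediately before *k in (objectid, type, offset)
     * order; returns 1 if *k is already the smallest possible key. */
    static int key_predecessor(struct key *k)
    {
            if (k->offset > 0) {
                    k->offset--;
            } else if (k->type > 0) {
                    k->type--;
                    k->offset = (uint64_t)-1;  /* saturate the lower field */
            } else if (k->objectid > 0) {
                    k->objectid--;
                    k->type = (uint8_t)-1;     /* saturate both lower fields */
                    k->offset = (uint64_t)-1;
            } else {
                    return 1;                  /* (0,0,0) has no predecessor */
            }
            return 0;
    }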
@@ -4866,7 +4872,6 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * was nothing in the tree that matched the search criteria.
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
-                        struct btrfs_key *max_key,
                         struct btrfs_path *path,
                         u64 min_trans)
 {
@@ -4911,10 +4916,8 @@ again:
                 * If it is too old, skip to the next one.
                 */
                while (slot < nritems) {
-                       u64 blockptr;
                        u64 gen;
 
-                       blockptr = btrfs_node_blockptr(cur, slot);
                        gen = btrfs_node_ptr_generation(cur, slot);
                        if (gen < min_trans) {
                                slot++;
index 0506f40..aea4433 100644
@@ -47,6 +47,12 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define STATIC noinline
+#else
+#define STATIC static noinline
+#endif
+
 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
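
The STATIC annotation above drops the static qualifier when the sanity tests are built, so functions that are normally private to one file gain external linkage and the self-tests, compiled as separate objects, can link against them (both arms keep noinline so the symbols stay materialized). A standalone model of the linkage trick, with illustrative names only:

    #include <stdio.h>

    #define RUN_SANITY_TESTS 1   /* stand-in for CONFIG_BTRFS_FS_RUN_SANITY_TESTS */

    #if RUN_SANITY_TESTS
    #define STATIC               /* external linkage: test objects may call it */
    #else
    #define STATIC static        /* private to this translation unit */
    #endif

    STATIC int leaf_space_used(int nritems)   /* hypothetical helper */
    {
            return nritems * 25;
    }

    int main(void)
    {
            printf("%d\n", leaf_space_used(4));
            return 0;
    }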
@@ -1580,7 +1586,6 @@ struct btrfs_fs_info {
        atomic_t scrubs_paused;
        atomic_t scrub_cancel_req;
        wait_queue_head_t scrub_pause_wait;
-       struct rw_semaphore scrub_super_lock;
        int scrub_workers_refcnt;
        struct btrfs_workers scrub_workers;
        struct btrfs_workers scrub_wr_completion_workers;
@@ -1724,7 +1729,9 @@ struct btrfs_root {
        int ref_cows;
        int track_dirty;
        int in_radix;
-
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+       int dummy_root;
+#endif
        u64 defrag_trans_start;
        struct btrfs_key defrag_progress;
        struct btrfs_key defrag_max;
@@ -2461,8 +2468,7 @@ static inline unsigned long btrfs_item_nr_offset(int nr)
                sizeof(struct btrfs_item) * nr;
 }
 
-static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
-                                              int nr)
+static inline struct btrfs_item *btrfs_item_nr(int nr)
 {
        return (struct btrfs_item *)btrfs_item_nr_offset(nr);
 }
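
Dropping the extent_buffer argument is safe because btrfs_item_nr() never dereferences the leaf: it casts a leaf-relative byte offset to a struct btrfs_item pointer, and accessors such as btrfs_item_offset() later resolve that offset against a concrete buffer. A runnable model of the arithmetic (sizes illustrative):

    #include <stdio.h>

    #define HEADER_SIZE 101u   /* illustrative stand-in for the leaf header size */
    #define ITEM_SIZE    25u   /* illustrative stand-in for sizeof(struct btrfs_item) */

    /* Position of item nr inside a leaf: a pure function of nr. */
    static unsigned long item_nr_offset(int nr)
    {
            return HEADER_SIZE + ITEM_SIZE * (unsigned)nr;
    }

    int main(void)
    {
            /* Independent of which leaf holds the item -- hence no eb argument. */
            printf("item 0 at %lu, item 3 at %lu\n",
                   item_nr_offset(0), item_nr_offset(3));
            return 0;
    }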
@@ -2475,30 +2481,30 @@ static inline u32 btrfs_item_end(struct extent_buffer *eb,
 
 static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
 {
-       return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
+       return btrfs_item_end(eb, btrfs_item_nr(nr));
 }
 
 static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
 {
-       return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
+       return btrfs_item_offset(eb, btrfs_item_nr(nr));
 }
 
 static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
 {
-       return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
+       return btrfs_item_size(eb, btrfs_item_nr(nr));
 }
 
 static inline void btrfs_item_key(struct extent_buffer *eb,
                           struct btrfs_disk_key *disk_key, int nr)
 {
-       struct btrfs_item *item = btrfs_item_nr(eb, nr);
+       struct btrfs_item *item = btrfs_item_nr(nr);
        read_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
 static inline void btrfs_set_item_key(struct extent_buffer *eb,
                               struct btrfs_disk_key *disk_key, int nr)
 {
-       struct btrfs_item *item = btrfs_item_nr(eb, nr);
+       struct btrfs_item *item = btrfs_item_nr(nr);
        write_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
@@ -2666,7 +2672,7 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
        btrfs_set_header_flags(eb, flags);
 }
 
-static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_fsid(void)
 {
        return offsetof(struct btrfs_header, fsid);
 }
@@ -3308,7 +3314,6 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_key *key, int lowest_level,
                        u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
-                        struct btrfs_key *max_key,
                         struct btrfs_path *path,
                         u64 min_trans);
 enum btrfs_compare_tree_result {
@@ -3675,8 +3680,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
-                                   int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3944,9 +3948,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                    u64 end, struct btrfs_scrub_progress *progress,
                    int readonly, int is_dev_replace);
 void btrfs_scrub_pause(struct btrfs_root *root);
-void btrfs_scrub_pause_super(struct btrfs_root *root);
 void btrfs_scrub_continue(struct btrfs_root *root);
-void btrfs_scrub_continue_super(struct btrfs_root *root);
 int btrfs_scrub_cancel(struct btrfs_fs_info *info);
 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
                           struct btrfs_device *dev);
@@ -4028,5 +4030,9 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
        return signal_pending(current);
 }
 
+/* Sanity test specific functions */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode);
+#endif
 
 #endif
index cbd9523..8d292fb 100644
@@ -108,8 +108,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
                        return node;
                }
                btrfs_inode->delayed_node = node;
-               atomic_inc(&node->refs);        /* can be accessed */
-               atomic_inc(&node->refs);        /* cached in the inode */
+               /* can be accessed and cached in the inode */
+               atomic_add(2, &node->refs);
                spin_unlock(&root->inode_lock);
                return node;
        }
@@ -138,8 +138,8 @@ again:
                return ERR_PTR(-ENOMEM);
        btrfs_init_delayed_node(node, root, ino);
 
-       atomic_inc(&node->refs);        /* cached in the btrfs inode */
-       atomic_inc(&node->refs);        /* can be accessed */
+       /* cached in the btrfs inode and can be accessed */
+       atomic_add(2, &node->refs);
 
        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
        if (ret) {
@@ -649,14 +649,13 @@ static int btrfs_delayed_inode_reserve_metadata(
                        goto out;
 
                ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-               if (!ret)
+               if (!WARN_ON(ret))
                        goto out;
 
                /*
                 * Ok this is a problem, let's just steal from the global rsv
                 * since this really shouldn't happen that often.
                 */
-               WARN_ON(1);
                ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
                                              dst_rsv, num_bytes);
                goto out;
@@ -771,13 +770,13 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
         */
        btrfs_set_path_blocking(path);
 
-       keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS);
+       keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
        if (!keys) {
                ret = -ENOMEM;
                goto out;
        }
 
-       data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS);
+       data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
        if (!data_size) {
                ret = -ENOMEM;
                goto error;
@@ -1174,8 +1173,10 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
        mutex_unlock(&delayed_node->mutex);
 
        path = btrfs_alloc_path();
-       if (!path)
+       if (!path) {
+               btrfs_release_delayed_node(delayed_node);
                return -ENOMEM;
+       }
        path->leave_spinning = 1;
 
        block_rsv = trans->block_rsv;
index 9efb94e..342f9fd 100644
@@ -26,7 +26,6 @@
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -38,7 +37,6 @@
 #include "rcu-string.h"
 #include "dev-replace.h"
 
-static u64 btrfs_get_seconds_since_1970(void);
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                                       int scrub_ret);
 static void btrfs_dev_replace_update_device_in_mapping_tree(
@@ -296,13 +294,6 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
                dev_replace->cursor_left_last_write_of_item;
 }
 
-static u64 btrfs_get_seconds_since_1970(void)
-{
-       struct timespec t = CURRENT_TIME_SEC;
-
-       return t.tv_sec;
-}
-
 int btrfs_dev_replace_start(struct btrfs_root *root,
                            struct btrfs_ioctl_dev_replace_args *args)
 {
@@ -390,7 +381,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
         * go to the tgtdev as well (refer to btrfs_map_block()).
         */
        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
-       dev_replace->time_started = btrfs_get_seconds_since_1970();
+       dev_replace->time_started = get_seconds();
        dev_replace->cursor_left = 0;
        dev_replace->committed_cursor_left = 0;
        dev_replace->cursor_left_last_write_of_item = 0;
@@ -400,7 +391,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
        btrfs_dev_replace_unlock(dev_replace);
 
-       btrfs_wait_all_ordered_extents(root->fs_info);
+       btrfs_wait_ordered_roots(root->fs_info, -1);
 
        /* force writing the updated state information to disk */
        trans = btrfs_start_transaction(root, 0);
@@ -470,12 +461,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
         * flush all outstanding I/O and inode extent mappings before the
         * copy operation is declared as being finished
         */
-       ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+       ret = btrfs_start_delalloc_roots(root->fs_info, 0);
        if (ret) {
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                return ret;
        }
-       btrfs_wait_all_ordered_extents(root->fs_info);
+       btrfs_wait_ordered_roots(root->fs_info, -1);
 
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
@@ -493,7 +484,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                          : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
        dev_replace->tgtdev = NULL;
        dev_replace->srcdev = NULL;
-       dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+       dev_replace->time_stopped = get_seconds();
        dev_replace->item_needs_writeback = 1;
 
        if (scrub_ret) {
@@ -650,6 +641,9 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
        u64 result;
        int ret;
 
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
        btrfs_dev_replace_lock(dev_replace);
        switch (dev_replace->replace_state) {
@@ -668,7 +662,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
                break;
        }
        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
-       dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+       dev_replace->time_stopped = get_seconds();
        dev_replace->item_needs_writeback = 1;
        btrfs_dev_replace_unlock(dev_replace);
        btrfs_scrub_cancel(fs_info);
@@ -703,7 +697,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
                dev_replace->replace_state =
                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
-               dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+               dev_replace->time_stopped = get_seconds();
                dev_replace->item_needs_writeback = 1;
                pr_info("btrfs: suspending dev_replace for unmount\n");
                break;
index 79e594e..c031ea3 100644
@@ -58,7 +58,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
                return ERR_PTR(ret);
        WARN_ON(ret > 0);
        leaf = path->nodes[0];
-       item = btrfs_item_nr(leaf, path->slots[0]);
+       item = btrfs_item_nr(path->slots[0]);
        ptr = btrfs_item_ptr(leaf, path->slots[0], char);
        BUG_ON(data_size > btrfs_item_size(leaf, item));
        ptr += btrfs_item_size(leaf, item) - data_size;
@@ -474,8 +474,10 @@ int verify_dir_item(struct btrfs_root *root,
        }
 
        /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
-       if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
-               printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
+       if ((btrfs_dir_data_len(leaf, dir_item) +
+            btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
+               printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n",
+                      (unsigned)btrfs_dir_name_len(leaf, dir_item),
                       (unsigned)btrfs_dir_data_len(leaf, dir_item));
                return 1;
        }
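
The tightened check matters because the name and the payload share one item: each length can individually sit below BTRFS_MAX_XATTR_SIZE while their sum still runs past the end of the item. A small model of the corrected bound (the maximum is passed in rather than derived from a root):

    #include <stdbool.h>
    #include <stdint.h>

    /* The sum must fit, not each length alone; the u64 cast avoids a
     * u32 overflow on adversarial on-disk values. */
    static bool dir_item_len_ok(uint32_t name_len, uint32_t data_len,
                                uint32_t max_xattr_size)
    {
            return (uint64_t)name_len + data_len <= max_xattr_size;
    }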
index 62176ad..4c4ed0b 100644
@@ -33,7 +33,6 @@
 #include <linux/uuid.h>
 #include <linux/semaphore.h>
 #include <asm/unaligned.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -64,7 +63,6 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_root *root);
-static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t);
 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
                                        struct extent_io_tree *dirty_pages,
@@ -477,14 +475,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        if (page != eb->pages[0])
                return 0;
        found_start = btrfs_header_bytenr(eb);
-       if (found_start != start) {
-               WARN_ON(1);
+       if (WARN_ON(found_start != start || !PageUptodate(page)))
                return 0;
-       }
-       if (!PageUptodate(page)) {
-               WARN_ON(1);
-               return 0;
-       }
        csum_tree_block(root, eb, 0);
        return 0;
 }
@@ -496,7 +488,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
        u8 fsid[BTRFS_UUID_SIZE];
        int ret = 1;
 
-       read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE);
+       read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
        while (fs_devices) {
                if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
                        ret = 0;
@@ -1105,8 +1097,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 {
        struct inode *btree_inode = root->fs_info->btree_inode;
        struct extent_buffer *eb;
-       eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-                               bytenr, blocksize);
+       eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr);
        return eb;
 }
 
@@ -1229,14 +1220,18 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        atomic_set(&root->refs, 1);
        root->log_transid = 0;
        root->last_log_commit = 0;
-       extent_io_tree_init(&root->dirty_log_pages,
-                            fs_info->btree_inode->i_mapping);
+       if (fs_info)
+               extent_io_tree_init(&root->dirty_log_pages,
+                                    fs_info->btree_inode->i_mapping);
 
        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
        memset(&root->root_kobj, 0, sizeof(root->root_kobj));
-       root->defrag_trans_start = fs_info->generation;
+       if (fs_info)
+               root->defrag_trans_start = fs_info->generation;
+       else
+               root->defrag_trans_start = 0;
        init_completion(&root->kobj_unregister);
        root->defrag_running = 0;
        root->root_key.objectid = objectid;
@@ -1253,6 +1248,22 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
        return root;
 }
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/* Should only be used by the testing infrastructure */
+struct btrfs_root *btrfs_alloc_dummy_root(void)
+{
+       struct btrfs_root *root;
+
+       root = btrfs_alloc_root(NULL);
+       if (!root)
+               return ERR_PTR(-ENOMEM);
+       __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+       root->dummy_root = 1;
+
+       return root;
+}
+#endif
+
 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
                                     struct btrfs_fs_info *fs_info,
                                     u64 objectid)
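
btrfs_alloc_dummy_root() pairs with the fs_info NULL checks added to __setup_root() above: the dummy root has no fs_info behind it, so every path a test drives through it must tolerate that. A hedged sketch of how a self-test might use it (kernel context, illustrative only):

    /* hypothetical self-test fragment, not part of the patch */
    struct btrfs_root *root;

    root = btrfs_alloc_dummy_root();
    if (IS_ERR(root))
            return PTR_ERR(root);

    /* root->fs_info is NULL and root->dummy_root == 1, so code under
     * test can detect the dummy and skip anything touching a real fs. */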
@@ -1292,7 +1303,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
        btrfs_set_header_owner(leaf, objectid);
        root->node = leaf;
 
-       write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf),
+       write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);
        write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
                            btrfs_header_chunk_tree_uuid(leaf),
@@ -1379,7 +1390,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
        root->node = leaf;
 
        write_extent_buffer(root->node, root->fs_info->fsid,
-                           btrfs_header_fsid(root->node), BTRFS_FSID_SIZE);
+                           btrfs_header_fsid(), BTRFS_FSID_SIZE);
        btrfs_mark_buffer_dirty(root->node);
        btrfs_tree_unlock(root->node);
        return root;
@@ -1780,6 +1791,9 @@ sleep:
                wake_up_process(root->fs_info->cleaner_kthread);
                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
+               if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
+                                     &root->fs_info->fs_state)))
+                       btrfs_cleanup_transaction(root);
                if (!try_to_freeze()) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (!kthread_should_stop() &&
@@ -2013,50 +2027,28 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
        btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
 }
 
+static void free_root_extent_buffers(struct btrfs_root *root)
+{
+       if (root) {
+               free_extent_buffer(root->node);
+               free_extent_buffer(root->commit_root);
+               root->node = NULL;
+               root->commit_root = NULL;
+       }
+}
+
 /* helper to cleanup tree roots */
 static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 {
-       free_extent_buffer(info->tree_root->node);
-       free_extent_buffer(info->tree_root->commit_root);
-       info->tree_root->node = NULL;
-       info->tree_root->commit_root = NULL;
-
-       if (info->dev_root) {
-               free_extent_buffer(info->dev_root->node);
-               free_extent_buffer(info->dev_root->commit_root);
-               info->dev_root->node = NULL;
-               info->dev_root->commit_root = NULL;
-       }
-       if (info->extent_root) {
-               free_extent_buffer(info->extent_root->node);
-               free_extent_buffer(info->extent_root->commit_root);
-               info->extent_root->node = NULL;
-               info->extent_root->commit_root = NULL;
-       }
-       if (info->csum_root) {
-               free_extent_buffer(info->csum_root->node);
-               free_extent_buffer(info->csum_root->commit_root);
-               info->csum_root->node = NULL;
-               info->csum_root->commit_root = NULL;
-       }
-       if (info->quota_root) {
-               free_extent_buffer(info->quota_root->node);
-               free_extent_buffer(info->quota_root->commit_root);
-               info->quota_root->node = NULL;
-               info->quota_root->commit_root = NULL;
-       }
-       if (info->uuid_root) {
-               free_extent_buffer(info->uuid_root->node);
-               free_extent_buffer(info->uuid_root->commit_root);
-               info->uuid_root->node = NULL;
-               info->uuid_root->commit_root = NULL;
-       }
-       if (chunk_root) {
-               free_extent_buffer(info->chunk_root->node);
-               free_extent_buffer(info->chunk_root->commit_root);
-               info->chunk_root->node = NULL;
-               info->chunk_root->commit_root = NULL;
-       }
+       free_root_extent_buffers(info->tree_root);
+
+       free_root_extent_buffers(info->dev_root);
+       free_root_extent_buffers(info->extent_root);
+       free_root_extent_buffers(info->csum_root);
+       free_root_extent_buffers(info->quota_root);
+       free_root_extent_buffers(info->uuid_root);
+       if (chunk_root)
+               free_root_extent_buffers(info->chunk_root);
 }
 
 static void del_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2230,7 +2222,6 @@ int open_ctree(struct super_block *sb,
        atomic_set(&fs_info->scrubs_paused, 0);
        atomic_set(&fs_info->scrub_cancel_req, 0);
        init_waitqueue_head(&fs_info->scrub_pause_wait);
-       init_rwsem(&fs_info->scrub_super_lock);
        fs_info->scrub_workers_refcnt = 0;
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        fs_info->check_integrity_print_mask = 0;
@@ -2272,7 +2263,7 @@ int open_ctree(struct super_block *sb,
               sizeof(struct btrfs_key));
        set_bit(BTRFS_INODE_DUMMY,
                &BTRFS_I(fs_info->btree_inode)->runtime_flags);
-       insert_inode_hash(fs_info->btree_inode);
+       btrfs_insert_inode_hash(fs_info->btree_inode);
 
        spin_lock_init(&fs_info->block_group_cache_lock);
        fs_info->block_group_cache_tree = RB_ROOT;
@@ -2670,6 +2661,7 @@ retry_root_backup:
 
        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
        tree_root->commit_root = btrfs_root_node(tree_root);
+       btrfs_set_root_refs(&tree_root->root_item, 1);
 
        location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
        location.type = BTRFS_ROOT_ITEM_KEY;
@@ -3448,10 +3440,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, int max_mirrors)
 {
-       int ret;
-
-       ret = write_all_supers(root, max_mirrors);
-       return ret;
+       return write_all_supers(root, max_mirrors);
 }
 
 /* Drop a fs root from the radix tree and free it. */
@@ -3614,12 +3603,12 @@ int close_ctree(struct btrfs_root *root)
                       percpu_counter_sum(&fs_info->delalloc_bytes));
        }
 
+       del_fs_roots(fs_info);
+
        btrfs_free_block_groups(fs_info);
 
        btrfs_stop_all_workers(fs_info);
 
-       del_fs_roots(fs_info);
-
        free_root_pointers(fs_info, 1);
 
        iput(fs_info->btree_inode);
@@ -3669,10 +3658,20 @@ int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-       struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+       struct btrfs_root *root;
        u64 transid = btrfs_header_generation(buf);
        int was_dirty;
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+       /*
+        * This is a fast path so only do this check if we have sanity tests
+        * enabled.  Normal people shouldn't be marking dummy buffers as dirty
+        * outside of the sanity tests.
+        */
+       if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
+               return;
+#endif
+       root = BTRFS_I(buf->pages[0]->mapping->host)->root;
        btrfs_assert_tree_locked(buf);
        if (transid != root->fs_info->generation)
                WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -3802,7 +3801,8 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
        while (!list_empty(&splice)) {
                root = list_first_entry(&splice, struct btrfs_root,
                                        ordered_root);
-               list_del_init(&root->ordered_root);
+               list_move_tail(&root->ordered_root,
+                              &fs_info->ordered_roots);
 
                btrfs_destroy_ordered_extents(root);
 
@@ -3880,24 +3880,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
        return ret;
 }
 
-static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t)
-{
-       struct btrfs_pending_snapshot *snapshot;
-       struct list_head splice;
-
-       INIT_LIST_HEAD(&splice);
-
-       list_splice_init(&t->pending_snapshots, &splice);
-
-       while (!list_empty(&splice)) {
-               snapshot = list_entry(splice.next,
-                                     struct btrfs_pending_snapshot,
-                                     list);
-               snapshot->error = -ECANCELED;
-               list_del_init(&snapshot->list);
-       }
-}
-
 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 {
        struct btrfs_inode *btrfs_inode;
@@ -4027,15 +4009,13 @@ again:
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
 {
+       btrfs_destroy_ordered_operations(cur_trans, root);
+
        btrfs_destroy_delayed_refs(cur_trans, root);
-       btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
-                               cur_trans->dirty_pages.dirty_bytes);
 
        cur_trans->state = TRANS_STATE_COMMIT_START;
        wake_up(&root->fs_info->transaction_blocked_wait);
 
-       btrfs_evict_pending_snapshots(cur_trans);
-
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
 
@@ -4059,63 +4039,51 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 static int btrfs_cleanup_transaction(struct btrfs_root *root)
 {
        struct btrfs_transaction *t;
-       LIST_HEAD(list);
 
        mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
        spin_lock(&root->fs_info->trans_lock);
-       list_splice_init(&root->fs_info->trans_list, &list);
-       root->fs_info->running_transaction = NULL;
-       spin_unlock(&root->fs_info->trans_lock);
-
-       while (!list_empty(&list)) {
-               t = list_entry(list.next, struct btrfs_transaction, list);
-
-               btrfs_destroy_ordered_operations(t, root);
-
-               btrfs_destroy_all_ordered_extents(root->fs_info);
-
-               btrfs_destroy_delayed_refs(t, root);
-
-               /*
-                *  FIXME: cleanup wait for commit
-                *  We needn't acquire the lock here, because we are during
-                *  the umount, there is no other task which will change it.
-                */
-               t->state = TRANS_STATE_COMMIT_START;
-               smp_mb();
-               if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
-                       wake_up(&root->fs_info->transaction_blocked_wait);
-
-               btrfs_evict_pending_snapshots(t);
-
-               t->state = TRANS_STATE_UNBLOCKED;
-               smp_mb();
-               if (waitqueue_active(&root->fs_info->transaction_wait))
-                       wake_up(&root->fs_info->transaction_wait);
-
-               btrfs_destroy_delayed_inodes(root);
-               btrfs_assert_delayed_root_empty(root);
-
-               btrfs_destroy_all_delalloc_inodes(root->fs_info);
-
-               btrfs_destroy_marked_extents(root, &t->dirty_pages,
-                                            EXTENT_DIRTY);
-
-               btrfs_destroy_pinned_extent(root,
-                                           root->fs_info->pinned_extents);
-
-               t->state = TRANS_STATE_COMPLETED;
-               smp_mb();
-               if (waitqueue_active(&t->commit_wait))
-                       wake_up(&t->commit_wait);
+       while (!list_empty(&root->fs_info->trans_list)) {
+               t = list_first_entry(&root->fs_info->trans_list,
+                                    struct btrfs_transaction, list);
+               if (t->state >= TRANS_STATE_COMMIT_START) {
+                       atomic_inc(&t->use_count);
+                       spin_unlock(&root->fs_info->trans_lock);
+                       btrfs_wait_for_commit(root, t->transid);
+                       btrfs_put_transaction(t);
+                       spin_lock(&root->fs_info->trans_lock);
+                       continue;
+               }
+               if (t == root->fs_info->running_transaction) {
+                       t->state = TRANS_STATE_COMMIT_DOING;
+                       spin_unlock(&root->fs_info->trans_lock);
+                       /*
+                        * We wait for 0 num_writers since we don't hold a trans
+                        * handle open currently for this transaction.
+                        */
+                       wait_event(t->writer_wait,
+                                  atomic_read(&t->num_writers) == 0);
+               } else {
+                       spin_unlock(&root->fs_info->trans_lock);
+               }
+               btrfs_cleanup_one_transaction(t, root);
 
-               atomic_set(&t->use_count, 0);
+               spin_lock(&root->fs_info->trans_lock);
+               if (t == root->fs_info->running_transaction)
+                       root->fs_info->running_transaction = NULL;
                list_del_init(&t->list);
-               memset(t, 0, sizeof(*t));
-               kmem_cache_free(btrfs_transaction_cachep, t);
-       }
+               spin_unlock(&root->fs_info->trans_lock);
 
+               btrfs_put_transaction(t);
+               trace_btrfs_transaction_commit(root);
+               spin_lock(&root->fs_info->trans_lock);
+       }
+       spin_unlock(&root->fs_info->trans_lock);
+       btrfs_destroy_all_ordered_extents(root->fs_info);
+       btrfs_destroy_delayed_inodes(root);
+       btrfs_assert_delayed_root_empty(root);
+       btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
+       btrfs_destroy_all_delalloc_inodes(root->fs_info);
        mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
        return 0;
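
In outline, the rewritten loop above handles each transaction still on fs_info->trans_list according to its state (a paraphrase of the hunk, not new code):

    /* - at or past COMMIT_START: take a use_count reference, drop
     *   trans_lock, btrfs_wait_for_commit(), put the reference, rescan;
     * - the running transaction: mark it COMMIT_DOING and wait for
     *   num_writers to drain to zero before touching it;
     * - then btrfs_cleanup_one_transaction(), unlink it under
     *   trans_lock, and drop it with btrfs_put_transaction(). */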
index 5ce2a7d..53059df 100644
@@ -86,6 +86,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
                                 struct btrfs_root *root);
 void btrfs_free_fs_root(struct btrfs_root *root);
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_root *btrfs_alloc_dummy_root(void);
+#endif
+
 /*
  * This function is used to grab the root, and avoid it is freed when we
  * access it. But it doesn't ensure that the tree is not dropped.
index 4b86916..41422a3 100644
@@ -5,7 +5,6 @@
 #include "btrfs_inode.h"
 #include "print-tree.h"
 #include "export.h"
-#include "compat.h"
 
 #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
                                                 parent_objectid) / 4)
index d58bef1..45d98d0 100644
@@ -25,7 +25,6 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/percpu_counter.h>
-#include "compat.h"
 #include "hash.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -1551,9 +1550,8 @@ again:
        if (ret && !insert) {
                err = -ENOENT;
                goto out;
-       } else if (ret) {
+       } else if (WARN_ON(ret)) {
                err = -EIO;
-               WARN_ON(1);
                goto out;
        }
 
@@ -1979,7 +1977,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        struct btrfs_extent_item *item;
        u64 refs;
        int ret;
-       int err = 0;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -1992,14 +1989,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                           path, bytenr, num_bytes, parent,
                                           root_objectid, owner, offset,
                                           refs_to_add, extent_op);
-       if (ret == 0)
+       if (ret != -EAGAIN)
                goto out;
 
-       if (ret != -EAGAIN) {
-               err = ret;
-               goto out;
-       }
-
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, item);
@@ -2021,7 +2013,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                btrfs_abort_transaction(trans, root, ret);
 out:
        btrfs_free_path(path);
-       return err;
+       return ret;
 }
 
 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
@@ -2137,15 +2129,28 @@ again:
        }
        if (ret > 0) {
                if (metadata) {
-                       btrfs_release_path(path);
-                       metadata = 0;
+                       if (path->slots[0] > 0) {
+                               path->slots[0]--;
+                               btrfs_item_key_to_cpu(path->nodes[0], &key,
+                                                     path->slots[0]);
+                               if (key.objectid == node->bytenr &&
+                                   key.type == BTRFS_EXTENT_ITEM_KEY &&
+                                   key.offset == node->num_bytes)
+                                       ret = 0;
+                       }
+                       if (ret > 0) {
+                               btrfs_release_path(path);
+                               metadata = 0;
 
-                       key.offset = node->num_bytes;
-                       key.type = BTRFS_EXTENT_ITEM_KEY;
-                       goto again;
+                               key.objectid = node->bytenr;
+                               key.offset = node->num_bytes;
+                               key.type = BTRFS_EXTENT_ITEM_KEY;
+                               goto again;
+                       }
+               } else {
+                       err = -EIO;
+                       goto out;
                }
-               err = -EIO;
-               goto out;
        }
 
        leaf = path->nodes[0];
@@ -2234,8 +2239,12 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 {
        int ret = 0;
 
-       if (trans->aborted)
+       if (trans->aborted) {
+               if (insert_reserved)
+                       btrfs_pin_extent(root, node->bytenr,
+                                        node->num_bytes, 1);
                return 0;
+       }
 
        if (btrfs_delayed_ref_is_head(node)) {
                struct btrfs_delayed_ref_head *head;
@@ -2411,6 +2420,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                                btrfs_free_delayed_extent_op(extent_op);
 
                                if (ret) {
+                                       /*
+                                        * Need to reset must_insert_reserved if
+                                        * can clean up the reserved space
+                                        * can cleanup the reserved space
+                                        * properly.
+                                        */
+                                       if (must_insert_reserved)
+                                               locked_ref->must_insert_reserved = 1;
                                        btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
                                        spin_lock(&delayed_refs->lock);
                                        btrfs_delayed_ref_unlock(locked_ref);
@@ -3197,8 +3214,7 @@ again:
                if (ret)
                        goto out_put;
 
-               ret = btrfs_truncate_free_space_cache(root, trans, path,
-                                                     inode);
+               ret = btrfs_truncate_free_space_cache(root, trans, inode);
                if (ret)
                        goto out_put;
        }
@@ -3318,10 +3334,9 @@ again:
                last = cache->key.objectid + cache->key.offset;
 
                err = write_one_cache_group(trans, root, path, cache);
+               btrfs_put_block_group(cache);
                if (err) /* File system offline */
                        goto out;
-
-               btrfs_put_block_group(cache);
        }
 
        while (1) {
@@ -3605,10 +3620,9 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
        /* make sure bytes are sectorsize aligned */
        bytes = ALIGN(bytes, root->sectorsize);
 
-       if (root == root->fs_info->tree_root ||
-           BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
-               alloc_chunk = 0;
+       if (btrfs_is_free_space_inode(inode)) {
                committed = 1;
+               ASSERT(current->journal_info);
        }
 
        data_sinfo = fs_info->data_sinfo;
@@ -3636,6 +3650,16 @@ again:
                        spin_unlock(&data_sinfo->lock);
 alloc:
                        alloc_target = btrfs_get_alloc_profile(root, 1);
+                       /*
+                        * It is ugly that we don't call a nolock join
+                        * transaction for the free space inode case here.
+                        * But it is safe: we only do the data space
+                        * reservation for the free space cache in the
+                        * transaction context, and the common join transaction
+                        * just increases the use count of the current
+                        * transaction handle; it doesn't try to acquire the
+                        * trans_lock of the fs.
+                        */
                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
@@ -3681,6 +3705,9 @@ commit_trans:
                        goto again;
                }
 
+               trace_btrfs_space_reservation(root->fs_info,
+                                             "space_info:enospc",
+                                             data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
        data_sinfo->bytes_may_use += bytes;
@@ -3989,12 +4016,26 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
                 * the filesystem is readonly(all dirty pages are written to
                 * the disk).
                 */
-               btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+               btrfs_start_delalloc_roots(root->fs_info, 0);
                if (!current->journal_info)
-                       btrfs_wait_all_ordered_extents(root->fs_info);
+                       btrfs_wait_ordered_roots(root->fs_info, -1);
        }
 }
 
+static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+{
+       u64 bytes;
+       int nr;
+
+       bytes = btrfs_calc_trans_metadata_size(root, 1);
+       nr = (int)div64_u64(to_reclaim, bytes);
+       if (!nr)
+               nr = 1;
+       return nr;
+}
+
+#define EXTENT_SIZE_PER_ITEM   (256 * 1024)
+
 /*
  * shrink metadata reservation for delalloc
  */
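
shrink_delalloc() now converts its byte target into a count of metadata items and back into a flush target at an assumed 256K of extent data per item. A worked example under assumed parameters (the ~96K per-item figure is the 3.13-era metadata-size formula with 4K nodes, purely illustrative):

    #include <stdio.h>
    #include <stdint.h>

    #define EXTENT_SIZE_PER_ITEM (256 * 1024)

    static int calc_reclaim_items_nr(uint64_t to_reclaim, uint64_t per_item)
    {
            uint64_t nr = to_reclaim / per_item;
            return nr ? (int)nr : 1;   /* always flush at least one item */
    }

    int main(void)
    {
            uint64_t per_item = 96 * 1024;   /* assumed metadata size per item */
            int items = calc_reclaim_items_nr(1024 * 1024, per_item);
            /* 1M of reservation -> 10 items -> flush 10 * 256K = 2560K of delalloc */
            printf("items=%d, flush=%d KiB\n",
                   items, items * EXTENT_SIZE_PER_ITEM / 1024);
            return 0;
    }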
@@ -4007,24 +4048,30 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
        u64 delalloc_bytes;
        u64 max_reclaim;
        long time_left;
-       unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
-       int loops = 0;
+       unsigned long nr_pages;
+       int loops;
+       int items;
        enum btrfs_reserve_flush_enum flush;
 
+       /* Calculate the number of items we need to flush for the space reservation */
+       items = calc_reclaim_items_nr(root, to_reclaim);
+       to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        space_info = block_rsv->space_info;
 
-       smp_mb();
        delalloc_bytes = percpu_counter_sum_positive(
                                                &root->fs_info->delalloc_bytes);
        if (delalloc_bytes == 0) {
                if (trans)
                        return;
-               btrfs_wait_all_ordered_extents(root->fs_info);
+               if (wait_ordered)
+                       btrfs_wait_ordered_roots(root->fs_info, items);
                return;
        }
 
+       loops = 0;
        while (delalloc_bytes && loops < 3) {
                max_reclaim = min(delalloc_bytes, to_reclaim);
                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
@@ -4033,9 +4080,19 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                 * We need to wait for the async pages to actually start before
                 * we do anything.
                 */
-               wait_event(root->fs_info->async_submit_wait,
-                          !atomic_read(&root->fs_info->async_delalloc_pages));
+               max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+               if (!max_reclaim)
+                       goto skip_async;
+
+               if (max_reclaim <= nr_pages)
+                       max_reclaim = 0;
+               else
+                       max_reclaim -= nr_pages;
 
+               wait_event(root->fs_info->async_submit_wait,
+                          atomic_read(&root->fs_info->async_delalloc_pages) <=
+                          (int)max_reclaim);
+skip_async:
                if (!trans)
                        flush = BTRFS_RESERVE_FLUSH_ALL;
                else
@@ -4049,13 +4106,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
                loops++;
                if (wait_ordered && !trans) {
-                       btrfs_wait_all_ordered_extents(root->fs_info);
+                       btrfs_wait_ordered_roots(root->fs_info, items);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
-               smp_mb();
                delalloc_bytes = percpu_counter_sum_positive(
                                                &root->fs_info->delalloc_bytes);
        }
@@ -4140,16 +4196,11 @@ static int flush_space(struct btrfs_root *root,
        switch (state) {
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
-               if (state == FLUSH_DELAYED_ITEMS_NR) {
-                       u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
-
-                       nr = (int)div64_u64(num_bytes, bytes);
-                       if (!nr)
-                               nr = 1;
-                       nr *= 2;
-               } else {
+               if (state == FLUSH_DELAYED_ITEMS_NR)
+                       nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+               else
                        nr = -1;
-               }
+
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
@@ -4332,6 +4383,10 @@ out:
                    !block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
+       if (ret == -ENOSPC)
+               trace_btrfs_space_reservation(root->fs_info,
+                                             "space_info:enospc",
+                                             space_info->flags, orig_bytes, 1);
        if (flushing) {
                spin_lock(&space_info->lock);
                space_info->flush = 0;
@@ -4986,7 +5041,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 
        if (to_reserve)
-               trace_btrfs_space_reservation(root->fs_info,"delalloc",
+               trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                              btrfs_ino(inode), to_reserve, 1);
        block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
@@ -5264,6 +5319,8 @@ static int pin_down_extent(struct btrfs_root *root,
 
        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+       if (reserved)
+               trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
        return 0;
 }
 
@@ -5718,9 +5775,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        }
                        extent_slot = path->slots[0];
                }
-       } else if (ret == -ENOENT) {
+       } else if (WARN_ON(ret == -ENOENT)) {
                btrfs_print_leaf(extent_root, path->nodes[0]);
-               WARN_ON(1);
                btrfs_err(info,
                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
                        bytenr, parent, root_objectid, owner_objectid,
@@ -5967,6 +6023,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
                btrfs_add_free_space(cache, buf->start, buf->len);
                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+               trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
                pin = 0;
        }
 out:
@@ -6594,8 +6651,6 @@ again:
                }
        }
 
-       trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
-
        return ret;
 }
 
@@ -6707,6 +6762,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                        ins->objectid, ins->offset);
                BUG();
        }
+       trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
        return ret;
 }
 
@@ -6731,13 +6787,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                size += sizeof(*block_info);
 
        path = btrfs_alloc_path();
-       if (!path)
+       if (!path) {
+               btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+                                                  root->leafsize);
                return -ENOMEM;
+       }
 
        path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
                                      ins, size);
        if (ret) {
+               btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+                                                  root->leafsize);
                btrfs_free_path(path);
                return ret;
        }
@@ -6779,6 +6840,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                        ins->objectid, ins->offset);
                BUG();
        }
+
+       trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
        return ret;
 }
 
@@ -7983,7 +8046,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 
        spin_lock(&sinfo->lock);
 
-       for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                if (!list_empty(&sinfo->block_groups[i]))
                        free_bytes += __btrfs_get_ro_block_group_free_space(
                                                &sinfo->block_groups[i]);
@@ -8271,15 +8334,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 
        release_global_block_rsv(info);
 
-       while(!list_empty(&info->space_info)) {
+       while (!list_empty(&info->space_info)) {
                space_info = list_entry(info->space_info.next,
                                        struct btrfs_space_info,
                                        list);
                if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
-                       if (space_info->bytes_pinned > 0 ||
+                       if (WARN_ON(space_info->bytes_pinned > 0 ||
                            space_info->bytes_reserved > 0 ||
-                           space_info->bytes_may_use > 0) {
-                               WARN_ON(1);
+                           space_info->bytes_may_use > 0)) {
                                dump_space_info(space_info, 0, 0);
                        }
                }
index 51731b7..856bc2b 100644
 #include <linux/cleancache.h>
 #include "extent_io.h"
 #include "extent_map.h"
-#include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "check-integrity.h"
 #include "locking.h"
 #include "rcu-string.h"
+#include "backref.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1597,11 +1597,10 @@ done:
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct inode *inode,
-                                            struct extent_io_tree *tree,
-                                            struct page *locked_page,
-                                            u64 *start, u64 *end,
-                                            u64 max_bytes)
+STATIC u64 find_lock_delalloc_range(struct inode *inode,
+                                   struct extent_io_tree *tree,
+                                   struct page *locked_page, u64 *start,
+                                   u64 *end, u64 max_bytes)
 {
        u64 delalloc_start;
        u64 delalloc_end;
@@ -1740,10 +1739,8 @@ u64 count_range_bits(struct extent_io_tree *tree,
        u64 last = 0;
        int found = 0;
 
-       if (search_end <= cur_start) {
-               WARN_ON(1);
+       if (WARN_ON(search_end <= cur_start))
                return 0;
-       }
 
        spin_lock(&tree->lock);
        if (cur_start == 0 && bits == EXTENT_DIRTY) {
@@ -3569,9 +3566,8 @@ retry:
                         * but no sense in crashing the users box for something
                         * we can survive anyway.
                         */
-                       if (!eb) {
+                       if (WARN_ON(!eb)) {
                                spin_unlock(&mapping->private_lock);
-                               WARN_ON(1);
                                continue;
                        }
 
@@ -4038,7 +4034,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
        if (offset >= last)
                return NULL;
 
-       while(1) {
+       while (1) {
                len = last - offset;
                if (len == 0)
                        break;
@@ -4062,6 +4058,19 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
        return NULL;
 }
 
+static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
+{
+       unsigned long cnt = *((unsigned long *)ctx);
+
+       cnt++;
+       *((unsigned long *)ctx) = cnt;
+
+       /* Now we're sure that the extent is shared. */
+       if (cnt > 1)
+               return 1;
+       return 0;
+}
+
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -4128,7 +4137,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                last = found_key.offset;
                last_for_get_extent = last + 1;
        }
-       btrfs_free_path(path);
+       btrfs_release_path(path);
 
        /*
         * we might have some extents allocated but more delalloc past those
@@ -4198,7 +4207,24 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        flags |= (FIEMAP_EXTENT_DELALLOC |
                                  FIEMAP_EXTENT_UNKNOWN);
                } else {
+                       unsigned long ref_cnt = 0;
+
                        disko = em->block_start + offset_in_extent;
+
+                       /*
+                        * As btrfs supports shared space, this information
+                        * can be exported to userspace tools via
+                        * flag FIEMAP_EXTENT_SHARED.
+                        */
+                       ret = iterate_inodes_from_logical(
+                                       em->block_start,
+                                       BTRFS_I(inode)->root->fs_info,
+                                       path, count_ext_ref, &ref_cnt);
+                       if (ret < 0 && ret != -ENOENT)
+                               goto out_free;
+
+                       if (ref_cnt > 1)
+                               flags |= FIEMAP_EXTENT_SHARED;
                }
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
@@ -4230,6 +4256,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
        free_extent_map(em);
 out:
+       btrfs_free_path(path);
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
                             &cached_state, GFP_NOFS);
        return ret;
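
The fiemap hunks above count references with iterate_inodes_from_logical() and set FIEMAP_EXTENT_SHARED once more than one reference to the extent is found; note the switch from btrfs_free_path() to btrfs_release_path(), which keeps the path allocated for that walk and defers the free to the out: label. Userspace sees the flag through the ordinary fiemap ioctl; a minimal sketch (takes a path argument, error reporting trimmed):

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(int argc, char **argv)
	{
		char buf[sizeof(struct fiemap) + sizeof(struct fiemap_extent)] = {0};
		struct fiemap *fm = (struct fiemap *)buf;
		int fd;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		fm->fm_length = ~0ULL;		/* map the whole file */
		fm->fm_extent_count = 1;	/* room for one extent */
		if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
			return 1;
		if (fm->fm_mapped_extents &&
		    (fm->fm_extents[0].fe_flags & FIEMAP_EXTENT_SHARED))
			printf("first extent is shared\n");
		return 0;
	}
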
@@ -4455,6 +4482,23 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
        }
 }
 
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+                                                       u64 start)
+{
+       struct extent_buffer *eb;
+
+       rcu_read_lock();
+       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+       if (eb && atomic_inc_not_zero(&eb->refs)) {
+               rcu_read_unlock();
+               mark_extent_buffer_accessed(eb);
+               return eb;
+       }
+       rcu_read_unlock();
+
+       return NULL;
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len)
 {
@@ -4468,14 +4512,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        int uptodate = 1;
        int ret;
 
-       rcu_read_lock();
-       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-       if (eb && atomic_inc_not_zero(&eb->refs)) {
-               rcu_read_unlock();
-               mark_extent_buffer_accessed(eb);
+
+       eb = find_extent_buffer(tree, start);
+       if (eb)
                return eb;
-       }
-       rcu_read_unlock();
 
        eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
        if (!eb)
@@ -4534,24 +4574,17 @@ again:
 
        spin_lock(&tree->buffer_lock);
        ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+       spin_unlock(&tree->buffer_lock);
+       radix_tree_preload_end();
        if (ret == -EEXIST) {
-               exists = radix_tree_lookup(&tree->buffer,
-                                               start >> PAGE_CACHE_SHIFT);
-               if (!atomic_inc_not_zero(&exists->refs)) {
-                       spin_unlock(&tree->buffer_lock);
-                       radix_tree_preload_end();
-                       exists = NULL;
+               exists = find_extent_buffer(tree, start);
+               if (exists)
+                       goto free_eb;
+               else
                        goto again;
-               }
-               spin_unlock(&tree->buffer_lock);
-               radix_tree_preload_end();
-               mark_extent_buffer_accessed(exists);
-               goto free_eb;
        }
        /* add one reference for the tree */
        check_buffer_tree_ref(eb);
-       spin_unlock(&tree->buffer_lock);
-       radix_tree_preload_end();
 
        /*
         * there is a race where release page may have
@@ -4582,23 +4615,6 @@ free_eb:
        return exists;
 }
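
The restructured insertion above (and the deletion of the now-redundant copy of find_extent_buffer() just below) follows the stock radix-tree insert idiom: preload outside the lock, insert under the spinlock, drop the preload, and on -EEXIST either take a reference on the race winner via the lookup helper or retry if the winner is already dying. In generic form, with lookup_and_ref() as an illustrative stand-in for find_extent_buffer(), not the patch's code:

	ret = radix_tree_preload(GFP_NOFS);	/* may sleep, so done unlocked */
	if (ret)
		goto fail;
	spin_lock(&tree_lock);
	ret = radix_tree_insert(&tree, index, item);
	spin_unlock(&tree_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		exists = lookup_and_ref(index);	/* RCU lookup + atomic_inc_not_zero() */
		if (exists)
			goto free_ours;		/* lost the race, use the winner */
		goto again;			/* winner already dying, retry */
	}
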
 
-struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
-                                        u64 start, unsigned long len)
-{
-       struct extent_buffer *eb;
-
-       rcu_read_lock();
-       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-       if (eb && atomic_inc_not_zero(&eb->refs)) {
-               rcu_read_unlock();
-               mark_extent_buffer_accessed(eb);
-               return eb;
-       }
-       rcu_read_unlock();
-
-       return NULL;
-}
-
 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
 {
        struct extent_buffer *eb =
@@ -5062,23 +5078,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
        }
 }
 
-static void move_pages(struct page *dst_page, struct page *src_page,
-                      unsigned long dst_off, unsigned long src_off,
-                      unsigned long len)
-{
-       char *dst_kaddr = page_address(dst_page);
-       if (dst_page == src_page) {
-               memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
-       } else {
-               char *src_kaddr = page_address(src_page);
-               char *p = dst_kaddr + dst_off + len;
-               char *s = src_kaddr + src_off + len;
-
-               while (len--)
-                       *--p = *--s;
-       }
-}
-
 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
 {
        unsigned long distance = (src > dst) ? src - dst : dst - src;
@@ -5189,7 +5188,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 
                cur = min_t(unsigned long, len, src_off_in_page + 1);
                cur = min(cur, dst_off_in_page + 1);
-               move_pages(extent_buffer_page(dst, dst_i),
+               copy_pages(extent_buffer_page(dst, dst_i),
                           extent_buffer_page(dst, src_i),
                           dst_off_in_page - cur + 1,
                           src_off_in_page - cur + 1, cur);
index 6dbc645..19620c5 100644
@@ -271,7 +271,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
-                                        u64 start, unsigned long len);
+                                        u64 start);
 void free_extent_buffer(struct extent_buffer *eb);
 void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE      0
@@ -345,4 +345,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
                         int mirror_num);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+noinline u64 find_lock_delalloc_range(struct inode *inode,
+                                     struct extent_io_tree *tree,
+                                     struct page *locked_page, u64 *start,
+                                     u64 *end, u64 max_bytes);
+#endif
 #endif
index 61adc44..93fba71 100644
@@ -3,10 +3,10 @@
 
 #include <linux/rbtree.h>
 
-#define EXTENT_MAP_LAST_BYTE (u64)-4
-#define EXTENT_MAP_HOLE (u64)-3
-#define EXTENT_MAP_INLINE (u64)-2
-#define EXTENT_MAP_DELALLOC (u64)-1
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+#define EXTENT_MAP_DELALLOC ((u64)-1)
 
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
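
Parenthesizing the cast expressions is defensive macro hygiene, not just style: a cast does not bind more tightly than everything that can follow the macro at its use site. sizeof is the classic trap; a small illustration (not from the patch):

	#define BAD	(u64)-3
	#define GOOD	((u64)-3)

	sizeof BAD	/* parses as (sizeof(u64)) - 3 == 5 */
	sizeof GOOD	/* parses as sizeof((u64)-3)  == 8 */
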
index 4f53159..6f38488 100644
@@ -329,6 +329,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
        u64 csum_end;
        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
+       ASSERT(start == ALIGN(start, root->sectorsize) &&
+              (end + 1) == ALIGN(end + 1, root->sectorsize));
+
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
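
The new ASSERT documents that callers must pass a sector-aligned range: for a power-of-two alignment a, ALIGN(x, a) rounds x up to the next multiple of a, so x == ALIGN(x, a) holds exactly when x is already aligned. Roughly, in the simplified form of the kernel macro:

	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))	/* a: power of two */

	ALIGN(4096, 4096) == 4096	/* aligned: the ASSERT holds  */
	ALIGN(4097, 4096) == 8192	/* unaligned: the ASSERT fires */
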
@@ -846,10 +849,8 @@ insert:
        path->leave_spinning = 0;
        if (ret < 0)
                goto fail_unlock;
-       if (ret != 0) {
-               WARN_ON(1);
+       if (WARN_ON(ret != 0))
                goto fail_unlock;
-       }
        leaf = path->nodes[0];
 csum:
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
index 72da4df..82d0342 100644
@@ -39,7 +39,6 @@
 #include "print-tree.h"
 #include "tree-log.h"
 #include "locking.h"
-#include "compat.h"
 #include "volumes.h"
 
 static struct kmem_cache *btrfs_inode_defrag_cachep;
@@ -370,7 +369,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
        u64 root_objectid = 0;
 
        atomic_inc(&fs_info->defrag_running);
-       while(1) {
+       while (1) {
                /* Pause the auto defragger. */
                if (test_bit(BTRFS_FS_STATE_REMOUNTING,
                             &fs_info->fs_state))
@@ -1281,6 +1280,7 @@ again:
                }
                wait_on_page_writeback(pages[i]);
        }
+       faili = num_pages - 1;
        err = 0;
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
@@ -1299,8 +1299,10 @@ again:
                                unlock_page(pages[i]);
                                page_cache_release(pages[i]);
                        }
-                       btrfs_wait_ordered_range(inode, start_pos,
-                                                last_pos - start_pos);
+                       err = btrfs_wait_ordered_range(inode, start_pos,
+                                                      last_pos - start_pos);
+                       if (err)
+                               goto fail;
                        goto again;
                }
                if (ordered)
@@ -1809,8 +1811,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        atomic_inc(&root->log_batch);
        full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                             &BTRFS_I(inode)->runtime_flags);
-       if (full_sync)
-               btrfs_wait_ordered_range(inode, start, end - start + 1);
+       if (full_sync) {
+               ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       goto out;
+               }
+       }
        atomic_inc(&root->log_batch);
 
        /*
@@ -1876,27 +1883,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        mutex_unlock(&inode->i_mutex);
 
        if (ret != BTRFS_NO_LOG_SYNC) {
-               if (ret > 0) {
-                       /*
-                        * If we didn't already wait for ordered extents we need
-                        * to do that now.
-                        */
-                       if (!full_sync)
-                               btrfs_wait_ordered_range(inode, start,
-                                                        end - start + 1);
-                       ret = btrfs_commit_transaction(trans, root);
-               } else {
+               if (!ret) {
                        ret = btrfs_sync_log(trans, root);
-                       if (ret == 0) {
+                       if (!ret) {
                                ret = btrfs_end_transaction(trans, root);
-                       } else {
-                               if (!full_sync)
-                                       btrfs_wait_ordered_range(inode, start,
-                                                                end -
-                                                                start + 1);
-                               ret = btrfs_commit_transaction(trans, root);
+                               goto out;
                        }
                }
+               if (!full_sync) {
+                       ret = btrfs_wait_ordered_range(inode, start,
+                                                      end - start + 1);
+                       if (ret)
+                               goto out;
+               }
+               ret = btrfs_commit_transaction(trans, root);
        } else {
                ret = btrfs_end_transaction(trans, root);
        }
@@ -2067,7 +2067,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
                          ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 
-       btrfs_wait_ordered_range(inode, offset, len);
+       ret = btrfs_wait_ordered_range(inode, offset, len);
+       if (ret)
+               return ret;
 
        mutex_lock(&inode->i_mutex);
        /*
@@ -2136,8 +2138,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                        btrfs_put_ordered_extent(ordered);
                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
                                     lockend, &cached_state, GFP_NOFS);
-               btrfs_wait_ordered_range(inode, lockstart,
-                                        lockend - lockstart + 1);
+               ret = btrfs_wait_ordered_range(inode, lockstart,
+                                              lockend - lockstart + 1);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
        }
 
        path = btrfs_alloc_path();
@@ -2308,7 +2314,10 @@ static long btrfs_fallocate(struct file *file, int mode,
         * wait for ordered IO before we have any locks.  We'll loop again
         * below with the locks held.
         */
-       btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+       ret = btrfs_wait_ordered_range(inode, alloc_start,
+                                      alloc_end - alloc_start);
+       if (ret)
+               goto out;
 
        locked_end = alloc_end - 1;
        while (1) {
@@ -2332,8 +2341,10 @@ static long btrfs_fallocate(struct file *file, int mode,
                         * we can't wait on the range with the transaction
                         * running or with the extent lock held
                         */
-                       btrfs_wait_ordered_range(inode, alloc_start,
-                                                alloc_end - alloc_start);
+                       ret = btrfs_wait_ordered_range(inode, alloc_start,
+                                                      alloc_end - alloc_start);
+                       if (ret)
+                               goto out;
                } else {
                        if (ordered)
                                btrfs_put_ordered_extent(ordered);
@@ -2405,14 +2416,12 @@ out_reserve_fail:
 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_map *em;
+       struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        u64 lockstart = *offset;
        u64 lockend = i_size_read(inode);
        u64 start = *offset;
-       u64 orig_start = *offset;
        u64 len = i_size_read(inode);
-       u64 last_end = 0;
        int ret = 0;
 
        lockend = max_t(u64, root->sectorsize, lockend);
@@ -2429,89 +2438,35 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
        lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
                         &cached_state);
 
-       /*
-        * Delalloc is such a pain.  If we have a hole and we have pending
-        * delalloc for a portion of the hole we will get back a hole that
-        * exists for the entire range since it hasn't been actually written
-        * yet.  So to take care of this case we need to look for an extent just
-        * before the position we want in case there is outstanding delalloc
-        * going on here.
-        */
-       if (whence == SEEK_HOLE && start != 0) {
-               if (start <= root->sectorsize)
-                       em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
-                                                    root->sectorsize, 0);
-               else
-                       em = btrfs_get_extent_fiemap(inode, NULL, 0,
-                                                    start - root->sectorsize,
-                                                    root->sectorsize, 0);
-               if (IS_ERR(em)) {
-                       ret = PTR_ERR(em);
-                       goto out;
-               }
-               last_end = em->start + em->len;
-               if (em->block_start == EXTENT_MAP_DELALLOC)
-                       last_end = min_t(u64, last_end, inode->i_size);
-               free_extent_map(em);
-       }
-
-       while (1) {
+       while (start < inode->i_size) {
                em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
+                       em = NULL;
                        break;
                }
 
-               if (em->block_start == EXTENT_MAP_HOLE) {
-                       if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
-                               if (last_end <= orig_start) {
-                                       free_extent_map(em);
-                                       ret = -ENXIO;
-                                       break;
-                               }
-                       }
-
-                       if (whence == SEEK_HOLE) {
-                               *offset = start;
-                               free_extent_map(em);
-                               break;
-                       }
-               } else {
-                       if (whence == SEEK_DATA) {
-                               if (em->block_start == EXTENT_MAP_DELALLOC) {
-                                       if (start >= inode->i_size) {
-                                               free_extent_map(em);
-                                               ret = -ENXIO;
-                                               break;
-                                       }
-                               }
-
-                               if (!test_bit(EXTENT_FLAG_PREALLOC,
-                                             &em->flags)) {
-                                       *offset = start;
-                                       free_extent_map(em);
-                                       break;
-                               }
-                       }
-               }
+               if (whence == SEEK_HOLE &&
+                   (em->block_start == EXTENT_MAP_HOLE ||
+                    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+                       break;
+               else if (whence == SEEK_DATA &&
+                          (em->block_start != EXTENT_MAP_HOLE &&
+                           !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+                       break;
 
                start = em->start + em->len;
-               last_end = em->start + em->len;
-
-               if (em->block_start == EXTENT_MAP_DELALLOC)
-                       last_end = min_t(u64, last_end, inode->i_size);
-
-               if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
-                       free_extent_map(em);
-                       ret = -ENXIO;
-                       break;
-               }
                free_extent_map(em);
+               em = NULL;
                cond_resched();
        }
-       if (!ret)
-               *offset = min(*offset, inode->i_size);
-out:
+       free_extent_map(em);
+       if (!ret) {
+               if (whence == SEEK_DATA && start >= inode->i_size)
+                       ret = -ENXIO;
+               else
+                       *offset = min_t(loff_t, start, inode->i_size);
+       }
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
        return ret;
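
The rewritten find_desired_extent() gives btrfs straightforward SEEK_HOLE/SEEK_DATA semantics: walk the extent maps from the requested offset, stop at the first hole (or prealloc) for SEEK_HOLE and at the first real data for SEEK_DATA, and return -ENXIO when SEEK_DATA runs past i_size. From userspace this is driven through lseek(); a hedged sketch:

	#define _GNU_SOURCE		/* SEEK_DATA / SEEK_HOLE */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd;
		off_t data, hole;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		data = lseek(fd, 0, SEEK_DATA);	/* -1 with ENXIO if no data follows  */
		hole = lseek(fd, 0, SEEK_HOLE);	/* there is always a hole at i_size  */
		printf("first data at %lld, first hole at %lld\n",
		       (long long)data, (long long)hole);
		return 0;
	}
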
index b4f9904..057be95 100644
@@ -218,7 +218,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
                                    struct btrfs_trans_handle *trans,
-                                   struct btrfs_path *path,
                                    struct inode *inode)
 {
        int ret = 0;
@@ -1009,8 +1008,13 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        if (ret)
                goto out;
 
-
-       btrfs_wait_ordered_range(inode, 0, (u64)-1);
+       ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+       if (ret) {
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+                                EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+                                GFP_NOFS);
+               goto out;
+       }
 
        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.offset = offset;
@@ -2276,7 +2280,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
                goto out;
 
        entry = rb_entry(node, struct btrfs_free_space, offset_index);
-       while(1) {
+       while (1) {
                if (entry->bytes < bytes && entry->bytes > *max_extent_size)
                        *max_extent_size = entry->bytes;
 
@@ -2967,19 +2971,15 @@ out:
 
 int btrfs_write_out_ino_cache(struct btrfs_root *root,
                              struct btrfs_trans_handle *trans,
-                             struct btrfs_path *path)
+                             struct btrfs_path *path,
+                             struct inode *inode)
 {
        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
-       struct inode *inode;
        int ret;
 
        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
                return 0;
 
-       inode = lookup_free_ino_inode(root, path);
-       if (IS_ERR(inode))
-               return 0;
-
        ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
        if (ret) {
                btrfs_delalloc_release_metadata(inode, inode->i_size);
@@ -2990,7 +2990,6 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 #endif
        }
 
-       iput(inode);
        return ret;
 }
 
index e737f92..0cf4977 100644
@@ -58,7 +58,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
                                       struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
                                    struct btrfs_trans_handle *trans,
-                                   struct btrfs_path *path,
                                    struct inode *inode);
 int load_free_space_cache(struct btrfs_fs_info *fs_info,
                          struct btrfs_block_group_cache *block_group);
@@ -76,7 +75,8 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info,
                        struct btrfs_root *root);
 int btrfs_write_out_ino_cache(struct btrfs_root *root,
                              struct btrfs_trans_handle *trans,
-                             struct btrfs_path *path);
+                             struct btrfs_path *path,
+                             struct inode *inode);
 
 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
index e0b7034..ec82fae 100644
@@ -369,7 +369,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
                goto out;
 
        leaf = path->nodes[0];
-       item = btrfs_item_nr(leaf, path->slots[0]);
+       item = btrfs_item_nr(path->slots[0]);
        ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
        ptr += btrfs_item_size(leaf, item) - ins_len;
        extref = (struct btrfs_inode_extref *)ptr;
index 2c66ddb..ab485e5 100644
@@ -78,10 +78,8 @@ again:
                            btrfs_transaction_in_commit(fs_info)) {
                                leaf = path->nodes[0];
 
-                               if (btrfs_header_nritems(leaf) == 0) {
-                                       WARN_ON(1);
+                               if (WARN_ON(btrfs_header_nritems(leaf) == 0))
                                        break;
-                               }
 
                                /*
                                 * Save the key so we can advance forward
@@ -237,7 +235,7 @@ again:
                start_caching(root);
 
                if (objectid <= root->cache_progress ||
-                   objectid > root->highest_objectid)
+                   objectid >= root->highest_objectid)
                        __btrfs_add_free_space(ctl, objectid, 1);
                else
                        __btrfs_add_free_space(pinned, objectid, 1);
@@ -412,8 +410,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
                return 0;
 
        /* Don't save inode cache if we are deleting this root */
-       if (btrfs_root_refs(&root->root_item) == 0 &&
-           root != root->fs_info->tree_root)
+       if (btrfs_root_refs(&root->root_item) == 0)
                return 0;
 
        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
@@ -467,7 +464,7 @@ again:
        }
 
        if (i_size_read(inode) > 0) {
-               ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+               ret = btrfs_truncate_free_space_cache(root, trans, inode);
                if (ret) {
                        if (ret != -ENOSPC)
                                btrfs_abort_transaction(trans, root, ret);
@@ -504,7 +501,7 @@ again:
        }
        btrfs_free_reserved_data_space(inode, prealloc);
 
-       ret = btrfs_write_out_ino_cache(root, trans, path);
+       ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
        iput(inode);
 out_release:
index 51e3afa..da8d2f6 100644
@@ -43,7 +43,6 @@
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
 #include <linux/posix_acl_xattr.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -844,7 +843,10 @@ static noinline int cow_file_range(struct inode *inode,
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;
 
-       BUG_ON(btrfs_is_free_space_inode(inode));
+       if (btrfs_is_free_space_inode(inode)) {
+               WARN_ON_ONCE(1);
+               return -EINVAL;
+       }
 
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
@@ -1178,10 +1180,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
        while (1) {
                ret = btrfs_lookup_file_extent(trans, root, path, ino,
                                               cur_offset, 0);
-               if (ret < 0) {
-                       btrfs_abort_transaction(trans, root, ret);
+               if (ret < 0)
                        goto error;
-               }
                if (ret > 0 && path->slots[0] > 0 && check_prev) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1195,10 +1195,8 @@ next_slot:
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
-                       if (ret < 0) {
-                               btrfs_abort_transaction(trans, root, ret);
+                       if (ret < 0)
                                goto error;
-                       }
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
@@ -1289,10 +1287,8 @@ out_check:
                        ret = cow_file_range(inode, locked_page,
                                             cow_start, found_key.offset - 1,
                                             page_started, nr_written, 1);
-                       if (ret) {
-                               btrfs_abort_transaction(trans, root, ret);
+                       if (ret)
                                goto error;
-                       }
                        cow_start = (u64)-1;
                }
 
@@ -1339,10 +1335,8 @@ out_check:
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
                                                      num_bytes);
-                       if (ret) {
-                               btrfs_abort_transaction(trans, root, ret);
+                       if (ret)
                                goto error;
-                       }
                }
 
                extent_clear_unlock_delalloc(inode, cur_offset,
@@ -1364,10 +1358,8 @@ out_check:
        if (cow_start != (u64)-1) {
                ret = cow_file_range(inode, locked_page, cow_start, end,
                                     page_started, nr_written, 1);
-               if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+               if (ret)
                        goto error;
-               }
        }
 
 error:
@@ -1551,7 +1543,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
 
-               if (*bits & EXTENT_DO_ACCOUNTING)
+               /*
+                * We don't reserve metadata space for space cache inodes so we
+                * don't need to call btrfs_delalloc_release_metadata if there
+                * is an error.
+                */
+               if (*bits & EXTENT_DO_ACCOUNTING &&
+                   root != root->fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len);
 
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
@@ -2041,10 +2039,8 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
                key.offset = offset;
 
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-       if (ret < 0) {
-               WARN_ON(1);
+       if (WARN_ON(ret < 0))
                return ret;
-       }
        ret = 0;
 
        while (1) {
@@ -2367,10 +2363,23 @@ out_unlock:
        return ret;
 }
 
+static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
+{
+       struct old_sa_defrag_extent *old, *tmp;
+
+       if (!new)
+               return;
+
+       list_for_each_entry_safe(old, tmp, &new->head, list) {
+               list_del(&old->list);
+               kfree(old);
+       }
+       kfree(new);
+}
+
 static void relink_file_extents(struct new_sa_defrag_extent *new)
 {
        struct btrfs_path *path;
-       struct old_sa_defrag_extent *old, *tmp;
        struct sa_defrag_extent_backref *backref;
        struct sa_defrag_extent_backref *prev = NULL;
        struct inode *inode;
@@ -2413,16 +2422,11 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
        kfree(prev);
 
        btrfs_free_path(path);
-
-       list_for_each_entry_safe(old, tmp, &new->head, list) {
-               list_del(&old->list);
-               kfree(old);
-       }
 out:
+       free_sa_defrag_extent(new);
+
        atomic_dec(&root->fs_info->defrag_running);
        wake_up(&root->fs_info->transaction_wait);
-
-       kfree(new);
 }
 
 static struct new_sa_defrag_extent *
@@ -2432,7 +2436,7 @@ record_old_file_extents(struct inode *inode,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path;
        struct btrfs_key key;
-       struct old_sa_defrag_extent *old, *tmp;
+       struct old_sa_defrag_extent *old;
        struct new_sa_defrag_extent *new;
        int ret;
 
@@ -2480,7 +2484,7 @@ record_old_file_extents(struct inode *inode,
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
-                               goto out_free_list;
+                               goto out_free_path;
                        else if (ret > 0)
                                break;
                        continue;
@@ -2509,7 +2513,7 @@ record_old_file_extents(struct inode *inode,
 
                old = kmalloc(sizeof(*old), GFP_NOFS);
                if (!old)
-                       goto out_free_list;
+                       goto out_free_path;
 
                offset = max(new->file_pos, key.offset);
                end = min(new->file_pos + new->len, key.offset + num_bytes);
@@ -2531,15 +2535,10 @@ next:
 
        return new;
 
-out_free_list:
-       list_for_each_entry_safe(old, tmp, &new->head, list) {
-               list_del(&old->list);
-               kfree(old);
-       }
 out_free_path:
        btrfs_free_path(path);
 out_kfree:
-       kfree(new);
+       free_sa_defrag_extent(new);
        return NULL;
 }
 
@@ -2710,8 +2709,14 @@ out:
        btrfs_remove_ordered_extent(inode, ordered_extent);
 
        /* for snapshot-aware defrag */
-       if (new)
-               relink_file_extents(new);
+       if (new) {
+               if (ret) {
+                       free_sa_defrag_extent(new);
+                       atomic_dec(&root->fs_info->defrag_running);
+               } else {
+                       relink_file_extents(new);
+               }
+       }
 
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
@@ -2969,6 +2974,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
        if (insert >= 1) {
                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
                if (ret) {
+                       atomic_dec(&root->orphan_inodes);
                        if (reserve) {
                                clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
                                          &BTRFS_I(inode)->runtime_flags);
@@ -3018,14 +3024,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
                release_rsv = 1;
        spin_unlock(&root->orphan_lock);
 
-       if (trans && delete_item)
-               ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-
-       if (release_rsv) {
-               btrfs_orphan_release_metadata(inode);
+       if (delete_item) {
                atomic_dec(&root->orphan_inodes);
+               if (trans)
+                       ret = btrfs_del_orphan_item(trans, root,
+                                                   btrfs_ino(inode));
        }
 
+       if (release_rsv)
+               btrfs_orphan_release_metadata(inode);
+
        return ret;
 }
 
@@ -3172,8 +3180,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
                /* if we have links, this was a truncate, lets do that */
                if (inode->i_nlink) {
-                       if (!S_ISREG(inode->i_mode)) {
-                               WARN_ON(1);
+                       if (WARN_ON(!S_ISREG(inode->i_mode))) {
                                iput(inode);
                                continue;
                        }
@@ -3636,7 +3643,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        int ret;
        ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
        if (!ret) {
-               btrfs_drop_nlink(inode);
+               drop_nlink(inode);
                ret = btrfs_update_inode(trans, root, inode);
        }
        return ret;
@@ -4230,15 +4237,16 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
        while (1) {
                struct btrfs_ordered_extent *ordered;
-               btrfs_wait_ordered_range(inode, hole_start,
-                                        block_end - hole_start);
+
                lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
                                 &cached_state);
-               ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+               ordered = btrfs_lookup_ordered_range(inode, hole_start,
+                                                    block_end - hole_start);
                if (!ordered)
                        break;
                unlock_extent_cached(io_tree, hole_start, block_end - 1,
                                     &cached_state, GFP_NOFS);
+               btrfs_start_ordered_extent(inode, ordered, 1);
                btrfs_put_ordered_extent(ordered);
        }
 
@@ -4472,8 +4480,10 @@ void btrfs_evict_inode(struct inode *inode)
        trace_btrfs_inode_evict(inode);
 
        truncate_inode_pages(&inode->i_data, 0);
-       if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
-                              btrfs_is_free_space_inode(inode)))
+       if (inode->i_nlink &&
+           ((btrfs_root_refs(&root->root_item) != 0 &&
+             root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+            btrfs_is_free_space_inode(inode)))
                goto no_delete;
 
        if (is_bad_inode(inode)) {
@@ -4490,7 +4500,8 @@ void btrfs_evict_inode(struct inode *inode)
        }
 
        if (inode->i_nlink > 0) {
-               BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+               BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
+                      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
                goto no_delete;
        }
 
@@ -4731,14 +4742,7 @@ static void inode_tree_del(struct inode *inode)
        }
        spin_unlock(&root->inode_lock);
 
-       /*
-        * Free space cache has inodes in the tree root, but the tree root has a
-        * root_refs of 0, so this could end up dropping the tree root as a
-        * snapshot, so we need the extra !root->fs_info->tree_root check to
-        * make sure we don't drop it.
-        */
-       if (empty && btrfs_root_refs(&root->root_item) == 0 &&
-           root != root->fs_info->tree_root) {
+       if (empty && btrfs_root_refs(&root->root_item) == 0) {
                synchronize_srcu(&root->fs_info->subvol_srcu);
                spin_lock(&root->inode_lock);
                empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -4831,10 +4835,12 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
 {
        struct inode *inode;
        struct btrfs_iget_args args;
+       unsigned long hashval = btrfs_inode_hash(objectid, root);
+
        args.ino = objectid;
        args.root = root;
 
-       inode = iget5_locked(s, objectid, btrfs_find_actor,
+       inode = iget5_locked(s, hashval, btrfs_find_actor,
                             btrfs_init_locked_inode,
                             (void *)&args);
        return inode;
@@ -5048,7 +5054,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
                        continue;
                }
 
-               item = btrfs_item_nr(leaf, slot);
+               item = btrfs_item_nr(slot);
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
                if (found_key.objectid != key.objectid)
@@ -5454,7 +5460,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                BTRFS_INODE_NODATASUM;
        }
 
-       insert_inode_hash(inode);
+       btrfs_insert_inode_hash(inode);
        inode_tree_add(inode);
 
        trace_btrfs_inode_new(inode);
@@ -5730,7 +5736,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                goto fail;
        }
 
-       btrfs_inc_nlink(inode);
+       inc_nlink(inode);
        inode_inc_iversion(inode);
        inode->i_ctime = CURRENT_TIME;
        ihold(inode);
@@ -5860,7 +5866,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
        compress_type = btrfs_file_extent_compression(leaf, item);
        max_size = btrfs_file_extent_ram_bytes(leaf, item);
        inline_size = btrfs_file_extent_inline_item_len(leaf,
-                                       btrfs_item_nr(leaf, path->slots[0]));
+                                       btrfs_item_nr(path->slots[0]));
        tmp = kmalloc(inline_size, GFP_NOFS);
        if (!tmp)
                return -ENOMEM;
@@ -5974,7 +5980,14 @@ again:
        found_type = btrfs_key_type(&found_key);
        if (found_key.objectid != objectid ||
            found_type != BTRFS_EXTENT_DATA_KEY) {
-               goto not_found;
+               /*
+                * If we back up past the first extent we want to move forward
+                * and see if there is an extent in front of us, otherwise we'll
+                * say there is a hole for our whole search range which can
+                * cause problems.
+                */
+               extent_end = start;
+               goto next;
        }
 
        found_type = btrfs_file_extent_type(leaf, item);
@@ -5989,7 +6002,7 @@ again:
                size = btrfs_file_extent_inline_len(leaf, item);
                extent_end = ALIGN(extent_start + size, root->sectorsize);
        }
-
+next:
        if (start >= extent_end) {
                path->slots[0]++;
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -6249,7 +6262,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
        /* adjust the range_start to make sure it doesn't
         * go backwards from the start they passed in
         */
-       range_start = max(start,range_start);
+       range_start = max(start, range_start);
        found = found_end - range_start;
 
        if (found > 0) {
@@ -7053,7 +7066,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                        }
                } else {
                        submit_len += bvec->bv_len;
-                       nr_pages ++;
+                       nr_pages++;
                        bvec++;
                }
        }
@@ -7222,7 +7235,9 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
         * outstanding dirty pages are on disk.
         */
        count = iov_length(iov, nr_segs);
-       btrfs_wait_ordered_range(inode, offset, count);
+       ret = btrfs_wait_ordered_range(inode, offset, count);
+       if (ret)
+               return ret;
 
        if (rw & WRITE) {
                /*
@@ -7563,7 +7578,10 @@ static int btrfs_truncate(struct inode *inode)
        u64 mask = root->sectorsize - 1;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-       btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+       ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+                                      (u64)-1);
+       if (ret)
+               return ret;
 
        /*
         * Yes, ladies and gentlemen, this is indeed ugly.  The fact is we have
@@ -7787,6 +7805,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        return inode;
 }
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode)
+{
+       btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+       kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+#endif
+
 static void btrfs_i_callback(struct rcu_head *head)
 {
        struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -7857,8 +7883,7 @@ int btrfs_drop_inode(struct inode *inode)
                return 1;
 
        /* the snap/subvol tree is on deleting */
-       if (btrfs_root_refs(&root->root_item) == 0 &&
-           root != root->fs_info->tree_root)
+       if (btrfs_root_refs(&root->root_item) == 0)
                return 1;
        else
                return generic_drop_inode(inode);
@@ -7995,8 +8020,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                if (ret == -EEXIST) {
                        /* we shouldn't get
                         * eexist without a new_inode */
-                       if (!new_inode) {
-                               WARN_ON(1);
+                       if (WARN_ON(!new_inode)) {
                                return ret;
                        }
                } else {
@@ -8144,18 +8168,24 @@ out_notrans:
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
 {
        struct btrfs_delalloc_work *delalloc_work;
+       struct inode *inode;
 
        delalloc_work = container_of(work, struct btrfs_delalloc_work,
                                     work);
-       if (delalloc_work->wait)
-               btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
-       else
-               filemap_flush(delalloc_work->inode->i_mapping);
+       inode = delalloc_work->inode;
+       if (delalloc_work->wait) {
+               btrfs_wait_ordered_range(inode, 0, (u64)-1);
+       } else {
+               filemap_flush(inode->i_mapping);
+               if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                            &BTRFS_I(inode)->runtime_flags))
+                       filemap_flush(inode->i_mapping);
+       }
 
        if (delalloc_work->delay_iput)
-               btrfs_add_delayed_iput(delalloc_work->inode);
+               btrfs_add_delayed_iput(inode);
        else
-               iput(delalloc_work->inode);
+               iput(inode);
        complete(&delalloc_work->completion);
 }
 
@@ -8276,8 +8306,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        return ret;
 }
 
-int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
-                                   int delay_iput)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
 {
        struct btrfs_root *root;
        struct list_head splice;
@@ -8337,14 +8366,14 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        int err;
        int drop_inode = 0;
        u64 objectid;
-       u64 index = 0 ;
+       u64 index = 0;
        int name_len;
        int datasize;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        struct extent_buffer *leaf;
 
-       name_len = strlen(symname) + 1;
+       name_len = strlen(symname);
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
                return -ENAMETOOLONG;
 
@@ -8432,7 +8461,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        inode->i_mapping->a_ops = &btrfs_symlink_aops;
        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        inode_set_bytes(inode, name_len);
-       btrfs_i_size_write(inode, name_len - 1);
+       btrfs_i_size_write(inode, name_len);
        err = btrfs_update_inode(trans, root, inode);
        if (err)
                drop_inode = 1;
@@ -8491,6 +8520,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                                  ins.offset, 0, 0, 0,
                                                  BTRFS_FILE_EXTENT_PREALLOC);
                if (ret) {
+                       btrfs_free_reserved_extent(root, ins.objectid,
+                                                  ins.offset);
                        btrfs_abort_transaction(trans, root, ret);
                        if (own_trans)
                                btrfs_end_transaction(trans, root);
index 9d46f60..1d04b55 100644
@@ -44,7 +44,6 @@
 #include <linux/uuid.h>
 #include <linux/btrfs.h>
 #include <linux/uaccess.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -369,9 +368,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 
 int btrfs_is_empty_uuid(u8 *uuid)
 {
-       static char empty_uuid[BTRFS_UUID_SIZE] = {0};
-
-       return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE);
+       BUILD_BUG_ON(BTRFS_UUID_SIZE > PAGE_SIZE);
+       return !memcmp(uuid, empty_zero_page, BTRFS_UUID_SIZE);
 }
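
Two things changed in btrfs_is_empty_uuid(): the comparison now borrows empty_zero_page, a zero-filled page the kernel always keeps around, instead of a private static buffer, and the BUILD_BUG_ON turns the "UUID fits in a page" assumption into a compile-time check rather than a possible out-of-bounds read. BUILD_BUG_ON fails the build whenever its condition is true, e.g.:

	BUILD_BUG_ON(sizeof(u64) != 8);		   /* condition false: builds   */
	BUILD_BUG_ON(BTRFS_UUID_SIZE > PAGE_SIZE); /* guards the memcmp() above */
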
 
 static noinline int create_subvol(struct inode *dir,
@@ -436,7 +434,7 @@ static noinline int create_subvol(struct inode *dir,
        btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(leaf, objectid);
 
-       write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf),
+       write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);
        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                            btrfs_header_chunk_tree_uuid(leaf),
@@ -574,7 +572,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
        if (ret)
                return ret;
 
-       btrfs_wait_ordered_extents(root);
+       btrfs_wait_ordered_extents(root, -1);
 
        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
        if (!pending_snapshot)
@@ -688,7 +686,7 @@ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
  *     nfs_async_unlink().
  */
 
-static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 {
        int error;
 
@@ -842,7 +840,6 @@ static int find_new_extents(struct btrfs_root *root,
 {
        struct btrfs_path *path;
        struct btrfs_key min_key;
-       struct btrfs_key max_key;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *extent;
        int type;
@@ -857,15 +854,10 @@ static int find_new_extents(struct btrfs_root *root,
        min_key.type = BTRFS_EXTENT_DATA_KEY;
        min_key.offset = *off;
 
-       max_key.objectid = ino;
-       max_key.type = (u8)-1;
-       max_key.offset = (u64)-1;
-
        path->keep_locks = 1;
 
-       while(1) {
-               ret = btrfs_search_forward(root, &min_key, &max_key,
-                                          path, newer_than);
+       while (1) {
+               ret = btrfs_search_forward(root, &min_key, path, newer_than);
                if (ret != 0)
                        goto none;
                if (min_key.objectid != ino)
@@ -1206,7 +1198,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
                ra = &file->f_ra;
        }
 
-       pages = kmalloc(sizeof(struct page *) * max_cluster,
+       pages = kmalloc_array(max_cluster, sizeof(struct page *),
                        GFP_NOFS);
        if (!pages) {
                ret = -ENOMEM;
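
kmalloc_array() is preferred over an open-coded kmalloc(n * size) because it fails cleanly when the multiplication would overflow, instead of returning a buffer smaller than the caller believes it has. Roughly what the helper in include/linux/slab.h does:

	static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
	{
		if (size != 0 && n > SIZE_MAX / size)
			return NULL;		/* n * size would overflow */
		return kmalloc(n * size, flags);
	}
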
@@ -1893,7 +1885,6 @@ static noinline int search_ioctl(struct inode *inode,
 {
        struct btrfs_root *root;
        struct btrfs_key key;
-       struct btrfs_key max_key;
        struct btrfs_path *path;
        struct btrfs_ioctl_search_key *sk = &args->key;
        struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
@@ -1925,15 +1916,10 @@ static noinline int search_ioctl(struct inode *inode,
        key.type = sk->min_type;
        key.offset = sk->min_offset;
 
-       max_key.objectid = sk->max_objectid;
-       max_key.type = sk->max_type;
-       max_key.offset = sk->max_offset;
-
        path->keep_locks = 1;
 
-       while(1) {
-               ret = btrfs_search_forward(root, &key, &max_key, path,
-                                          sk->min_transid);
+       while (1) {
+               ret = btrfs_search_forward(root, &key, path, sk->min_transid);
                if (ret != 0) {
                        if (ret > 0)
                                ret = 0;
@@ -2018,7 +2004,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = (u64)-1;
 
-       while(1) {
+       while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        goto out;
@@ -2047,7 +2033,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
                }
 
                *(ptr + len) = '/';
-               read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
+               read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
 
                if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
                        break;
@@ -2058,7 +2044,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
                dirid = key.objectid;
        }
        memmove(name, ptr, total_len);
-       name[total_len]='\0';
+       name[total_len] = '\0';
        ret = 0;
 out:
        btrfs_free_path(path);
@@ -2144,7 +2130,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 
        inode = dentry->d_inode;
        dest = BTRFS_I(inode)->root;
-       if (!capable(CAP_SYS_ADMIN)){
+       if (!capable(CAP_SYS_ADMIN)) {
                /*
                 * Regular user.  Only allow this with a special mount
                 * option, when the user has write+exec access to the
@@ -2727,15 +2713,10 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
        size = sizeof(tmp) +
                tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
 
-       same = kmalloc(size, GFP_NOFS);
-       if (!same) {
-               ret = -EFAULT;
-               goto out;
-       }
+       same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size);
 
-       if (copy_from_user(same,
-                          (struct btrfs_ioctl_same_args __user *)argp, size)) {
-               ret = -EFAULT;
+       if (IS_ERR(same)) {
+               ret = PTR_ERR(same);
                goto out;
        }
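
memdup_user() bundles the allocate-and-copy_from_user dance and hands back an ERR_PTR on failure; as a side effect the hunk also fixes the old error code, which returned -EFAULT even when the allocation itself failed. The general shape of the pattern:

	p = memdup_user(user_ptr, size);	/* kmalloc() + copy_from_user() */
	if (IS_ERR(p))
		return PTR_ERR(p);		/* -ENOMEM or -EFAULT, as appropriate */
	/* ... use p ... */
	kfree(p);
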
 
@@ -3679,9 +3660,10 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 
        switch (p->cmd) {
        case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
-               if (root->fs_info->sb->s_flags & MS_RDONLY)
-                       return -EROFS;
-
+               if (root->fs_info->sb->s_flags & MS_RDONLY) {
+                       ret = -EROFS;
+                       goto out;
+               }
                if (atomic_xchg(
                        &root->fs_info->mutually_exclusive_operation_running,
                        1)) {
@@ -3707,7 +3689,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 
        if (copy_to_user(arg, p, sizeof(*p)))
                ret = -EFAULT;
-
+out:
        kfree(p);
        return ret;
 }
@@ -4557,9 +4539,15 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_logical_to_ino(root, argp);
        case BTRFS_IOC_SPACE_INFO:
                return btrfs_ioctl_space_info(root, argp);
-       case BTRFS_IOC_SYNC:
-               btrfs_sync_fs(file->f_dentry->d_sb, 1);
-               return 0;
+       case BTRFS_IOC_SYNC: {
+               int ret;
+
+               ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+               if (ret)
+                       return ret;
+               ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+               return ret;
+       }
        case BTRFS_IOC_START_SYNC:
                return btrfs_ioctl_start_sync(root, argp);
        case BTRFS_IOC_WAIT_SYNC:
index c702cb6..25a8f38 100644
@@ -537,7 +537,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
         */
        if (RB_EMPTY_ROOT(&tree->tree) &&
            !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+               spin_lock(&root->fs_info->ordered_root_lock);
                list_del_init(&BTRFS_I(inode)->ordered_operations);
+               spin_unlock(&root->fs_info->ordered_root_lock);
        }
 
        if (!root->nr_ordered_extents) {
@@ -563,10 +565,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-void btrfs_wait_ordered_extents(struct btrfs_root *root)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
 {
        struct list_head splice, works;
        struct btrfs_ordered_extent *ordered, *next;
+       int count = 0;
 
        INIT_LIST_HEAD(&splice);
        INIT_LIST_HEAD(&works);
@@ -574,7 +577,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
        mutex_lock(&root->fs_info->ordered_operations_mutex);
        spin_lock(&root->ordered_extent_lock);
        list_splice_init(&root->ordered_extents, &splice);
-       while (!list_empty(&splice)) {
+       while (!list_empty(&splice) && nr) {
                ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
                                           root_extent_list);
                list_move_tail(&ordered->root_extent_list,
@@ -589,7 +592,11 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
 
                cond_resched();
                spin_lock(&root->ordered_extent_lock);
+               if (nr != -1)
+                       nr--;
+               count++;
        }
+       list_splice_tail(&splice, &root->ordered_extents);
        spin_unlock(&root->ordered_extent_lock);
 
        list_for_each_entry_safe(ordered, next, &works, work_list) {
@@ -599,18 +606,21 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
                cond_resched();
        }
        mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+       return count;
 }
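
btrfs_wait_ordered_extents() now takes a budget: nr == -1 preserves the old wait-for-everything behaviour, while a positive nr waits on at most that many ordered extents, and the returned count lets btrfs_wait_ordered_roots() below spread one budget across all roots. As used elsewhere in this series:

	btrfs_wait_ordered_extents(root, -1);	     /* wait for everything (snapshot path) */
	done = btrfs_wait_ordered_extents(root, nr); /* bounded wait; returns count handled  */
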
 
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 {
        struct btrfs_root *root;
        struct list_head splice;
+       int done;
 
        INIT_LIST_HEAD(&splice);
 
        spin_lock(&fs_info->ordered_root_lock);
        list_splice_init(&fs_info->ordered_roots, &splice);
-       while (!list_empty(&splice)) {
+       while (!list_empty(&splice) && nr) {
                root = list_first_entry(&splice, struct btrfs_root,
                                        ordered_root);
                root = btrfs_grab_fs_root(root);
@@ -619,10 +629,14 @@ void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
                               &fs_info->ordered_roots);
                spin_unlock(&fs_info->ordered_root_lock);
 
-               btrfs_wait_ordered_extents(root);
+               done = btrfs_wait_ordered_extents(root, nr);
                btrfs_put_fs_root(root);
 
                spin_lock(&fs_info->ordered_root_lock);
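+               /* 'done' is the number of extents we waited on in this root */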
+               if (nr != -1) {
+                       nr -= done;
+                       WARN_ON(nr < 0);
+               }
        }
        spin_unlock(&fs_info->ordered_root_lock);
 }
@@ -734,8 +748,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
 /*
  * Used to wait on ordered extents across a large range of bytes.
  */
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
+       int ret = 0;
        u64 end;
        u64 orig_end;
        struct btrfs_ordered_extent *ordered;
@@ -751,8 +766,9 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
        /* start IO across the range first to instantiate any delalloc
         * extents
         */
-       filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
+       ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+       if (ret)
+               return ret;
        /*
         * So with compression we will find and lock a dirty page and clear the
         * first one as dirty, setup an async extent, and immediately return
@@ -768,10 +784,15 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
         * right and you are wrong.
         */
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                    &BTRFS_I(inode)->runtime_flags))
-               filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
-       filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+                    &BTRFS_I(inode)->runtime_flags)) {
+               ret = filemap_fdatawrite_range(inode->i_mapping, start,
+                                              orig_end);
+               if (ret)
+                       return ret;
+       }
+       ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+       if (ret)
+               return ret;
 
        end = orig_end;
        while (1) {
@@ -788,11 +809,14 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
                }
                btrfs_start_ordered_extent(inode, ordered, 1);
                end = ordered->file_offset;
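+               /* surface any IO error recorded on the ordered extent */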
+               if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+                       ret = -EIO;
                btrfs_put_ordered_extent(ordered);
-               if (end == 0 || end == start)
+               if (ret || end == 0 || end == start)
                        break;
                end--;
        }
+       return ret;
 }
 
 /*
@@ -1076,7 +1100,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
         * if this file hasn't been changed since the last transaction
         * commit, we can safely return without doing anything
         */
-       if (last_mod < root->fs_info->last_trans_committed)
+       if (last_mod <= root->fs_info->last_trans_committed)
                return;
 
        spin_lock(&root->fs_info->ordered_root_lock);
index 0c0b356..9b0450f 100644 (file)
@@ -180,7 +180,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
                                                         u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
                                struct btrfs_ordered_extent *entry, int wait);
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
@@ -195,8 +195,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root);
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
 void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
 void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
index 0088bed..417053b 100644 (file)
@@ -193,7 +193,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
        btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
                   btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
        for (i = 0 ; i < nr ; i++) {
-               item = btrfs_item_nr(l, i);
+               item = btrfs_item_nr(i);
                btrfs_item_key_to_cpu(l, &key, i);
                type = btrfs_key_type(&key);
                printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
index d0ecfbd..24ac218 100644 (file)
@@ -33,7 +33,6 @@
 #include <linux/raid/xor.h>
 #include <linux/vmalloc.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
index 4a35572..ce459a7 100644 (file)
@@ -1383,6 +1383,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 {
        struct btrfs_root *reloc_root;
        struct reloc_control *rc = root->fs_info->reloc_ctl;
+       struct btrfs_block_rsv *rsv;
        int clear_rsv = 0;
        int ret;
 
@@ -1396,13 +1397,14 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                return 0;
 
-       if (!trans->block_rsv) {
+       if (!trans->reloc_reserved) {
+               rsv = trans->block_rsv;
                trans->block_rsv = rc->block_rsv;
                clear_rsv = 1;
        }
        reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
        if (clear_rsv)
-               trans->block_rsv = NULL;
+               trans->block_rsv = rsv;
 
        ret = __add_reloc_root(reloc_root);
        BUG_ON(ret < 0);
@@ -1775,8 +1777,7 @@ again:
                        new_ptr_gen = 0;
                }
 
-               if (new_bytenr > 0 && new_bytenr == old_bytenr) {
-                       WARN_ON(1);
+               if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
                        ret = level;
                        break;
                }
@@ -2058,7 +2059,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
        LIST_HEAD(inode_list);
        struct btrfs_key key;
        struct btrfs_key next_key;
-       struct btrfs_trans_handle *trans;
+       struct btrfs_trans_handle *trans = NULL;
        struct btrfs_root *reloc_root;
        struct btrfs_root_item *root_item;
        struct btrfs_path *path;
@@ -2107,18 +2108,19 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
        memset(&next_key, 0, sizeof(next_key));
 
        while (1) {
-               trans = btrfs_start_transaction(root, 0);
-               BUG_ON(IS_ERR(trans));
-               trans->block_rsv = rc->block_rsv;
-
                ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
                                             BTRFS_RESERVE_FLUSH_ALL);
                if (ret) {
-                       BUG_ON(ret != -EAGAIN);
-                       ret = btrfs_commit_transaction(trans, root);
-                       BUG_ON(ret);
-                       continue;
+                       err = ret;
+                       goto out;
                }
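+               /*
+                * refill the reservation before starting the transaction so
+                * that a failure leaves no transaction to clean up
+                */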
+               trans = btrfs_start_transaction(root, 0);
+               if (IS_ERR(trans)) {
+                       err = PTR_ERR(trans);
+                       trans = NULL;
+                       goto out;
+               }
+               trans->block_rsv = rc->block_rsv;
 
                replaced = 0;
                max_level = level;
@@ -2164,6 +2166,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                root_item->drop_level = level;
 
                btrfs_end_transaction_throttle(trans, root);
+               trans = NULL;
 
                btrfs_btree_balance_dirty(root);
 
@@ -2192,7 +2195,8 @@ out:
                btrfs_update_reloc_root(trans, root);
        }
 
-       btrfs_end_transaction_throttle(trans, root);
+       if (trans)
+               btrfs_end_transaction_throttle(trans, root);
 
        btrfs_btree_balance_dirty(root);
 
@@ -3258,7 +3262,7 @@ static int add_tree_block(struct reloc_control *rc,
        struct rb_node *rb_node;
        u32 item_size;
        int level = -1;
-       int generation;
+       u64 generation;
 
        eb =  path->nodes[0];
        item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -3407,7 +3411,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
                                    struct inode *inode, u64 ino)
 {
        struct btrfs_key key;
-       struct btrfs_path *path;
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_trans_handle *trans;
        int ret = 0;
@@ -3432,22 +3435,14 @@ truncate:
        if (ret)
                goto out;
 
-       path = btrfs_alloc_path();
-       if (!path) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
-               btrfs_free_path(path);
                ret = PTR_ERR(trans);
                goto out;
        }
 
-       ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+       ret = btrfs_truncate_free_space_cache(root, trans, inode);
 
-       btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
 out:
@@ -3549,10 +3544,8 @@ static int find_data_references(struct reloc_control *rc,
                                err = ret;
                                goto out;
                        }
-                       if (ret > 0) {
-                               WARN_ON(1);
+                       if (WARN_ON(ret > 0))
                                goto out;
-                       }
 
                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
@@ -3572,11 +3565,9 @@ static int find_data_references(struct reloc_control *rc,
                }
 
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               if (key.objectid != ref_objectid ||
-                   key.type != BTRFS_EXTENT_DATA_KEY) {
-                       WARN_ON(1);
+               if (WARN_ON(key.objectid != ref_objectid ||
+                   key.type != BTRFS_EXTENT_DATA_KEY))
                        break;
-               }
 
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
@@ -4001,16 +3992,6 @@ restart:
                        }
                }
 
-               ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
-               if (ret < 0) {
-                       if (ret != -ENOSPC) {
-                               err = ret;
-                               WARN_ON(1);
-                               break;
-                       }
-                       rc->commit_transaction = 1;
-               }
-
                if (rc->commit_transaction) {
                        rc->commit_transaction = 0;
                        ret = btrfs_commit_transaction(trans, rc->extent_root);
@@ -4241,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
               rc->block_group->key.objectid, rc->block_group->flags);
 
-       ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
+       ret = btrfs_start_delalloc_roots(fs_info, 0);
        if (ret < 0) {
                err = ret;
                goto out;
        }
-       btrfs_wait_all_ordered_extents(fs_info);
+       btrfs_wait_ordered_roots(fs_info, -1);
 
        while (1) {
                mutex_lock(&fs_info->cleaner_mutex);
@@ -4264,7 +4245,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
                        rc->extents_found);
 
                if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
-                       btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+                       ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+                                                      (u64)-1);
+                       if (ret) {
+                               err = ret;
+                               goto out;
+                       }
                        invalidate_mapping_pages(rc->data_inode->i_mapping,
                                                 0, -1);
                        rc->stage = UPDATE_DATA_PTRS;
@@ -4481,6 +4467,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
        u64 disk_bytenr;
+       u64 new_bytenr;
        LIST_HEAD(list);
 
        ordered = btrfs_lookup_ordered_extent(inode, file_pos);
@@ -4492,13 +4479,24 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
        if (ret)
                goto out;
 
-       disk_bytenr = ordered->start;
        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del_init(&sums->list);
 
-               sums->bytenr = disk_bytenr;
-               disk_bytenr += sums->len;
+               /*
+                * We need to offset the new_bytenr based on where the csum is.
+                * We need to do this because we will read in entire prealloc
+                * extents but we may have written to, say, the middle of the
+                * prealloc extent, so we need to make sure the csum goes with
+                * the right disk offset.
+                *
+                * We can do this because the data reloc inode refers strictly
+                * to the on disk bytes, so we don't have to worry about
+                * disk_len vs real len like with real inodes since it's all
+                * disk length.
+                */
+               new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+               sums->bytenr = new_bytenr;
 
                btrfs_add_ordered_sum(inode, ordered, sums);
        }
index a18e0e2..2544805 100644 (file)
@@ -2717,8 +2717,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                mutex_unlock(&fs_info->scrub_lock);
                wake_up(&fs_info->scrub_pause_wait);
 
-               dev_replace->cursor_left = dev_replace->cursor_right;
-               dev_replace->item_needs_writeback = 1;
                btrfs_put_block_group(cache);
                if (ret)
                        break;
@@ -2732,6 +2730,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                        break;
                }
 
+               dev_replace->cursor_left = dev_replace->cursor_right;
+               dev_replace->item_needs_writeback = 1;
+
                key.offset = found_key.offset + length;
                btrfs_release_path(path);
        }
@@ -2783,7 +2784,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 {
        int ret = 0;
 
-       mutex_lock(&fs_info->scrub_lock);
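+       /* caller must hold fs_info->scrub_lock */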
        if (fs_info->scrub_workers_refcnt == 0) {
                if (is_dev_replace)
                        btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
@@ -2813,21 +2813,17 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
        }
        ++fs_info->scrub_workers_refcnt;
 out:
-       mutex_unlock(&fs_info->scrub_lock);
-
        return ret;
 }
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
-       mutex_lock(&fs_info->scrub_lock);
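+       /* caller must hold fs_info->scrub_lock */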
        if (--fs_info->scrub_workers_refcnt == 0) {
                btrfs_stop_workers(&fs_info->scrub_workers);
                btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
                btrfs_stop_workers(&fs_info->scrub_nocow_workers);
        }
        WARN_ON(fs_info->scrub_workers_refcnt < 0);
-       mutex_unlock(&fs_info->scrub_lock);
 }
 
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -2888,23 +2884,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                return -EINVAL;
        }
 
-       ret = scrub_workers_get(fs_info, is_dev_replace);
-       if (ret)
-               return ret;
 
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
        if (!dev || (dev->missing && !is_dev_replace)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-               scrub_workers_put(fs_info);
                return -ENODEV;
        }
-       mutex_lock(&fs_info->scrub_lock);
 
+       mutex_lock(&fs_info->scrub_lock);
        if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-               scrub_workers_put(fs_info);
                return -EIO;
        }
 
@@ -2915,10 +2906,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                btrfs_dev_replace_unlock(&fs_info->dev_replace);
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-               scrub_workers_put(fs_info);
                return -EINPROGRESS;
        }
        btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
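+       /* only start the workers once every other check has passed */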
+       ret = scrub_workers_get(fs_info, is_dev_replace);
+       if (ret) {
+               mutex_unlock(&fs_info->scrub_lock);
+               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+               return ret;
+       }
+
        sctx = scrub_setup_ctx(dev, is_dev_replace);
        if (IS_ERR(sctx)) {
                mutex_unlock(&fs_info->scrub_lock);
@@ -2931,13 +2929,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
        atomic_inc(&fs_info->scrubs_running);
        mutex_unlock(&fs_info->scrub_lock);
-       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
        if (!is_dev_replace) {
-               down_read(&fs_info->scrub_super_lock);
+               /*
+                * holding the device list mutex serializes us against the
+                * super writes that a log tree sync can kick off
+                */
                ret = scrub_supers(sctx, dev);
-               up_read(&fs_info->scrub_super_lock);
        }
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
        if (!ret)
                ret = scrub_enumerate_chunks(sctx, dev, start, end,
@@ -2954,10 +2954,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
        mutex_lock(&fs_info->scrub_lock);
        dev->scrub_device = NULL;
+       scrub_workers_put(fs_info);
        mutex_unlock(&fs_info->scrub_lock);
 
        scrub_free_ctx(sctx);
-       scrub_workers_put(fs_info);
 
        return ret;
 }
@@ -2987,16 +2987,6 @@ void btrfs_scrub_continue(struct btrfs_root *root)
        wake_up(&fs_info->scrub_pause_wait);
 }
 
-void btrfs_scrub_pause_super(struct btrfs_root *root)
-{
-       down_write(&root->fs_info->scrub_super_lock);
-}
-
-void btrfs_scrub_continue_super(struct btrfs_root *root)
-{
-       up_write(&root->fs_info->scrub_super_lock);
-}
-
 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
 {
        mutex_lock(&fs_info->scrub_lock);
index e46e0ed..6837fe8 100644 (file)
@@ -121,7 +121,6 @@ struct send_ctx {
        struct list_head name_cache_list;
        int name_cache_size;
 
-       struct file *cur_inode_filp;
        char *read_buf;
 };
 
@@ -565,10 +564,8 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
 {
        struct btrfs_cmd_header *hdr;
 
-       if (!sctx->send_buf) {
-               WARN_ON(1);
+       if (WARN_ON(!sctx->send_buf))
                return -EINVAL;
-       }
 
        BUG_ON(sctx->send_size);
 
@@ -791,7 +788,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
        if (found_key->type == BTRFS_INODE_REF_KEY) {
                ptr = (unsigned long)btrfs_item_ptr(eb, slot,
                                                    struct btrfs_inode_ref);
-               item = btrfs_item_nr(eb, slot);
+               item = btrfs_item_nr(slot);
                total = btrfs_item_size(eb, item);
                elem_size = sizeof(*iref);
        } else {
@@ -905,7 +902,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 
        eb = path->nodes[0];
        slot = path->slots[0];
-       item = btrfs_item_nr(eb, slot);
+       item = btrfs_item_nr(slot);
        di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
        cur = 0;
        len = 0;
@@ -2119,77 +2116,6 @@ out:
        return ret;
 }
 
-/*
- * Called for regular files when sending extents data. Opens a struct file
- * to read from the file.
- */
-static int open_cur_inode_file(struct send_ctx *sctx)
-{
-       int ret = 0;
-       struct btrfs_key key;
-       struct path path;
-       struct inode *inode;
-       struct dentry *dentry;
-       struct file *filp;
-       int new = 0;
-
-       if (sctx->cur_inode_filp)
-               goto out;
-
-       key.objectid = sctx->cur_ino;
-       key.type = BTRFS_INODE_ITEM_KEY;
-       key.offset = 0;
-
-       inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
-                       &new);
-       if (IS_ERR(inode)) {
-               ret = PTR_ERR(inode);
-               goto out;
-       }
-
-       dentry = d_obtain_alias(inode);
-       inode = NULL;
-       if (IS_ERR(dentry)) {
-               ret = PTR_ERR(dentry);
-               goto out;
-       }
-
-       path.mnt = sctx->mnt;
-       path.dentry = dentry;
-       filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred());
-       dput(dentry);
-       dentry = NULL;
-       if (IS_ERR(filp)) {
-               ret = PTR_ERR(filp);
-               goto out;
-       }
-       sctx->cur_inode_filp = filp;
-
-out:
-       /*
-        * no xxxput required here as every vfs op
-        * does it by itself on failure
-        */
-       return ret;
-}
-
-/*
- * Closes the struct file that was created in open_cur_inode_file
- */
-static int close_cur_inode_file(struct send_ctx *sctx)
-{
-       int ret = 0;
-
-       if (!sctx->cur_inode_filp)
-               goto out;
-
-       ret = filp_close(sctx->cur_inode_filp, NULL);
-       sctx->cur_inode_filp = NULL;
-
-out:
-       return ret;
-}
-
 /*
  * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
  */
@@ -3622,6 +3548,72 @@ out:
        return ret;
 }
 
+static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+{
+       struct btrfs_root *root = sctx->send_root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct inode *inode;
+       struct page *page;
+       char *addr;
+       struct btrfs_key key;
+       pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+       pgoff_t last_index;
+       unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+       ssize_t ret = 0;
+
+       key.objectid = sctx->cur_ino;
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+
+       inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
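+       /* clamp the read to i_size so we never copy past EOF */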
+       if (offset + len > i_size_read(inode)) {
+               if (offset > i_size_read(inode))
+                       len = 0;
+               else
+                       len = i_size_read(inode) - offset;
+       }
+       if (len == 0)
+               goto out;
+
+       last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+       while (index <= last_index) {
+               unsigned cur_len = min_t(unsigned, len,
+                                        PAGE_CACHE_SIZE - pg_offset);
+               page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+               if (!page) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               if (!PageUptodate(page)) {
+                       btrfs_readpage(NULL, page);
+                       lock_page(page);
+                       if (!PageUptodate(page)) {
+                               unlock_page(page);
+                               page_cache_release(page);
+                               ret = -EIO;
+                               break;
+                       }
+               }
+
+               addr = kmap(page);
+               memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+               kunmap(page);
+               unlock_page(page);
+               page_cache_release(page);
+               index++;
+               pg_offset = 0;
+               len -= cur_len;
+               ret += cur_len;
+       }
+out:
+       iput(inode);
+       return ret;
+}
+
 /*
  * Read some bytes from the current inode/file and send a write command to
  * user space.
@@ -3630,35 +3622,20 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
 {
        int ret = 0;
        struct fs_path *p;
-       loff_t pos = offset;
-       int num_read = 0;
-       mm_segment_t old_fs;
+       ssize_t num_read = 0;
 
        p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
-       /*
-        * vfs normally only accepts user space buffers for security reasons.
-        * we only read from the file and also only provide the read_buf buffer
-        * to vfs. As this buffer does not come from a user space call, it's
-        * ok to temporary allow kernel space buffers.
-        */
-       old_fs = get_fs();
-       set_fs(KERNEL_DS);
-
 verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
-       ret = open_cur_inode_file(sctx);
-       if (ret < 0)
-               goto out;
-
-       ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
-       if (ret < 0)
-               goto out;
-       num_read = ret;
-       if (!num_read)
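+       /* read through the page cache instead of vfs_read() + set_fs() */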
+       num_read = fill_read_buf(sctx, offset, len);
+       if (num_read <= 0) {
+               if (num_read < 0)
+                       ret = num_read;
                goto out;
+       }
 
        ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
        if (ret < 0)
@@ -3677,7 +3654,6 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 tlv_put_failure:
 out:
        fs_path_free(p);
-       set_fs(old_fs);
        if (ret < 0)
                return ret;
        return num_read;
@@ -3926,16 +3902,16 @@ static int is_extent_unchanged(struct send_ctx *sctx,
        while (key.offset < ekey->offset + left_len) {
                ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
                right_type = btrfs_file_extent_type(eb, ei);
-               right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-               right_len = btrfs_file_extent_num_bytes(eb, ei);
-               right_offset = btrfs_file_extent_offset(eb, ei);
-               right_gen = btrfs_file_extent_generation(eb, ei);
-
                if (right_type != BTRFS_FILE_EXTENT_REG) {
                        ret = 0;
                        goto out;
                }
 
+               right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+               right_len = btrfs_file_extent_num_bytes(eb, ei);
+               right_offset = btrfs_file_extent_offset(eb, ei);
+               right_gen = btrfs_file_extent_generation(eb, ei);
+
                /*
                 * Are we at extent 8? If yes, we know the extent is changed.
                 * This may only happen on the first iteration.
@@ -4222,10 +4198,6 @@ static int changed_inode(struct send_ctx *sctx,
        u64 left_gen = 0;
        u64 right_gen = 0;
 
-       ret = close_cur_inode_file(sctx);
-       if (ret < 0)
-               goto out;
-
        sctx->cur_ino = key->objectid;
        sctx->cur_inode_new_gen = 0;
 
@@ -4686,11 +4658,6 @@ static int send_subvol(struct send_ctx *sctx)
        }
 
 out:
-       if (!ret)
-               ret = close_cur_inode_file(sctx);
-       else
-               close_cur_inode_file(sctx);
-
        free_recorded_refs(sctx);
        return ret;
 }
index e913328..2d8ac1b 100644 (file)
@@ -42,7 +42,6 @@
 #include <linux/cleancache.h>
 #include <linux/ratelimit.h>
 #include <linux/btrfs.h>
-#include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -921,7 +920,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
                return 0;
        }
 
-       btrfs_wait_all_ordered_extents(fs_info);
+       btrfs_wait_ordered_roots(fs_info, -1);
 
        trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans)) {
@@ -1330,6 +1329,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                 * this also happens on 'umount -rf' or on shutdown, when
                 * the filesystem is busy.
                 */
+
+               /* wait for the uuid_scan task to finish */
+               down(&fs_info->uuid_tree_rescan_sem);
+               /* avoid complaints from lockdep et al. */
+               up(&fs_info->uuid_tree_rescan_sem);
+
                sb->s_flags |= MS_RDONLY;
 
                btrfs_dev_replace_suspend_for_unmount(fs_info);
@@ -1465,7 +1470,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
        nr_devices = fs_info->fs_devices->open_devices;
        BUG_ON(!nr_devices);
 
-       devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+       devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
                               GFP_NOFS);
        if (!devices_info)
                return -ENOMEM;
@@ -1789,7 +1794,25 @@ static void btrfs_print_info(void)
 
 static int btrfs_run_sanity_tests(void)
 {
-       return btrfs_test_free_space_cache();
+       int ret;
+
+       ret = btrfs_init_test_fs();
+       if (ret)
+               return ret;
+
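+       /* the tests share one dummy fs; tear it down even on failure */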
+       ret = btrfs_test_free_space_cache();
+       if (ret)
+               goto out;
+       ret = btrfs_test_extent_buffer_operations();
+       if (ret)
+               goto out;
+       ret = btrfs_test_extent_io();
+       if (ret)
+               goto out;
+       ret = btrfs_test_inodes();
+out:
+       btrfs_destroy_test_fs();
+       return ret;
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
new file mode 100644 (file)
index 0000000..757ef00
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+
+static struct vfsmount *test_mnt;
+
+static const struct super_operations btrfs_test_super_ops = {
+       .alloc_inode    = btrfs_alloc_inode,
+       .destroy_inode  = btrfs_test_destroy_inode,
+};
+
+static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
+                                      int flags, const char *dev_name,
+                                      void *data)
+{
+       return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+                           NULL, BTRFS_TEST_MAGIC);
+}
+
+static struct file_system_type test_type = {
+       .name           = "btrfs_test_fs",
+       .mount          = btrfs_test_mount,
+       .kill_sb        = kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+       return new_inode(test_mnt->mnt_sb);
+}
+
+int btrfs_init_test_fs(void)
+{
+       int ret;
+
+       ret = register_filesystem(&test_type);
+       if (ret) {
+               printk(KERN_ERR "btrfs: cannot register test file system\n");
+               return ret;
+       }
+
+       test_mnt = kern_mount(&test_type);
+       if (IS_ERR(test_mnt)) {
+               printk(KERN_ERR "btrfs: cannot mount test file system\n");
+               unregister_filesystem(&test_type);
+               return PTR_ERR(test_mnt);
+       }
+       return 0;
+}
+
+void btrfs_destroy_test_fs(void)
+{
+       kern_unmount(test_mnt);
+       unregister_filesystem(&test_type);
+}
index 5808776..b353bc8 100644 (file)
 #define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__)
 
 int btrfs_test_free_space_cache(void);
+int btrfs_test_extent_buffer_operations(void);
+int btrfs_test_extent_io(void);
+int btrfs_test_inodes(void);
+int btrfs_init_test_fs(void);
+void btrfs_destroy_test_fs(void);
+struct inode *btrfs_new_test_inode(void);
 #else
 static inline int btrfs_test_free_space_cache(void)
 {
        return 0;
 }
+static inline int btrfs_test_extent_buffer_operations(void)
+{
+       return 0;
+}
+static inline int btrfs_init_test_fs(void)
+{
+       return 0;
+}
+static inline void btrfs_destroy_test_fs(void)
+{
+}
+static inline int btrfs_test_extent_io(void)
+{
+       return 0;
+}
+static inline int btrfs_test_inodes(void)
+{
+       return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
new file mode 100644 (file)
index 0000000..cc286ce
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../disk-io.h"
+
+static int test_btrfs_split_item(void)
+{
+       struct btrfs_path *path;
+       struct btrfs_root *root;
+       struct extent_buffer *eb;
+       struct btrfs_item *item;
+       char *value = "mary had a little lamb";
+       char *split1 = "mary had a little";
+       char *split2 = " lamb";
+       char *split3 = "mary";
+       char *split4 = " had a little";
+       char buf[32];
+       struct btrfs_key key;
+       u32 value_len = strlen(value);
+       int ret = 0;
+
+       test_msg("Running btrfs_split_item tests\n");
+
+       root = btrfs_alloc_dummy_root();
+       if (IS_ERR(root)) {
+               test_msg("Could not allocate root\n");
+               return PTR_ERR(root);
+       }
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               test_msg("Could not allocate path\n");
+               kfree(root);
+               return -ENOMEM;
+       }
+
+       path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+       if (!eb) {
+               test_msg("Could not allocate dummy buffer\n");
+               ret = -ENOMEM;
+               goto out;
+       }
+       path->slots[0] = 0;
+
+       key.objectid = 0;
+       key.type = BTRFS_EXTENT_CSUM_KEY;
+       key.offset = 0;
+
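+       /* insert the whole string as a single item so we can split it */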
+       setup_items_for_insert(root, path, &key, &value_len, value_len,
+                              value_len + sizeof(struct btrfs_item), 1);
+       item = btrfs_item_nr(0);
+       write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+                           value_len);
+
+       key.offset = 3;
+
+       /*
+        * Passing NULL trans here should be safe because we have plenty of
+        * space in this leaf to split the item without having to split the
+        * leaf.
+        */
+       ret = btrfs_split_item(NULL, root, path, &key, 17);
+       if (ret) {
+               test_msg("Split item failed %d\n", ret);
+               goto out;
+       }
+
+       /*
+        * Read the first slot, it should have the original key and contain only
+        * 'mary had a little'
+        */
+       btrfs_item_key_to_cpu(eb, &key, 0);
+       if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+           key.offset != 0) {
+               test_msg("Invalid key at slot 0\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       item = btrfs_item_nr(0);
+       if (btrfs_item_size(eb, item) != strlen(split1)) {
+               test_msg("Invalid len in the first split\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+                          strlen(split1));
+       if (memcmp(buf, split1, strlen(split1))) {
+               test_msg("Data in the buffer doesn't match what it should "
+                        "in the first split, have='%.*s' want '%s'\n",
+                        (int)strlen(split1), buf, split1);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       btrfs_item_key_to_cpu(eb, &key, 1);
+       if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+           key.offset != 3) {
+               test_msg("Invalid key at slot 1\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       item = btrfs_item_nr(1);
+       if (btrfs_item_size(eb, item) != strlen(split2)) {
+               test_msg("Invalid len in the second split\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+                          strlen(split2));
+       if (memcmp(buf, split2, strlen(split2))) {
+               test_msg("Data in the buffer doesn't match what it should "
+                        "in the second split\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       key.offset = 1;
+       /* Do it again so we test memmoving the other items in the leaf */
+       ret = btrfs_split_item(NULL, root, path, &key, 4);
+       if (ret) {
+               test_msg("Second split item failed %d\n", ret);
+               goto out;
+       }
+
+       btrfs_item_key_to_cpu(eb, &key, 0);
+       if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+           key.offset != 0) {
+               test_msg("Invalid key at slot 0\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       item = btrfs_item_nr(0);
+       if (btrfs_item_size(eb, item) != strlen(split3)) {
+               test_msg("Invalid len in the first split\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+                          strlen(split3));
+       if (memcmp(buf, split3, strlen(split3))) {
+               test_msg("Data in the buffer doesn't match what it should "
+                        "in the third split\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       btrfs_item_key_to_cpu(eb, &key, 1);
+       if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+           key.offset != 1) {
+               test_msg("Invalid key at slot 1\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       item = btrfs_item_nr(1);
+       if (btrfs_item_size(eb, item) != strlen(split4)) {
+               test_msg("Invalid len in the second split\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+                          strlen(split4));
+       if (memcmp(buf, split4, strlen(split4))) {
+               test_msg("Data in the buffer doesn't match what it should "
+                        "in the fourth split\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       btrfs_item_key_to_cpu(eb, &key, 2);
+       if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+           key.offset != 3) {
+               test_msg("Invalid key at slot 2\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       item = btrfs_item_nr(2);
+       if (btrfs_item_size(eb, item) != strlen(split2)) {
+               test_msg("Invalid len in the second split\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
+                          strlen(split2));
+       if (memcmp(buf, split2, strlen(split2))) {
+               test_msg("Data in the buffer doesn't match what it should "
+                        "in the last chunk\n");
+               ret = -EINVAL;
+               goto out;
+       }
+out:
+       btrfs_free_path(path);
+       kfree(root);
+       return ret;
+}
+
+int btrfs_test_extent_buffer_operations(void)
+{
+       test_msg("Running extent buffer operation tests\n");
+       return test_btrfs_split_item();
+}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
new file mode 100644 (file)
index 0000000..7e99c2f
--- /dev/null
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "btrfs-tests.h"
+#include "../extent_io.h"
+
+#define PROCESS_UNLOCK         (1 << 0)
+#define PROCESS_RELEASE                (1 << 1)
+#define PROCESS_TEST_LOCKED    (1 << 2)
+
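+/*
+ * walk the page range, optionally checking/unlocking/releasing each page;
+ * returns the number of pages that fail the PROCESS_TEST_LOCKED check
+ */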
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+                                      unsigned long flags)
+{
+       int ret;
+       struct page *pages[16];
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+       unsigned long nr_pages = end_index - index + 1;
+       int i;
+       int count = 0;
+       int loops = 0;
+
+       while (nr_pages > 0) {
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min_t(unsigned long, nr_pages,
+                                    ARRAY_SIZE(pages)), pages);
+               for (i = 0; i < ret; i++) {
+                       if (flags & PROCESS_TEST_LOCKED &&
+                           !PageLocked(pages[i]))
+                               count++;
+                       if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
+                               unlock_page(pages[i]);
+                       page_cache_release(pages[i]);
+                       if (flags & PROCESS_RELEASE)
+                               page_cache_release(pages[i]);
+               }
+               nr_pages -= ret;
+               index += ret;
+               cond_resched();
+               loops++;
+               if (loops > 100000) {
+                       printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+                       break;
+               }
+       }
+       return count;
+}
+
+static int test_find_delalloc(void)
+{
+       struct inode *inode;
+       struct extent_io_tree tmp;
+       struct page *page;
+       struct page *locked_page = NULL;
+       unsigned long index = 0;
+       u64 total_dirty = 256 * 1024 * 1024;
+       u64 max_bytes = 128 * 1024 * 1024;
+       u64 start, end, test_start;
+       u64 found;
+       int ret = -EINVAL;
+
+       inode = btrfs_new_test_inode();
+       if (!inode) {
+               test_msg("Failed to allocate test inode\n");
+               return -ENOMEM;
+       }
+
+       extent_io_tree_init(&tmp, &inode->i_data);
+
+       /*
+        * First go through and create and mark all of our pages dirty, we pin
+        * everything to make sure our pages don't get evicted and screw up our
+        * test.
+        */
+       for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+               page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+               if (!page) {
+                       test_msg("Failed to allocate test page\n");
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               SetPageDirty(page);
+               if (index) {
+                       unlock_page(page);
+               } else {
+                       page_cache_get(page);
+                       locked_page = page;
+               }
+       }
+
+       /* Test this scenario
+        * |--- delalloc ---|
+        * |---  search  ---|
+        */
+       set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+       start = 0;
+       end = 0;
+       found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+                                        &end, max_bytes);
+       if (!found) {
+               test_msg("Should have found at least one delalloc\n");
+               goto out_bits;
+       }
+       if (start != 0 || end != 4095) {
+               test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
+                        start, end);
+               goto out_bits;
+       }
+       unlock_extent(&tmp, start, end);
+       unlock_page(locked_page);
+       page_cache_release(locked_page);
+
+       /*
+        * Test this scenario
+        *
+        * |--- delalloc ---|
+        *           |--- search ---|
+        */
+       test_start = 64 * 1024 * 1024;
+       locked_page = find_lock_page(inode->i_mapping,
+                                    test_start >> PAGE_CACHE_SHIFT);
+       if (!locked_page) {
+               test_msg("Couldn't find the locked page\n");
+               goto out_bits;
+       }
+       set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+       start = test_start;
+       end = 0;
+       found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+                                        &end, max_bytes);
+       if (!found) {
+               test_msg("Couldn't find delalloc in our range\n");
+               goto out_bits;
+       }
+       if (start != test_start || end != max_bytes - 1) {
+               test_msg("Expected start %Lu end %Lu, got start %Lu, end "
+                        "%Lu\n", test_start, max_bytes - 1, start, end);
+               goto out_bits;
+       }
+       if (process_page_range(inode, start, end,
+                              PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+               test_msg("There were unlocked pages in the range\n");
+               goto out_bits;
+       }
+       unlock_extent(&tmp, start, end);
+       /* locked_page was unlocked above */
+       page_cache_release(locked_page);
+
+       /*
+        * Test this scenario
+        * |--- delalloc ---|
+        *                    |--- search ---|
+        */
+       test_start = max_bytes + 4096;
+       locked_page = find_lock_page(inode->i_mapping, test_start >>
+                                    PAGE_CACHE_SHIFT);
+       if (!locked_page) {
+               test_msg("Couldn't find the locked page\n");
+               goto out_bits;
+       }
+       start = test_start;
+       end = 0;
+       found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+                                        &end, max_bytes);
+       if (found) {
+               test_msg("Found range when we shouldn't have\n");
+               goto out_bits;
+       }
+       if (end != (u64)-1) {
+               test_msg("Did not return the proper end offset\n");
+               goto out_bits;
+       }
+
+       /*
+        * Test this scenario
+        * [------- delalloc -------|
+        * [max_bytes]|-- search--|
+        *
+        * We are re-using our test_start from above since it works out well.
+        */
+       set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+       start = test_start;
+       end = 0;
+       found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+                                        &end, max_bytes);
+       if (!found) {
+               test_msg("Didn't find our range\n");
+               goto out_bits;
+       }
+       if (start != test_start || end != total_dirty - 1) {
+               test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+                        test_start, total_dirty - 1, start, end);
+               goto out_bits;
+       }
+       if (process_page_range(inode, start, end,
+                              PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+               test_msg("Pages in range were not all locked\n");
+               goto out_bits;
+       }
+       unlock_extent(&tmp, start, end);
+
+       /*
+        * Now to test where we run into a page that is no longer dirty in the
+        * range we want to find.
+        */
+       page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
+                            >> PAGE_CACHE_SHIFT);
+       if (!page) {
+               test_msg("Couldn't find our page\n");
+               goto out_bits;
+       }
+       ClearPageDirty(page);
+       page_cache_release(page);
+
+       /* We unlocked it in the previous test */
+       lock_page(locked_page);
+       start = test_start;
+       end = 0;
+       /*
+        * Currently if we fail to find dirty pages in the delalloc range we
+        * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search.  If
+        * this changes at any point in the future we will need to fix this
+        * test's expected behavior.
+        */
+       found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+                                        &end, max_bytes);
+       if (!found) {
+               test_msg("Didn't find our range\n");
+               goto out_bits;
+       }
+       if (start != test_start || end != test_start + PAGE_CACHE_SIZE - 1) {
+               test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+                        test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+                        end);
+               goto out_bits;
+       }
+       if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+                              PROCESS_UNLOCK)) {
+               test_msg("Pages in range were not all locked\n");
+               goto out_bits;
+       }
+       ret = 0;
+out_bits:
+       clear_extent_bits(&tmp, 0, total_dirty - 1,
+                         (unsigned long)-1, GFP_NOFS);
+out:
+       if (locked_page)
+               page_cache_release(locked_page);
+       process_page_range(inode, 0, total_dirty - 1,
+                          PROCESS_UNLOCK | PROCESS_RELEASE);
+       iput(inode);
+       return ret;
+}
+
+int btrfs_test_extent_io(void)
+{
+       test_msg("Running find delalloc tests\n");
+       return test_find_delalloc();
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
new file mode 100644 (file)
index 0000000..397d1f9
--- /dev/null
@@ -0,0 +1,955 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../btrfs_inode.h"
+#include "../disk-io.h"
+#include "../extent_io.h"
+#include "../volumes.h"
+
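+/* a bare fs_info/fs_devices pair, just enough for the inode tests below */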
+static struct btrfs_fs_info *alloc_dummy_fs_info(void)
+{
+       struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+                                               GFP_NOFS);
+       if (!fs_info)
+               return fs_info;
+       fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+                                     GFP_NOFS);
+       if (!fs_info->fs_devices) {
+               kfree(fs_info);
+               return NULL;
+       }
+       return fs_info;
+}
+
+static void free_dummy_root(struct btrfs_root *root)
+{
+       if (!root)
+               return;
+       if (root->fs_info) {
+               kfree(root->fs_info->fs_devices);
+               kfree(root->fs_info);
+       }
+       if (root->node)
+               free_extent_buffer(root->node);
+       kfree(root);
+}
+
+static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
+                         u64 ram_bytes, u64 offset, u64 disk_bytenr,
+                         u64 disk_len, u32 type, u8 compression, int slot)
+{
+       struct btrfs_path path;
+       struct btrfs_file_extent_item *fi;
+       struct extent_buffer *leaf = root->node;
+       struct btrfs_key key;
+       u32 value_len = sizeof(struct btrfs_file_extent_item);
+
+       if (type == BTRFS_FILE_EXTENT_INLINE)
+               value_len += len;
+       memset(&path, 0, sizeof(path));
+
+       path.nodes[0] = leaf;
+       path.slots[0] = slot;
+
+       key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = start;
+
+       setup_items_for_insert(root, &path, &key, &value_len, value_len,
+                              value_len + sizeof(struct btrfs_item), 1);
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+       btrfs_set_file_extent_generation(leaf, fi, 1);
+       btrfs_set_file_extent_type(leaf, fi, type);
+       btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+       btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len);
+       btrfs_set_file_extent_offset(leaf, fi, offset);
+       btrfs_set_file_extent_num_bytes(leaf, fi, len);
+       btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+       btrfs_set_file_extent_compression(leaf, fi, compression);
+       btrfs_set_file_extent_encryption(leaf, fi, 0);
+       btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+}
+
+static void insert_inode_item_key(struct btrfs_root *root)
+{
+       struct btrfs_path path;
+       struct extent_buffer *leaf = root->node;
+       struct btrfs_key key;
+       u32 value_len = 0;
+
+       memset(&path, 0, sizeof(path));
+
+       path.nodes[0] = leaf;
+       path.slots[0] = 0;
+
+       key.objectid = BTRFS_INODE_ITEM_KEY;
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+
+       setup_items_for_insert(root, &path, &key, &value_len, value_len,
+                              value_len + sizeof(struct btrfs_item), 1);
+}
+
+/*
+ * Build the most complicated map of extents the earth has ever seen.  We want
+ * this so we can test all of the corner cases of btrfs_get_extent.  Here is a
+ * diagram of how the extents will look.  Some of this layout may not be
+ * possible in practice, but we still want to make sure everything acts
+ * normally (the last number of each range is not inclusive).
+ *
+ * [0 - 5][5 -  6][6 - 10][10 - 4096][  4096 - 8192 ][8192 - 12288]
+ * [hole ][inline][ hole ][ regular ][regular1 split][    hole    ]
+ *
+ * [ 12288 - 20480][20480 - 24576][  24576 - 28672  ][28672 - 36864][36864 - 45056]
+ * [regular1 split][   prealloc1 ][prealloc1 written][   prealloc1 ][ compressed  ]
+ *
+ * [45056 - 49152][49152-53248][53248-61440][61440-65536][   65536 - 81920   ]
+ * [ compressed1 ][  regular  ][compressed1][  regular  ][ hole but no extent]
+ *
+ * [81920-86016]
+ * [  regular  ]
+ */
+static void setup_file_extents(struct btrfs_root *root)
+{
+       int slot = 0;
+       u64 disk_bytenr = 1 * 1024 * 1024;
+       u64 offset = 0;
+
+       /* First we want a hole */
+       insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+                     slot);
+       slot++;
+       offset += 5;
+
+       /*
+        * Now we want an inline extent.  I don't think this is possible on a
+        * real file system, but hey, why not?  Also keep in mind that if we
+        * have an inline extent it counts as the whole first page.  If we were
+        * to expand it we would have to cow and we wouldn't have an inline
+        * extent anymore.
+        */
+       insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+                     slot);
+       slot++;
+       offset = 4096;
+
+       /* Now another hole */
+       insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+                     slot);
+       slot++;
+       offset += 4;
+
+       /* Now for a regular extent */
+       insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
+                     BTRFS_FILE_EXTENT_REG, 0, slot);
+       slot++;
+       disk_bytenr += 4096;
+       offset += 4095;
+
+       /*
+        * Now for 3 extents that were split from a hole punch, so we can test
+        * that offsets are handled properly.
+        */
+       insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+                     BTRFS_FILE_EXTENT_REG, 0, slot);
+       slot++;
+       offset += 4096;
+       insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
+                     0, slot);
+       slot++;
+       offset += 4096;
+       insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+                     BTRFS_FILE_EXTENT_REG, 0, slot);
+       slot++;
+       offset += 8192;
+       disk_bytenr += 16384;
+
+       /* Now for an unwritten prealloc extent */
+       insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+                     BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+       slot++;
+       offset += 4096;
+
+       /*
+        * Bump disk_bytenr up a little more so the extent map code doesn't
+        * merge our records.
+        */
+       disk_bytenr += 8192;
+
+       /*
+        * Now for a partially written prealloc extent, basically the same as
+        * the hole punch example above.  Note that ram_bytes never changes
+        * when you mark prealloc extents written.
+        */
+       insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+                     BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+       slot++;
+       offset += 4096;
+       insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
+                     BTRFS_FILE_EXTENT_REG, 0, slot);
+       slot++;
+       offset += 4096;
+       insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+                     BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+       slot++;
+       offset += 8192;
+       disk_bytenr += 16384;
+
+       /* Now a normal compressed extent */
+       insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
+                     BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+       slot++;
+       offset += 8192;
+       /* No merges */
+       disk_bytenr += 8192;
+
+       /* Now a split compressed extent */
+       insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
+                     BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+       slot++;
+       offset += 4096;
+       insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+                     BTRFS_FILE_EXTENT_REG, 0, slot);
+       slot++;
+       offset += 4096;
+       insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+                     BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+       slot++;
+       offset += 8192;
+       disk_bytenr += 8192;
+
+       /* Now extents that have a hole but no hole extent */
+       insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+                     BTRFS_FILE_EXTENT_REG, 0, slot);
+       slot++;
+       offset += 16384;
+       disk_bytenr += 4096;
+       insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+                     BTRFS_FILE_EXTENT_REG, 0, slot);
+}
+
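+/*
+ * Expected em->flags values for the checks below; btrfs_test_inodes() fills
+ * these in with the corresponding EXTENT_FLAG_* bits before the tests run.
+ */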
+static unsigned long prealloc_only = 0;
+static unsigned long compressed_only = 0;
+static unsigned long vacancy_only = 0;
+
+static noinline int test_btrfs_get_extent(void)
+{
+       struct inode *inode = NULL;
+       struct btrfs_root *root = NULL;
+       struct extent_map *em = NULL;
+       u64 orig_start;
+       u64 disk_bytenr;
+       u64 offset;
+       int ret = -ENOMEM;
+
+       inode = btrfs_new_test_inode();
+       if (!inode) {
+               test_msg("Couldn't allocate inode\n");
+               return ret;
+       }
+
+       BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+       BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+       BTRFS_I(inode)->location.offset = 0;
+
+       root = btrfs_alloc_dummy_root();
+       if (IS_ERR(root)) {
+               test_msg("Couldn't allocate root\n");
+               goto out;
+       }
+
+       /*
+        * We do this since btrfs_get_extent wants to assign em->bdev to
+        * root->fs_info->fs_devices->latest_bdev.
+        */
+       root->fs_info = alloc_dummy_fs_info();
+       if (!root->fs_info) {
+               test_msg("Couldn't allocate dummy fs info\n");
+               goto out;
+       }
+
+       root->node = alloc_dummy_extent_buffer(0, 4096);
+       if (!root->node) {
+               test_msg("Couldn't allocate dummy buffer\n");
+               goto out;
+       }
+
+       /*
+        * We will just free a dummy node if its ref count is 2, so we need an
+        * extra ref so our searches don't accidentally release our page.
+        */
+       extent_buffer_get(root->node);
+       btrfs_set_header_nritems(root->node, 0);
+       btrfs_set_header_level(root->node, 0);
+       ret = -EINVAL;
+
+       /* First with no extents */
+       BTRFS_I(inode)->root = root;
+       em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+       if (IS_ERR(em)) {
+               em = NULL;
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != EXTENT_MAP_HOLE) {
+               test_msg("Expected a hole, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+               test_msg("Vacancy flag wasn't set properly\n");
+               goto out;
+       }
+       free_extent_map(em);
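+       /*
+        * Drop the cached vacancy em before populating the leaf, otherwise
+        * the lookups below could keep hitting the cached hole instead of the
+        * new file extent items.
+        */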
+       btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+
+       /*
+        * All of the magic numbers are based on the mapping setup in
+        * setup_file_extents, so if you change anything there you need to
+        * update the comment and update the expected values below.
+        */
+       setup_file_extents(root);
+
+       em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != EXTENT_MAP_HOLE) {
+               test_msg("Expected a hole, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != 0 || em->len != 5) {
+               test_msg("Unexpected extent wanted start 0 len 5, got start "
+                        "%llu len %llu\n", em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != EXTENT_MAP_INLINE) {
+               test_msg("Expected an inline, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4091) {
+               test_msg("Unexpected extent wanted start %llu len 1, got start "
+                        "%llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       /*
+        * We don't test anything else for inline since it doesn't get set
+        * unless we have a page for it to write into.  Maybe we should change
+        * this?
+        */
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != EXTENT_MAP_HOLE) {
+               test_msg("Expected a hole, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4) {
+               test_msg("Unexpected extent wanted start %llu len 4, got start "
+                        "%llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       /* Regular extent */
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4095) {
+               test_msg("Unexpected extent wanted start %llu len 4095, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+                        em->orig_start);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       /* The next 3 are split extents */
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+                        em->orig_start);
+               goto out;
+       }
+       disk_bytenr = em->block_start;
+       orig_start = em->start;
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != EXTENT_MAP_HOLE) {
+               test_msg("Expected a hole, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 8192) {
+               test_msg("Unexpected extent wanted start %llu len 8192, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       if (em->orig_start != orig_start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n",
+                        orig_start, em->orig_start);
+               goto out;
+       }
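+       /*
+        * All three split pieces share one on-disk extent, so this piece's
+        * block start should be the original disk bytenr plus its offset into
+        * the extent (em->start - orig_start).
+        */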
+       disk_bytenr += (em->start - orig_start);
+       if (em->block_start != disk_bytenr) {
+               test_msg("Wrong block start, want %llu, have %llu\n",
+                        disk_bytenr, em->block_start);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       /* Prealloc extent */
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != prealloc_only) {
+               test_msg("Unexpected flags set, want %lu have %lu\n",
+                        prealloc_only, em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+                        em->orig_start);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       /* The next 3 are pieces of a half written prealloc extent */
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != prealloc_only) {
+               test_msg("Unexpected flags set, want %lu have %lu\n",
+                        prealloc_only, em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+                        em->orig_start);
+               goto out;
+       }
+       disk_bytenr = em->block_start;
+       orig_start = em->start;
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       if (em->orig_start != orig_start) {
+               test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+                        orig_start, em->orig_start);
+               goto out;
+       }
+       if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+               test_msg("Unexpected block start, wanted %llu, have %llu\n",
+                        disk_bytenr + (em->start - em->orig_start),
+                        em->block_start);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 8192) {
+               test_msg("Unexpected extent wanted start %llu len 8192, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != prealloc_only) {
+               test_msg("Unexpected flags set, want %lu have %lu\n",
+                        prealloc_only, em->flags);
+               goto out;
+       }
+       if (em->orig_start != orig_start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+                        em->orig_start);
+               goto out;
+       }
+       if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+               test_msg("Unexpected block start, wanted %llu, have %llu\n",
+                        disk_bytenr + (em->start - em->orig_start),
+                        em->block_start);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       /* Now for the compressed extent */
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 8192) {
+               test_msg("Unexpected extent wanted start %llu len 8192, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != compressed_only) {
+               test_msg("Unexpected flags set, want %lu have %lu\n",
+                        compressed_only, em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n",
+                        em->start, em->orig_start);
+               goto out;
+       }
+       if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+               test_msg("Unexpected compress type, wanted %d, got %d\n",
+                        BTRFS_COMPRESS_ZLIB, em->compress_type);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       /* Split compressed extent */
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != compressed_only) {
+               test_msg("Unexpected flags set, want %lu have %lu\n",
+                        compressed_only, em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n",
+                        em->start, em->orig_start);
+               goto out;
+       }
+       if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+               test_msg("Unexpected compress type, wanted %d, got %d\n",
+                        BTRFS_COMPRESS_ZLIB, em->compress_type);
+               goto out;
+       }
+       disk_bytenr = em->block_start;
+       orig_start = em->start;
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+                        em->orig_start);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != disk_bytenr) {
+               test_msg("Block start does not match, want %llu got %llu\n",
+                        disk_bytenr, em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 8192) {
+               test_msg("Unexpected extent wanted start %llu len 8192, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != compressed_only) {
+               test_msg("Unexpected flags set, want %lu have %lu\n",
+                        compressed_only, em->flags);
+               goto out;
+       }
+       if (em->orig_start != orig_start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n",
+                        em->start, orig_start);
+               goto out;
+       }
+       if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+               test_msg("Unexpected compress type, wanted %d, got %d\n",
+                        BTRFS_COMPRESS_ZLIB, em->compress_type);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       /* A hole between regular extents but no hole extent */
+       em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+                        em->orig_start);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != EXTENT_MAP_HOLE) {
+               test_msg("Expected a hole extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       /*
+        * Currently we just return the length that we requested rather than
+        * the length of the actual hole.  If this changes we'll have to change
+        * this test.
+        */
+       if (em->start != offset || em->len != 12288) {
+               test_msg("Unexpected extent wanted start %llu len 12288, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != vacancy_only) {
+               test_msg("Unexpected flags set, want %lu have %lu\n",
+                        vacancy_only, em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+                        em->orig_start);
+               goto out;
+       }
+       offset = em->start + em->len;
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != offset || em->len != 4096) {
+               test_msg("Unexpected extent wanted start %llu len 4096, got "
+                        "start %llu len %llu\n", offset, em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+               goto out;
+       }
+       if (em->orig_start != em->start) {
+               test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+                        em->orig_start);
+               goto out;
+       }
+       ret = 0;
+out:
+       if (!IS_ERR(em))
+               free_extent_map(em);
+       iput(inode);
+       free_dummy_root(root);
+       return ret;
+}
+
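+/*
+ * Make sure btrfs_get_extent can cope when the first thing it finds is a
+ * hole: the only on-disk extent below starts at 4096, so a lookup at offset
+ * 0 should come back as a vacancy-flagged hole rather than an error.
+ */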
+static int test_hole_first(void)
+{
+       struct inode *inode = NULL;
+       struct btrfs_root *root = NULL;
+       struct extent_map *em = NULL;
+       int ret = -ENOMEM;
+
+       inode = btrfs_new_test_inode();
+       if (!inode) {
+               test_msg("Couldn't allocate inode\n");
+               return ret;
+       }
+
+       BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+       BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+       BTRFS_I(inode)->location.offset = 0;
+
+       root = btrfs_alloc_dummy_root();
+       if (IS_ERR(root)) {
+               test_msg("Couldn't allocate root\n");
+               goto out;
+       }
+
+       root->fs_info = alloc_dummy_fs_info();
+       if (!root->fs_info) {
+               test_msg("Couldn't allocate dummy fs info\n");
+               goto out;
+       }
+
+       root->node = alloc_dummy_extent_buffer(0, 4096);
+       if (!root->node) {
+               test_msg("Couldn't allocate dummy buffer\n");
+               goto out;
+       }
+
+       extent_buffer_get(root->node);
+       btrfs_set_header_nritems(root->node, 0);
+       btrfs_set_header_level(root->node, 0);
+       BTRFS_I(inode)->root = root;
+       ret = -EINVAL;
+
+       /*
+        * Need a blank inode item here just so we don't confuse
+        * btrfs_get_extent.
+        */
+       insert_inode_item_key(root);
+       insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
+                     BTRFS_FILE_EXTENT_REG, 0, 1);
+       em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != EXTENT_MAP_HOLE) {
+               test_msg("Expected a hole, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != 0 || em->len != 4096) {
+               test_msg("Unexpected extent wanted start 0 len 4096, got start "
+                        "%llu len %llu\n", em->start, em->len);
+               goto out;
+       }
+       if (em->flags != vacancy_only) {
+               test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+                        em->flags);
+               goto out;
+       }
+       free_extent_map(em);
+
+       em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+       if (IS_ERR(em)) {
+               test_msg("Got an error when we shouldn't have\n");
+               goto out;
+       }
+       if (em->block_start != 4096) {
+               test_msg("Expected a real extent, got %llu\n", em->block_start);
+               goto out;
+       }
+       if (em->start != 4096 || em->len != 4096) {
+               test_msg("Unexpected extent wanted start 4096 len 4096, got "
+                        "start %llu len %llu\n", em->start, em->len);
+               goto out;
+       }
+       if (em->flags != 0) {
+               test_msg("Unexpected flags set, wanted 0 got %lu\n",
+                        em->flags);
+               goto out;
+       }
+       ret = 0;
+out:
+       if (!IS_ERR(em))
+               free_extent_map(em);
+       iput(inode);
+       free_dummy_root(root);
+       return ret;
+}
+
+int btrfs_test_inodes(void)
+{
+       int ret;
+
+       set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+       set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+       set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
+       test_msg("Running btrfs_get_extent tests\n");
+       ret = test_btrfs_get_extent();
+       if (ret)
+               return ret;
+       test_msg("Running hole first btrfs_get_extent test\n");
+       return test_hole_first();
+}
index 8c81bdc..57c16b4 100644 (file)
@@ -57,7 +57,7 @@ static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
                                           __TRANS_JOIN_NOLOCK),
 };
 
-static void put_transaction(struct btrfs_transaction *transaction)
+void btrfs_put_transaction(struct btrfs_transaction *transaction)
 {
        WARN_ON(atomic_read(&transaction->use_count) == 0);
        if (atomic_dec_and_test(&transaction->use_count)) {
@@ -332,7 +332,7 @@ static void wait_current_trans(struct btrfs_root *root)
                wait_event(root->fs_info->transaction_wait,
                           cur_trans->state >= TRANS_STATE_UNBLOCKED ||
                           cur_trans->aborted);
-               put_transaction(cur_trans);
+               btrfs_put_transaction(cur_trans);
        } else {
                spin_unlock(&root->fs_info->trans_lock);
        }
@@ -353,6 +353,17 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
        return 0;
 }
 
+static inline bool need_reserve_reloc_root(struct btrfs_root *root)
+{
+       if (!root->fs_info->reloc_ctl ||
+           !root->ref_cows ||
+           root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+           root->reloc_root)
+               return false;
+
+       return true;
+}
+
 static struct btrfs_trans_handle *
 start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
                  enum btrfs_reserve_flush_enum flush)
@@ -360,8 +371,9 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
        u64 num_bytes = 0;
-       int ret;
        u64 qgroup_reserved = 0;
+       bool reloc_reserved = false;
+       int ret;
 
        if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
                return ERR_PTR(-EROFS);
@@ -390,6 +402,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
                }
 
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+               /*
+                * Do the reservation for the relocation root creation
+                */
+               if (unlikely(need_reserve_reloc_root(root))) {
+                       num_bytes += root->nodesize;
+                       reloc_reserved = true;
+               }
+
                ret = btrfs_block_rsv_add(root,
                                          &root->fs_info->trans_block_rsv,
                                          num_bytes, flush);
@@ -451,6 +471,7 @@ again:
        h->delayed_ref_elem.seq = 0;
        h->type = type;
        h->allocating_chunk = false;
+       h->reloc_reserved = false;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
 
@@ -466,6 +487,7 @@ again:
                                              h->transid, num_bytes, 1);
                h->block_rsv = &root->fs_info->trans_block_rsv;
                h->bytes_reserved = num_bytes;
+               h->reloc_reserved = reloc_reserved;
        }
        h->qgroup_reserved = qgroup_reserved;
 
@@ -610,7 +632,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
        }
 
        wait_for_commit(root, cur_trans);
-       put_transaction(cur_trans);
+       btrfs_put_transaction(cur_trans);
 out:
        return ret;
 }
@@ -735,7 +757,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        smp_mb();
        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
-       put_transaction(cur_trans);
+       btrfs_put_transaction(cur_trans);
 
        if (current->journal_info == trans)
                current->journal_info = NULL;
@@ -744,8 +766,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                btrfs_run_delayed_iputs(root);
 
        if (trans->aborted ||
-           test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+           test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+               wake_up_process(info->transaction_kthread);
                err = -EIO;
+       }
        assert_qgroups_uptodate(trans);
 
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -948,16 +972,19 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                return ret;
 
        ret = btrfs_run_dev_stats(trans, root->fs_info);
-       WARN_ON(ret);
+       if (ret)
+               return ret;
        ret = btrfs_run_dev_replace(trans, root->fs_info);
-       WARN_ON(ret);
-
+       if (ret)
+               return ret;
        ret = btrfs_run_qgroups(trans, root->fs_info);
-       BUG_ON(ret);
+       if (ret)
+               return ret;
 
        /* run_qgroups might have added some more refs */
        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-       BUG_ON(ret);
+       if (ret)
+               return ret;
 
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
@@ -1510,7 +1537,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        if (current->journal_info == trans)
                current->journal_info = NULL;
 
-       put_transaction(cur_trans);
+       btrfs_put_transaction(cur_trans);
        return 0;
 }
 
@@ -1552,8 +1579,10 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
                root->fs_info->running_transaction = NULL;
        spin_unlock(&root->fs_info->trans_lock);
 
-       put_transaction(cur_trans);
-       put_transaction(cur_trans);
+       if (trans->type & __TRANS_FREEZABLE)
+               sb_end_intwrite(root->fs_info->sb);
+       btrfs_put_transaction(cur_trans);
+       btrfs_put_transaction(cur_trans);
 
        trace_btrfs_transaction_commit(root);
 
@@ -1571,15 +1600,19 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
        int ret;
 
        ret = btrfs_run_delayed_items(trans, root);
-       if (ret)
-               return ret;
-
        /*
         * running the delayed items may have added new refs. account
         * them now so that they hinder processing of more delayed refs
         * as little as possible.
         */
-       btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+       if (ret) {
+               btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+               return ret;
+       }
+
+       ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+       if (ret)
+               return ret;
 
        /*
         * rename don't use btrfs_join_transaction, so, once we
@@ -1596,14 +1629,14 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
        if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-               return btrfs_start_all_delalloc_inodes(fs_info, 1);
+               return btrfs_start_delalloc_roots(fs_info, 1);
        return 0;
 }
 
 static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
        if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-               btrfs_wait_all_ordered_extents(fs_info);
+               btrfs_wait_ordered_roots(fs_info, -1);
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1669,7 +1702,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
                wait_for_commit(root, cur_trans);
 
-               put_transaction(cur_trans);
+               btrfs_put_transaction(cur_trans);
 
                return ret;
        }
@@ -1686,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
                        wait_for_commit(root, prev_trans);
 
-                       put_transaction(prev_trans);
+                       btrfs_put_transaction(prev_trans);
                } else {
                        spin_unlock(&root->fs_info->trans_lock);
                }
@@ -1885,8 +1918,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        list_del_init(&cur_trans->list);
        spin_unlock(&root->fs_info->trans_lock);
 
-       put_transaction(cur_trans);
-       put_transaction(cur_trans);
+       btrfs_put_transaction(cur_trans);
+       btrfs_put_transaction(cur_trans);
 
        if (trans->type & __TRANS_FREEZABLE)
                sb_end_intwrite(root->fs_info->sb);
index 5c2af84..7657d11 100644 (file)
@@ -92,6 +92,7 @@ struct btrfs_trans_handle {
        short aborted;
        short adding_csums;
        bool allocating_chunk;
+       bool reloc_reserved;
        unsigned int type;
        /*
         * this root is only needed to validate that the root passed to
@@ -166,4 +167,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
                                struct extent_io_tree *dirty_pages, int mark);
 int btrfs_transaction_blocked(struct btrfs_fs_info *info);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void btrfs_put_transaction(struct btrfs_transaction *transaction);
 #endif
index 94e05c1..76928ca 100644 (file)
@@ -37,7 +37,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
        int ret = 0;
        int wret;
        int level;
-       int is_extent = 0;
        int next_key_ret = 0;
        u64 last_ret = 0;
        u64 min_trans = 0;
@@ -50,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                goto out;
        }
 
-       if (root->ref_cows == 0 && !is_extent)
+       if (root->ref_cows == 0)
                goto out;
 
        if (btrfs_test_opt(root, SSD))
@@ -85,7 +84,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
        path->keep_locks = 1;
 
-       ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
+       ret = btrfs_search_forward(root, &key, path, min_trans);
        if (ret < 0)
                goto out;
        if (ret > 0) {
index 79f057c..744553c 100644 (file)
@@ -26,7 +26,6 @@
 #include "locking.h"
 #include "print-tree.h"
 #include "backref.h"
-#include "compat.h"
 #include "tree-log.h"
 #include "hash.h"
 
@@ -936,7 +935,7 @@ again:
                                            parent_objectid,
                                            victim_name,
                                            victim_name_len)) {
-                               btrfs_inc_nlink(inode);
+                               inc_nlink(inode);
                                btrfs_release_path(path);
 
                                ret = btrfs_unlink_inode(trans, root, dir,
@@ -1006,7 +1005,7 @@ again:
                                victim_parent = read_one_inode(root,
                                                               parent_objectid);
                                if (victim_parent) {
-                                       btrfs_inc_nlink(inode);
+                                       inc_nlink(inode);
                                        btrfs_release_path(path);
 
                                        ret = btrfs_unlink_inode(trans, root,
@@ -1113,11 +1112,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct extent_buffer *eb, int slot,
                                  struct btrfs_key *key)
 {
-       struct inode *dir;
-       struct inode *inode;
+       struct inode *dir = NULL;
+       struct inode *inode = NULL;
        unsigned long ref_ptr;
        unsigned long ref_end;
-       char *name;
+       char *name = NULL;
        int namelen;
        int ret;
        int search_done = 0;
@@ -1150,13 +1149,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
         * care of the rest
         */
        dir = read_one_inode(root, parent_objectid);
-       if (!dir)
-               return -ENOENT;
+       if (!dir) {
+               ret = -ENOENT;
+               goto out;
+       }
 
        inode = read_one_inode(root, inode_objectid);
        if (!inode) {
-               iput(dir);
-               return -EIO;
+               ret = -EIO;
+               goto out;
        }
 
        while (ref_ptr < ref_end) {
@@ -1169,14 +1170,16 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                         */
                        if (!dir)
                                dir = read_one_inode(root, parent_objectid);
-                       if (!dir)
-                               return -ENOENT;
+                       if (!dir) {
+                               ret = -ENOENT;
+                               goto out;
+                       }
                } else {
                        ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
                                             &ref_index);
                }
                if (ret)
-                       return ret;
+                       goto out;
 
                /* if we already have a perfect match, we're done */
                if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
@@ -1196,12 +1199,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                                                      parent_objectid,
                                                      ref_index, name, namelen,
                                                      &search_done);
-                               if (ret == 1) {
-                                       ret = 0;
+                               if (ret) {
+                                       if (ret == 1)
+                                               ret = 0;
                                        goto out;
                                }
-                               if (ret)
-                                       goto out;
                        }
 
                        /* insert our name */
@@ -1215,6 +1217,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
                ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
                kfree(name);
+               name = NULL;
                if (log_ref_ver) {
                        iput(dir);
                        dir = NULL;
@@ -1225,6 +1228,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
        ret = overwrite_item(trans, root, path, eb, slot, key);
 out:
        btrfs_release_path(path);
+       kfree(name);
        iput(dir);
        iput(inode);
        return ret;
@@ -1307,6 +1311,7 @@ static int count_inode_refs(struct btrfs_root *root,
                                break;
                        path->slots[0]--;
                }
+process_slot:
                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                      path->slots[0]);
                if (key.objectid != ino ||
@@ -1327,6 +1332,10 @@ static int count_inode_refs(struct btrfs_root *root,
 
                if (key.offset == 0)
                        break;
+               if (path->slots[0] > 0) {
+                       path->slots[0]--;
+                       goto process_slot;
+               }
                key.offset--;
                btrfs_release_path(path);
        }
@@ -1480,7 +1489,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                if (!inode->i_nlink)
                        set_nlink(inode, 1);
                else
-                       btrfs_inc_nlink(inode);
+                       inc_nlink(inode);
                ret = btrfs_update_inode(trans, root, inode);
        } else if (ret == -EEXIST) {
                ret = 0;
@@ -1823,7 +1832,7 @@ again:
                                                     dir_key->offset,
                                                     name, name_len, 0);
                }
-               if (IS_ERR_OR_NULL(log_di)) {
+               if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
                        btrfs_dir_item_key_to_cpu(eb, di, &location);
                        btrfs_release_path(path);
                        btrfs_release_path(log_path);
@@ -1841,7 +1850,7 @@ again:
                                goto out;
                        }
 
-                       btrfs_inc_nlink(inode);
+                       inc_nlink(inode);
                        ret = btrfs_unlink_inode(trans, root, dir, inode,
                                                 name, name_len);
                        if (!ret)
@@ -1860,6 +1869,9 @@ again:
                                goto again;
                        ret = 0;
                        goto out;
+               } else if (IS_ERR(log_di)) {
+                       kfree(name);
+                       return PTR_ERR(log_di);
                }
                btrfs_release_path(log_path);
                kfree(name);
@@ -2118,8 +2130,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                WARN_ON(*level >= BTRFS_MAX_LEVEL);
                cur = path->nodes[*level];
 
-               if (btrfs_header_level(cur) != *level)
-                       WARN_ON(1);
+               WARN_ON(btrfs_header_level(cur) != *level);
 
                if (path->slots[*level] >=
                    btrfs_header_nritems(cur))
@@ -2151,11 +2162,13 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                                        return ret;
                                }
 
-                               btrfs_tree_lock(next);
-                               btrfs_set_lock_blocking(next);
-                               clean_tree_block(trans, root, next);
-                               btrfs_wait_tree_block_writeback(next);
-                               btrfs_tree_unlock(next);
+                               if (trans) {
+                                       btrfs_tree_lock(next);
+                                       btrfs_set_lock_blocking(next);
+                                       clean_tree_block(trans, root, next);
+                                       btrfs_wait_tree_block_writeback(next);
+                                       btrfs_tree_unlock(next);
+                               }
 
                                WARN_ON(root_owner !=
                                        BTRFS_TREE_LOG_OBJECTID);
@@ -2227,11 +2240,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
                                next = path->nodes[*level];
 
-                               btrfs_tree_lock(next);
-                               btrfs_set_lock_blocking(next);
-                               clean_tree_block(trans, root, next);
-                               btrfs_wait_tree_block_writeback(next);
-                               btrfs_tree_unlock(next);
+                               if (trans) {
+                                       btrfs_tree_lock(next);
+                                       btrfs_set_lock_blocking(next);
+                                       clean_tree_block(trans, root, next);
+                                       btrfs_wait_tree_block_writeback(next);
+                                       btrfs_tree_unlock(next);
+                               }
 
                                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
                                ret = btrfs_free_and_pin_reserved_extent(root,
@@ -2301,11 +2316,13 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
                        next = path->nodes[orig_level];
 
-                       btrfs_tree_lock(next);
-                       btrfs_set_lock_blocking(next);
-                       clean_tree_block(trans, log, next);
-                       btrfs_wait_tree_block_writeback(next);
-                       btrfs_tree_unlock(next);
+                       if (trans) {
+                               btrfs_tree_lock(next);
+                               btrfs_set_lock_blocking(next);
+                               clean_tree_block(trans, log, next);
+                               btrfs_wait_tree_block_writeback(next);
+                               btrfs_tree_unlock(next);
+                       }
 
                        WARN_ON(log->root_key.objectid !=
                                BTRFS_TREE_LOG_OBJECTID);
@@ -2571,9 +2588,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * the running transaction open, so a full commit can't hop
         * in and cause problems either.
         */
-       btrfs_scrub_pause_super(root);
        ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
-       btrfs_scrub_continue_super(root);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto out_wake_log_root;
@@ -2608,13 +2623,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
                .process_func = process_one_buffer
        };
 
-       if (trans) {
-               ret = walk_log_tree(trans, log, &wc);
-
-               /* I don't think this can happen but just in case */
-               if (ret)
-                       btrfs_abort_transaction(trans, log, ret);
-       }
+       ret = walk_log_tree(trans, log, &wc);
+       /* I don't think this can happen but just in case */
+       if (ret)
+               btrfs_abort_transaction(trans, log, ret);
 
        while (1) {
                ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -2867,7 +2879,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                          u64 min_offset, u64 *last_offset_ret)
 {
        struct btrfs_key min_key;
-       struct btrfs_key max_key;
        struct btrfs_root *log = root->log_root;
        struct extent_buffer *src;
        int err = 0;
@@ -2879,9 +2890,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
        u64 ino = btrfs_ino(inode);
 
        log = root->log_root;
-       max_key.objectid = ino;
-       max_key.offset = (u64)-1;
-       max_key.type = key_type;
 
        min_key.objectid = ino;
        min_key.type = key_type;
@@ -2889,8 +2897,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
        path->keep_locks = 1;
 
-       ret = btrfs_search_forward(root, &min_key, &max_key,
-                                  path, trans->transid);
+       ret = btrfs_search_forward(root, &min_key, path, trans->transid);
 
        /*
         * we didn't find anything from this transaction, see if there
@@ -2943,10 +2950,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
        /* find the first key from this transaction again */
        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
-       if (ret != 0) {
-               WARN_ON(1);
+       if (WARN_ON(ret != 0))
                goto done;
-       }
 
        /*
         * we have a block from this transaction, log every item in it
@@ -3172,11 +3177,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
                          struct inode *inode)
 {
        struct btrfs_inode_item *inode_item;
-       struct btrfs_key key;
        int ret;
 
-       memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
-       ret = btrfs_insert_empty_item(trans, log, path, &key,
+       ret = btrfs_insert_empty_item(trans, log, path,
+                                     &BTRFS_I(inode)->location,
                                      sizeof(*inode_item));
        if (ret && ret != -EEXIST)
                return ret;
@@ -3375,7 +3379,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
                btrfs_set_token_file_extent_type(leaf, fi,
                                                 BTRFS_FILE_EXTENT_REG,
                                                 &token);
-               if (em->block_start == 0)
+               if (em->block_start == EXTENT_MAP_HOLE)
                        skip_csum = true;
        }
 
@@ -3417,11 +3421,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
        if (skip_csum)
                return 0;
 
-       if (em->compress_type) {
-               csum_offset = 0;
-               csum_len = block_len;
-       }
-
        /*
         * First check and see if our csums are on our outstanding ordered
         * extents.
@@ -3505,8 +3504,13 @@ unlocked:
        if (!mod_len || ret)
                return ret;
 
-       csum_offset = mod_start - em->start;
-       csum_len = mod_len;
+       if (em->compress_type) {
+               csum_offset = 0;
+               csum_len = block_len;
+       } else {
+               csum_offset = mod_start - em->start;
+               csum_len = mod_len;
+       }
 
        /* block start is already adjusted for the file extent offset. */
        ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
@@ -3719,7 +3723,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
        while (1) {
                ins_nr = 0;
-               ret = btrfs_search_forward(root, &min_key, &max_key,
+               ret = btrfs_search_forward(root, &min_key,
                                           path, trans->transid);
                if (ret != 0)
                        break;
@@ -3769,14 +3773,14 @@ next_slot:
                }
                btrfs_release_path(path);
 
-               if (min_key.offset < (u64)-1)
+               if (min_key.offset < (u64)-1) {
                        min_key.offset++;
-               else if (min_key.type < (u8)-1)
+               } else if (min_key.type < max_key.type) {
                        min_key.type++;
-               else if (min_key.objectid < (u64)-1)
-                       min_key.objectid++;
-               else
+                       min_key.offset = 0;
+               } else {
                        break;
+               }
        }
        if (ins_nr) {
                ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
index dd0dea3..fbda900 100644 (file)
@@ -260,7 +260,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
 {
        struct btrfs_root *root = fs_info->uuid_root;
        struct btrfs_key key;
-       struct btrfs_key max_key;
        struct btrfs_path *path;
        int ret = 0;
        struct extent_buffer *leaf;
@@ -277,13 +276,10 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
        key.objectid = 0;
        key.type = 0;
        key.offset = 0;
-       max_key.objectid = (u64)-1;
-       max_key.type = (u8)-1;
-       max_key.offset = (u64)-1;
 
 again_search_slot:
        path->keep_locks = 1;
-       ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+       ret = btrfs_search_forward(root, &key, path, 0);
        if (ret) {
                if (ret > 0)
                        ret = 0;
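
Note: taken together, the log_dir_items(), btrfs_log_inode() and
btrfs_uuid_tree_iterate() hunks drop btrfs_search_forward()'s max_key
argument; the search now implicitly runs to the end of the tree. A
minimal sketch of the new calling convention, with a hypothetical
caller that is not part of this series:

    /* Walk forward from the smallest key, visiting only leaves newer
     * than min_transid, per the new four-argument signature. */
    static int walk_keys_since(struct btrfs_root *root,
                               struct btrfs_path *path, u64 min_transid)
    {
            struct btrfs_key key = {0};     /* objectid/type/offset = 0 */
            int ret;

            path->keep_locks = 1;
            ret = btrfs_search_forward(root, &key, path, min_transid);
            if (ret > 0)
                    ret = 0;        /* nothing newer than min_transid */
            return ret;
    }
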
index 043b215..0db6370 100644 (file)
@@ -28,7 +28,6 @@
 #include <linux/raid/pq.h>
 #include <linux/semaphore.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -666,7 +665,8 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                if (device->bdev)
                        fs_devices->open_devices--;
 
-               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+               if (device->writeable &&
+                   device->devid != BTRFS_DEV_REPLACE_DEVID) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }
@@ -2041,6 +2041,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        device->in_fs_metadata = 1;
        device->is_tgtdev_for_dev_replace = 0;
        device->mode = FMODE_EXCL;
+       device->dev_stats_valid = 1;
        set_blocksize(device->bdev, 4096);
 
        if (seeding_dev) {
@@ -2208,6 +2209,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
        device->in_fs_metadata = 1;
        device->is_tgtdev_for_dev_replace = 1;
        device->mode = FMODE_EXCL;
+       device->dev_stats_valid = 1;
        set_blocksize(device->bdev, 4096);
        device->fs_devices = fs_info->fs_devices;
        list_add(&device->dev_list, &fs_info->fs_devices->devices);
@@ -2550,8 +2552,7 @@ again:
                failed = 0;
                retried = true;
                goto again;
-       } else if (failed && retried) {
-               WARN_ON(1);
+       } else if (WARN_ON(failed && retried)) {
                ret = -ENOSPC;
        }
 error:
@@ -3423,6 +3424,9 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 {
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
        mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
                mutex_unlock(&fs_info->balance_mutex);
@@ -3488,7 +3492,7 @@ static int btrfs_uuid_scan_kthread(void *data)
        path->keep_locks = 1;
 
        while (1) {
-               ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+               ret = btrfs_search_forward(root, &key, path, 0);
                if (ret) {
                        if (ret > 0)
                                ret = 0;
@@ -4488,6 +4492,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
                btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
                            "%Lu-%Lu\n", logical, logical+len, em->start,
                            em->start + em->len);
+               free_extent_map(em);
                return 1;
        }
 
@@ -4668,6 +4673,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
                           "found %Lu-%Lu\n", logical, em->start,
                           em->start + em->len);
+               free_extent_map(em);
                return -EINVAL;
        }
 
@@ -4895,7 +4901,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                        num_stripes = map->num_stripes;
                        max_errors = nr_parity_stripes(map);
 
-                       raid_map = kmalloc(sizeof(u64) * num_stripes,
+                       raid_map = kmalloc_array(num_stripes, sizeof(u64),
                                           GFP_NOFS);
                        if (!raid_map) {
                                ret = -ENOMEM;
@@ -5395,10 +5401,8 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
                .bi_rw = bio->bi_rw,
        };
 
-       if (bio->bi_vcnt == 0) {
-               WARN_ON(1);
+       if (WARN_ON(bio->bi_vcnt == 0))
                return 1;
-       }
 
        prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
        if (bio_sectors(bio) > max_sectors)
@@ -5631,10 +5635,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
        struct btrfs_device *dev;
        u64 tmp;
 
-       if (!devid && !fs_info) {
-               WARN_ON(1);
+       if (WARN_ON(!devid && !fs_info))
                return ERR_PTR(-EINVAL);
-       }
 
        dev = __alloc_device();
        if (IS_ERR(dev))
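
Note: several hunks in this file (and in tree-log.c above) fold a bare
WARN_ON(1) inside an if into the condition itself. WARN_ON() evaluates
to its condition, so the transformation is behavior-preserving; a
minimal hypothetical sketch:

    static int check_nonneg(int v)
    {
            if (WARN_ON(v < 0))     /* warns with a backtrace, then branches */
                    return -EINVAL;
            return 0;
    }
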
index b72f540..8b3cd14 100644 (file)
@@ -43,9 +43,8 @@ struct btrfs_device {
        /* WRITE_SYNC bios */
        struct btrfs_pending_bios pending_sync_bios;
 
-       int running_pending;
        u64 generation;
-
+       int running_pending;
        int writeable;
        int in_fs_metadata;
        int missing;
@@ -53,11 +52,11 @@ struct btrfs_device {
        int is_tgtdev_for_dev_replace;
 
        spinlock_t io_lock;
+       /* the mode sent to blkdev_get */
+       fmode_t mode;
 
        struct block_device *bdev;
 
-       /* the mode sent to blkdev_get */
-       fmode_t mode;
 
        struct rcu_string *name;
 
@@ -78,16 +77,21 @@ struct btrfs_device {
 
        /* optimal io width for this device */
        u32 io_width;
+       /* type and info about this device */
+       u64 type;
 
        /* minimal io size for this device */
        u32 sector_size;
 
-       /* type and info about this device */
-       u64 type;
 
        /* physical drive uuid (or lvm uuid) */
        u8 uuid[BTRFS_UUID_SIZE];
 
+       /* for sending down flush barriers */
+       int nobarriers;
+       struct bio *flush_bio;
+       struct completion flush_wait;
+
        /* per-device scrub information */
        struct scrub_ctx *scrub_device;
 
@@ -103,10 +107,6 @@ struct btrfs_device {
        struct radix_tree_root reada_zones;
        struct radix_tree_root reada_extents;
 
-       /* for sending down flush barriers */
-       struct bio *flush_bio;
-       struct completion flush_wait;
-       int nobarriers;
 
        /* disk I/O failure stats. For detailed description refer to
         * enum btrfs_dev_stat_values in ioctl.h */
@@ -132,7 +132,9 @@ struct btrfs_fs_devices {
 
        /* all of the devices in the FS, protected by a mutex
         * so we can safely walk it to write out the supers without
-        * worrying about add/remove by the multi-device code
+        * worrying about add/remove by the multi-device code.
+        * Scrubbing the superblocks can kick off superblock writes
+        * while holding this mutex lock.
         */
        struct mutex device_list_mutex;
        struct list_head devices;
index 000eae2..2f6735d 100644 (file)
@@ -392,7 +392,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 
                wait_for_completion(&ecr->completion);
                rc = ecr->rc;
-               INIT_COMPLETION(ecr->completion);
+               reinit_completion(&ecr->completion);
        }
 out:
        ablkcipher_request_free(req);
index efc85b1..3c6136f 100644 (file)
@@ -129,7 +129,7 @@ static int can_set_xattr(struct inode *inode, const char *name,
 
 static void hfsplus_init_header_node(struct inode *attr_file,
                                        u32 clump_size,
-                                       char *buf, size_t node_size)
+                                       char *buf, u16 node_size)
 {
        struct hfs_bnode_desc *desc;
        struct hfs_btree_header_rec *head;
@@ -139,8 +139,9 @@ static void hfsplus_init_header_node(struct inode *attr_file,
        char *bmp;
        u32 used_nodes;
        u32 used_bmp_bytes;
+       loff_t tmp;
 
-       hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %zu\n",
+       hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n",
                                clump_size, node_size);
 
        /* The end of the node contains list of record offsets */
@@ -154,7 +155,9 @@ static void hfsplus_init_header_node(struct inode *attr_file,
 
        head = (struct hfs_btree_header_rec *)(buf + offset);
        head->node_size = cpu_to_be16(node_size);
-       head->node_count = cpu_to_be32(i_size_read(attr_file) / node_size);
+       tmp = i_size_read(attr_file);
+       do_div(tmp, node_size);
+       head->node_count = cpu_to_be32(tmp);
        head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1);
        head->clump_size = cpu_to_be32(clump_size);
        head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS);
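
Note: the tmp/do_div() dance above exists because dividing a 64-bit
loff_t by node_size with plain '/' would require libgcc helpers on
32-bit builds. do_div() divides in place and returns the remainder; a
hypothetical condensed form of the calculation:

    static u32 attr_node_count(struct inode *attr_file, u16 node_size)
    {
            loff_t tmp = i_size_read(attr_file);    /* 64-bit size */

            do_div(tmp, node_size);                 /* tmp /= node_size */
            return (u32)tmp;
    }
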
index c8e729d..74a7e12 100644 (file)
@@ -244,7 +244,7 @@ static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
        set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
        spin_lock(&tbl->slot_tbl_lock);
        if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
-               INIT_COMPLETION(tbl->complete);
+               reinit_completion(&tbl->complete);
                spin_unlock(&tbl->slot_tbl_lock);
                return wait_for_completion_interruptible(&tbl->complete);
        }
index 3a44a64..3407b2c 100644 (file)
@@ -1304,7 +1304,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
 {
        wait_for_completion(&mw->mw_complete);
        /* Re-arm the completion in case we want to wait on it again */
-       INIT_COMPLETION(mw->mw_complete);
+       reinit_completion(&mw->mw_complete);
        return mw->mw_status;
 }
 
@@ -1355,7 +1355,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
        else
                ret = mw->mw_status;
        /* Re-arm the completion in case we want to wait on it again */
-       INIT_COMPLETION(mw->mw_complete);
+       reinit_completion(&mw->mw_complete);
        return ret;
 }
 
index b701eaa..51942d5 100644 (file)
@@ -29,7 +29,6 @@ static int show_console_dev(struct seq_file *m, void *v)
        char flags[ARRAY_SIZE(con_flags) + 1];
        struct console *con = v;
        unsigned int a;
-       int len;
        dev_t dev = 0;
 
        if (con->device) {
@@ -47,11 +46,10 @@ static int show_console_dev(struct seq_file *m, void *v)
                        con_flags[a].name : ' ';
        flags[a] = 0;
 
-       seq_printf(m, "%s%d%n", con->name, con->index, &len);
-       len = 21 - len;
-       if (len < 1)
-               len = 1;
-       seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
+       seq_setwidth(m, 21 - 1);
+       seq_printf(m, "%s%d", con->name, con->index);
+       seq_pad(m, ' ');
+       seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
                        con->write ? 'W' : '-', con->unblank ? 'U' : '-',
                        flags);
        if (dev)
index c805d5b..a77d2b2 100644 (file)
@@ -1,8 +1,8 @@
 #include <linux/fs.h>
-#include <linux/hugetlb.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
index ccfd99b..5f9bc8a 100644 (file)
@@ -39,7 +39,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
        unsigned long ino = 0;
        struct file *file;
        dev_t dev = 0;
-       int flags, len;
+       int flags;
 
        flags = region->vm_flags;
        file = region->vm_file;
@@ -50,8 +50,9 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
                ino = inode->i_ino;
        }
 
+       seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
        seq_printf(m,
-                  "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+                  "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
                   region->vm_start,
                   region->vm_end,
                   flags & VM_READ ? 'r' : '-',
@@ -59,13 +60,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
                   flags & VM_EXEC ? 'x' : '-',
                   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
                   ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
-                  MAJOR(dev), MINOR(dev), ino, &len);
+                  MAJOR(dev), MINOR(dev), ino);
 
        if (file) {
-               len = 25 + sizeof(void *) * 6 - len;
-               if (len < 1)
-                       len = 1;
-               seq_printf(m, "%*c", len, ' ');
+               seq_pad(m, ' ');
                seq_path(m, &file->f_path, "");
        }
 
index abbe825..fb52b54 100644 (file)
@@ -62,7 +62,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                total_rss << (PAGE_SHIFT-10),
                data << (PAGE_SHIFT-10),
                mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+               (PTRS_PER_PTE * sizeof(pte_t) *
+                atomic_long_read(&mm->nr_ptes)) >> 10,
                swap << (PAGE_SHIFT-10));
 }
 
@@ -83,14 +84,6 @@ unsigned long task_statm(struct mm_struct *mm,
        return mm->total_vm;
 }
 
-static void pad_len_spaces(struct seq_file *m, int len)
-{
-       len = 25 + sizeof(void*) * 6 - len;
-       if (len < 1)
-               len = 1;
-       seq_printf(m, "%*c", len, ' ');
-}
-
 #ifdef CONFIG_NUMA
 /*
  * These functions are for numa_maps but called in generic **maps seq_file
@@ -268,7 +261,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
        unsigned long long pgoff = 0;
        unsigned long start, end;
        dev_t dev = 0;
-       int len;
        const char *name = NULL;
 
        if (file) {
@@ -286,7 +278,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
        if (stack_guard_page_end(vma, end))
                end -= PAGE_SIZE;
 
-       seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+       seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+       seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
                        start,
                        end,
                        flags & VM_READ ? 'r' : '-',
@@ -294,14 +287,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
                        flags & VM_EXEC ? 'x' : '-',
                        flags & VM_MAYSHARE ? 's' : 'p',
                        pgoff,
-                       MAJOR(dev), MINOR(dev), ino, &len);
+                       MAJOR(dev), MINOR(dev), ino);
 
        /*
         * Print the dentry name for named mappings, and a
         * special [heap] marker for the heap:
         */
        if (file) {
-               pad_len_spaces(m, len);
+               seq_pad(m, ' ');
                seq_path(m, &file->f_path, "\n");
                goto done;
        }
@@ -333,7 +326,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
                                name = "[stack]";
                        } else {
                                /* Thread stack in /proc/PID/maps */
-                               pad_len_spaces(m, len);
+                               seq_pad(m, ' ');
                                seq_printf(m, "[stack:%d]", tid);
                        }
                }
@@ -341,7 +334,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
 done:
        if (name) {
-               pad_len_spaces(m, len);
+               seq_pad(m, ' ');
                seq_puts(m, name);
        }
        seq_putc(m, '\n');
@@ -505,9 +498,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (pmd_trans_huge_lock(pmd, vma) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
-               spin_unlock(&walk->mm->page_table_lock);
+               spin_unlock(ptl);
                mss->anonymous_thp += HPAGE_PMD_SIZE;
                return 0;
        }
@@ -998,13 +991,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 {
        struct vm_area_struct *vma;
        struct pagemapread *pm = walk->private;
+       spinlock_t *ptl;
        pte_t *pte;
        int err = 0;
        pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
        /* find the first VMA at or above 'addr' */
        vma = find_vma(walk->mm, addr);
-       if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+       if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                int pmd_flags2;
 
                if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1022,7 +1016,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                        if (err)
                                break;
                }
-               spin_unlock(&walk->mm->page_table_lock);
+               spin_unlock(ptl);
                return err;
        }
 
@@ -1324,7 +1318,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
        md = walk->private;
 
-       if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+       if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
                pte_t huge_pte = *(pte_t *)pmd;
                struct page *page;
 
@@ -1332,7 +1326,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
                if (page)
                        gather_stats(page, md, pte_dirty(huge_pte),
                                     HPAGE_PMD_SIZE/PAGE_SIZE);
-               spin_unlock(&walk->mm->page_table_lock);
+               spin_unlock(ptl);
                return 0;
        }
 
index 56123a6..678455d 100644 (file)
@@ -123,14 +123,6 @@ unsigned long task_statm(struct mm_struct *mm,
        return size;
 }
 
-static void pad_len_spaces(struct seq_file *m, int len)
-{
-       len = 25 + sizeof(void*) * 6 - len;
-       if (len < 1)
-               len = 1;
-       seq_printf(m, "%*c", len, ' ');
-}
-
 /*
  * display a single VMA to a sequenced file
  */
@@ -142,7 +134,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
        unsigned long ino = 0;
        struct file *file;
        dev_t dev = 0;
-       int flags, len;
+       int flags;
        unsigned long long pgoff = 0;
 
        flags = vma->vm_flags;
@@ -155,8 +147,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
                pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
        }
 
+       seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
        seq_printf(m,
-                  "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+                  "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
                   vma->vm_start,
                   vma->vm_end,
                   flags & VM_READ ? 'r' : '-',
@@ -164,16 +157,16 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
                   flags & VM_EXEC ? 'x' : '-',
                   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
                   pgoff,
-                  MAJOR(dev), MINOR(dev), ino, &len);
+                  MAJOR(dev), MINOR(dev), ino);
 
        if (file) {
-               pad_len_spaces(m, len);
+               seq_pad(m, ' ');
                seq_path(m, &file->f_path, "");
        } else if (mm) {
                pid_t tid = vm_is_stack(priv->task, vma, is_pid);
 
                if (tid != 0) {
-                       pad_len_spaces(m, len);
+                       seq_pad(m, ' ');
                        /*
                         * Thread stack in /proc/PID/task/TID/maps or
                         * the main process stack.
index a290157..1cd2388 100644 (file)
@@ -766,6 +766,21 @@ int seq_write(struct seq_file *seq, const void *data, size_t len)
 }
 EXPORT_SYMBOL(seq_write);
 
+/**
+ * seq_pad - write padding spaces to buffer
+ * @m: seq_file identifying the buffer to which data should be written
+ * @c: the byte to append after padding if non-zero
+ *
+ * Pads the buffer with spaces up to the width set with seq_setwidth(),
+ * then appends @c if it is non-zero.
+ */
+void seq_pad(struct seq_file *m, char c)
+{
+       int size = m->pad_until - m->count;
+       if (size > 0)
+               seq_printf(m, "%*s", size, "");
+       if (c)
+               seq_putc(m, c);
+}
+EXPORT_SYMBOL(seq_pad);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
        struct list_head *lh;
index 83e2c31..bc2121f 100644 (file)
 #define KERNEL_CTORS() . = ALIGN(8);                      \
                        VMLINUX_SYMBOL(__ctors_start) = .; \
                        *(.ctors)                          \
+                       *(.init_array)                     \
                        VMLINUX_SYMBOL(__ctors_end) = .;
 #else
 #define KERNEL_CTORS()
index 98e892e..a0f9280 100644 (file)
@@ -8,6 +8,8 @@
 #define CMDLINEPARSEH
 
 #include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
 
 /* partition flags */
 #define PF_RDONLY                   0x01 /* Device is read only */
index 22c33e3..5d5aaae 100644 (file)
@@ -19,8 +19,8 @@
  *
  * See also:  complete(), wait_for_completion() (and friends _timeout,
  * _interruptible, _interruptible_timeout, and _killable), init_completion(),
- * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
- * INIT_COMPLETION().
+ * reinit_completion(), and the macros DECLARE_COMPLETION() and
+ * DECLARE_COMPLETION_ONSTACK().
  */
 struct completion {
        unsigned int done;
@@ -65,7 +65,7 @@ struct completion {
 
 /**
  * init_completion - Initialize a dynamically allocated completion
- * @x:  completion structure that is to be initialized
+ * @x:  pointer to completion structure that is to be initialized
  *
  * This inline function will initialize a dynamically created completion
  * structure.
@@ -76,6 +76,18 @@ static inline void init_completion(struct completion *x)
        init_waitqueue_head(&x->wait);
 }
 
+/**
+ * reinit_completion - reinitialize a completion structure
+ * @x:  pointer to completion structure that is to be reinitialized
+ *
+ * This inline function should be used to reinitialize a completion
+ * structure so it can be reused. This is especially important after
+ * complete_all() is used.
+ */
+static inline void reinit_completion(struct completion *x)
+{
+       x->done = 0;
+}
+
 extern void wait_for_completion(struct completion *);
 extern void wait_for_completion_io(struct completion *);
 extern int wait_for_completion_interruptible(struct completion *x);
@@ -94,14 +106,4 @@ extern bool completion_done(struct completion *x);
 extern void complete(struct completion *);
 extern void complete_all(struct completion *);
 
-/**
- * INIT_COMPLETION - reinitialize a completion structure
- * @x:  completion structure to be reinitialized
- *
- * This macro should be used to reinitialize a completion structure so it can
- * be reused. This is especially important after complete_all() is used.
- */
-#define INIT_COMPLETION(x)     ((x).done = 0)
-
-
 #endif
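
Note: the ecryptfs, NFS and ocfs2 hunks above all follow the same
mechanical conversion: the removed INIT_COMPLETION() macro took the
completion by name, while reinit_completion() takes a pointer, matching
init_completion(). A hypothetical before/after:

    #include <linux/completion.h>

    struct demo_ctx {
            struct completion done;
    };

    static void demo_wait_and_rearm(struct demo_ctx *ctx)
    {
            wait_for_completion(&ctx->done);
            /* was: INIT_COMPLETION(ctx->done); */
            reinit_completion(&ctx->done);  /* re-arm for reuse */
    }
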
index 412cd50..3f2793d 100644 (file)
@@ -43,7 +43,7 @@ extern struct module __this_module;
 /* Mark the CRC weak since genksyms apparently decides not to
  * generate a checksums for some symbols */
 #define __CRC_SYMBOL(sym, sec)                                 \
-       extern void *__crc_##sym __attribute__((weak));         \
+       extern __visible void *__crc_##sym __attribute__((weak));               \
        static const unsigned long __kcrctab_##sym              \
        __used                                                  \
        __attribute__((section("___kcrctab" sec "+" #sym), unused))     \
@@ -59,7 +59,7 @@ extern struct module __this_module;
        static const char __kstrtab_##sym[]                     \
        __attribute__((section("__ksymtab_strings"), aligned(1))) \
        = VMLINUX_SYMBOL_STR(sym);                              \
-       static const struct kernel_symbol __ksymtab_##sym       \
+       __visible const struct kernel_symbol __ksymtab_##sym    \
        __used                                                  \
        __attribute__((section("___ksymtab" sec "+" #sym), unused))     \
        = { (unsigned long)&sym, __kstrtab_##sym }
index 3935428..91672e2 100644 (file)
@@ -54,7 +54,8 @@ enum page_check_address_pmd_flag {
 extern pmd_t *page_check_address_pmd(struct page *page,
                                     struct mm_struct *mm,
                                     unsigned long address,
-                                    enum page_check_address_pmd_flag flag);
+                                    enum page_check_address_pmd_flag flag,
+                                    spinlock_t **ptl);
 
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -129,15 +130,15 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
                                    unsigned long start,
                                    unsigned long end,
                                    long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd,
-                                struct vm_area_struct *vma);
+extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+               spinlock_t **ptl);
 /* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd,
-                                     struct vm_area_struct *vma)
+static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+               spinlock_t **ptl)
 {
        VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
        if (pmd_trans_huge(*pmd))
-               return __pmd_trans_huge_lock(pmd, vma);
+               return __pmd_trans_huge_lock(pmd, vma, ptl);
        else
                return 0;
 }
@@ -215,8 +216,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         long adjust_next)
 {
 }
-static inline int pmd_trans_huge_lock(pmd_t *pmd,
-                                     struct vm_area_struct *vma)
+static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+               spinlock_t **ptl)
 {
        return 0;
 }
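
Note: the fs/proc/task_mmu.c hunks above show the new convention in
use: pmd_trans_huge_lock() now hands the lock guarding the huge pmd
back through *ptl instead of callers unlocking mm->page_table_lock
directly. A hypothetical walker skeleton:

    static int demo_walk_pmd(pmd_t *pmd, struct vm_area_struct *vma)
    {
            spinlock_t *ptl;

            if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                    /* ... operate on the huge pmd while ptl is held ... */
                    spin_unlock(ptl);
                    return 0;
            }
            return 1;       /* fall back to the pte-level walk */
    }
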
index 0393270..acd2010 100644 (file)
@@ -392,6 +392,15 @@ static inline int hugepage_migration_support(struct hstate *h)
        return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT);
 }
 
+static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+                                          struct mm_struct *mm, pte_t *pte)
+{
+       if (huge_page_size(h) == PMD_SIZE)
+               return pmd_lockptr(mm, (pmd_t *) pte);
+       VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
+       return &mm->page_table_lock;
+}
+
 #else  /* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page_node(h, nid) NULL
@@ -401,6 +410,7 @@ struct hstate {};
 #define hstate_sizelog(s) NULL
 #define hstate_vma(v) NULL
 #define hstate_inode(i) NULL
+#define page_hstate(page) NULL
 #define huge_page_size(h) PAGE_SIZE
 #define huge_page_mask(h) PAGE_MASK
 #define vma_kernel_pagesize(v) PAGE_SIZE
@@ -421,6 +431,22 @@ static inline pgoff_t basepage_index(struct page *page)
 #define dissolve_free_huge_pages(s, e) do {} while (0)
 #define pmd_huge_support()     0
 #define hugepage_migration_support(h)  0
+
+static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+                                          struct mm_struct *mm, pte_t *pte)
+{
+       return &mm->page_table_lock;
+}
 #endif /* CONFIG_HUGETLB_PAGE */
 
+static inline spinlock_t *huge_pte_lock(struct hstate *h,
+                                       struct mm_struct *mm, pte_t *pte)
+{
+       spinlock_t *ptl;
+
+       ptl = huge_pte_lockptr(h, mm, pte);
+       spin_lock(ptl);
+       return ptl;
+}
+
 #endif /* _LINUX_HUGETLB_H */
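
Note: per the hunk above, huge_pte_lockptr() returns the split pmd lock
for PMD-sized hugepages and falls back to mm->page_table_lock for other
page sizes (and for !CONFIG_HUGETLB_PAGE); huge_pte_lock() also
acquires it. A hypothetical caller:

    static void demo_touch_huge_pte(struct hstate *h, struct mm_struct *mm,
                                    pte_t *ptep)
    {
            spinlock_t *ptl = huge_pte_lock(h, mm, ptep);

            /* ... inspect or update *ptep under the correct lock ... */
            spin_unlock(ptl);
    }
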
index c9e831d..db43b58 100644 (file)
@@ -11,8 +11,6 @@
 #include <linux/irqnr.h>
 #include <linux/hardirq.h>
 #include <linux/irqflags.h>
-#include <linux/smp.h>
-#include <linux/percpu.h>
 #include <linux/hrtimer.h>
 #include <linux/kref.h>
 #include <linux/workqueue.h>
@@ -392,15 +390,6 @@ extern void __raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 
-/* This is the worklist that queues up per-cpu softirq work.
- *
- * send_remote_sendirq() adds work to these lists, and
- * the softirq handler itself dequeues from them.  The queues
- * are protected by disabling local cpu interrupts and they must
- * only be accessed by the local cpu that they are for.
- */
-DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
-
 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
 
 static inline struct task_struct *this_cpu_ksoftirqd(void)
@@ -408,17 +397,6 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
        return this_cpu_read(ksoftirqd);
 }
 
-/* Try to send a softirq to a remote cpu.  If this cannot be done, the
- * work will be queued to the local cpu.
- */
-extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq);
-
-/* Like send_remote_softirq(), but the caller must disable local cpu interrupts
- * and compute the current cpu, passed in as 'this_cpu'.
- */
-extern void __send_remote_softirq(struct call_single_data *cp, int cpu,
-                                 int this_cpu, int softirq);
-
 /* Tasklets --- multithreaded analogue of BHs.
 
    Main feature differing them of generic softirqs: tasklet
index 7ea319e..a444c79 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/types.h>
+#include <trace/events/iommu.h>
 
 #define IOMMU_READ     (1)
 #define IOMMU_WRITE    (2)
@@ -227,6 +228,7 @@ static inline int report_iommu_fault(struct iommu_domain *domain,
                ret = domain->handler(domain, dev, iova, flags,
                                                domain->handler_token);
 
+       trace_io_page_fault(dev, iova, flags);
        return ret;
 }
 
index 10308c6..552d51e 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * A generic kernel FIFO implementation
  *
- * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
+ * Copyright (C) 2013 Stefani Seibold <stefani@seibold.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -67,9 +67,10 @@ struct __kfifo {
        union { \
                struct __kfifo  kfifo; \
                datatype        *type; \
+               const datatype  *const_type; \
                char            (*rectype)[recsize]; \
                ptrtype         *ptr; \
-               const ptrtype   *ptr_const; \
+               ptrtype const   *ptr_const; \
        }
 
 #define __STRUCT_KFIFO(type, size, recsize, ptrtype) \
@@ -386,16 +387,12 @@ __kfifo_int_must_check_helper( \
 #define        kfifo_put(fifo, val) \
 ({ \
        typeof((fifo) + 1) __tmp = (fifo); \
-       typeof((val) + 1) __val = (val); \
+       typeof(*__tmp->const_type) __val = (val); \
        unsigned int __ret; \
-       const size_t __recsize = sizeof(*__tmp->rectype); \
+       size_t __recsize = sizeof(*__tmp->rectype); \
        struct __kfifo *__kfifo = &__tmp->kfifo; \
-       if (0) { \
-               typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \
-               __dummy = (typeof(__val))NULL; \
-       } \
        if (__recsize) \
-               __ret = __kfifo_in_r(__kfifo, __val, sizeof(*__val), \
+               __ret = __kfifo_in_r(__kfifo, &__val, sizeof(__val), \
                        __recsize); \
        else { \
                __ret = !kfifo_is_full(__tmp); \
@@ -404,7 +401,7 @@ __kfifo_int_must_check_helper( \
                        ((typeof(__tmp->type))__kfifo->data) : \
                        (__tmp->buf) \
                        )[__kfifo->in & __tmp->kfifo.mask] = \
-                               *(typeof(__tmp->type))__val; \
+                               (typeof(*__tmp->type))__val; \
                        smp_wmb(); \
                        __kfifo->in++; \
                } \
@@ -415,7 +412,7 @@ __kfifo_int_must_check_helper( \
 /**
  * kfifo_get - get data from the fifo
  * @fifo: address of the fifo to be used
- * @val: the var where to store the data to be added
+ * @val: address where to store the data
  *
  * This macro reads the data from the fifo.
  * It returns 0 if the fifo was empty. Otherwise it returns the number
@@ -428,12 +425,10 @@ __kfifo_int_must_check_helper( \
 __kfifo_uint_must_check_helper( \
 ({ \
        typeof((fifo) + 1) __tmp = (fifo); \
-       typeof((val) + 1) __val = (val); \
+       typeof(__tmp->ptr) __val = (val); \
        unsigned int __ret; \
        const size_t __recsize = sizeof(*__tmp->rectype); \
        struct __kfifo *__kfifo = &__tmp->kfifo; \
-       if (0) \
-               __val = (typeof(__tmp->ptr))0; \
        if (__recsize) \
                __ret = __kfifo_out_r(__kfifo, __val, sizeof(*__val), \
                        __recsize); \
@@ -456,7 +451,7 @@ __kfifo_uint_must_check_helper( \
 /**
  * kfifo_peek - get data from the fifo without removing
  * @fifo: address of the fifo to be used
- * @val: the var where to store the data to be added
+ * @val: address where to store the data
  *
  * This reads the data from the fifo without removing it from the fifo.
  * It returns 0 if the fifo was empty. Otherwise it returns the number
@@ -469,12 +464,10 @@ __kfifo_uint_must_check_helper( \
 __kfifo_uint_must_check_helper( \
 ({ \
        typeof((fifo) + 1) __tmp = (fifo); \
-       typeof((val) + 1) __val = (val); \
+       typeof(__tmp->ptr) __val = (val); \
        unsigned int __ret; \
        const size_t __recsize = sizeof(*__tmp->rectype); \
        struct __kfifo *__kfifo = &__tmp->kfifo; \
-       if (0) \
-               __val = (typeof(__tmp->ptr))NULL; \
        if (__recsize) \
                __ret = __kfifo_out_peek_r(__kfifo, __val, sizeof(*__val), \
                        __recsize); \
@@ -508,14 +501,10 @@ __kfifo_uint_must_check_helper( \
 #define        kfifo_in(fifo, buf, n) \
 ({ \
        typeof((fifo) + 1) __tmp = (fifo); \
-       typeof((buf) + 1) __buf = (buf); \
+       typeof(__tmp->ptr_const) __buf = (buf); \
        unsigned long __n = (n); \
        const size_t __recsize = sizeof(*__tmp->rectype); \
        struct __kfifo *__kfifo = &__tmp->kfifo; \
-       if (0) { \
-               typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \
-               __dummy = (typeof(__buf))NULL; \
-       } \
        (__recsize) ?\
        __kfifo_in_r(__kfifo, __buf, __n, __recsize) : \
        __kfifo_in(__kfifo, __buf, __n); \
@@ -561,14 +550,10 @@ __kfifo_uint_must_check_helper( \
 __kfifo_uint_must_check_helper( \
 ({ \
        typeof((fifo) + 1) __tmp = (fifo); \
-       typeof((buf) + 1) __buf = (buf); \
+       typeof(__tmp->ptr) __buf = (buf); \
        unsigned long __n = (n); \
        const size_t __recsize = sizeof(*__tmp->rectype); \
        struct __kfifo *__kfifo = &__tmp->kfifo; \
-       if (0) { \
-               typeof(__tmp->ptr) __dummy = NULL; \
-               __buf = __dummy; \
-       } \
        (__recsize) ?\
        __kfifo_out_r(__kfifo, __buf, __n, __recsize) : \
        __kfifo_out(__kfifo, __buf, __n); \
@@ -773,14 +758,10 @@ __kfifo_uint_must_check_helper( \
 __kfifo_uint_must_check_helper( \
 ({ \
        typeof((fifo) + 1) __tmp = (fifo); \
-       typeof((buf) + 1) __buf = (buf); \
+       typeof(__tmp->ptr) __buf = (buf); \
        unsigned long __n = (n); \
        const size_t __recsize = sizeof(*__tmp->rectype); \
        struct __kfifo *__kfifo = &__tmp->kfifo; \
-       if (0) { \
-               typeof(__tmp->ptr) __dummy __attribute__ ((unused)) = NULL; \
-               __buf = __dummy; \
-       } \
        (__recsize) ? \
        __kfifo_out_peek_r(__kfifo, __buf, __n, __recsize) : \
        __kfifo_out_peek(__kfifo, __buf, __n); \
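
Note: the macro rework above means kfifo_put() now takes the value
itself (it is copied into a temporary of the fifo's element type),
while kfifo_get()/kfifo_peek() still take the address to store into.
A hypothetical sketch:

    #include <linux/kfifo.h>

    static DEFINE_KFIFO(demo_fifo, int, 16);

    static void demo_kfifo(void)
    {
            int v;

            kfifo_put(&demo_fifo, 42);      /* value, no longer &value */
            if (kfifo_get(&demo_fifo, &v))
                    pr_info("got %d\n", v);
    }
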
index 0fbbc7a..9523d2a 100644 (file)
@@ -142,7 +142,7 @@ struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
 
-extern raw_spinlock_t kvm_lock;
+extern spinlock_t kvm_lock;
 extern struct list_head vm_list;
 
 struct kvm_io_range {
@@ -189,8 +189,7 @@ struct kvm_async_pf {
        gva_t gva;
        unsigned long addr;
        struct kvm_arch_async_pf arch;
-       struct page *page;
-       bool done;
+       bool   wakeup_all;
 };
 
 void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
@@ -508,9 +507,10 @@ int kvm_set_memory_region(struct kvm *kvm,
                          struct kvm_userspace_memory_region *mem);
 int __kvm_set_memory_region(struct kvm *kvm,
                            struct kvm_userspace_memory_region *mem);
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont);
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+                           unsigned long npages);
 void kvm_arch_memslots_updated(struct kvm *kvm);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
@@ -671,6 +671,25 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
+#ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm);
+#else
+static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
+{
+}
+
+static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
+{
+}
+
+static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
+{
+       return false;
+}
+#endif
+
 static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_WQP
@@ -747,9 +766,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
-/* For vcpu->arch.iommu_flags */
-#define KVM_IOMMU_CACHE_COHERENCY      0x1
-
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
@@ -789,7 +805,7 @@ static inline void kvm_guest_enter(void)
 
        /* KVM does not hold any references to rcu protected data when it
         * switches CPU into a guest mode. In fact switching to a guest mode
-        * is very similar to exiting to userspase from rcu point of view. In
+        * is very similar to exiting to userspace from rcu point of view. In
         * addition CPU may stay in a guest mode for quite a long time (up to
         * one time slice). Lets treat guest mode as quiescent state, just like
         * we do with user-mode execution.
@@ -842,13 +858,6 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
        return gfn_to_memslot(kvm, gfn)->id;
 }
 
-static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
-{
-       /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
-       return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-               (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-}
-
 static inline gfn_t
 hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
 {
@@ -1066,6 +1075,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp);
 
 extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
+extern struct kvm_device_ops kvm_vfio_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
index 8828a78..fbf10a0 100644 (file)
@@ -195,4 +195,6 @@ static inline struct llist_node *llist_del_all(struct llist_head *head)
 
 extern struct llist_node *llist_del_first(struct llist_head *head);
 
+struct llist_node *llist_reverse_order(struct llist_node *head);
+
 #endif /* LLIST_H */
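
Note: llist_del_all() returns the entries newest-first (nodes are
pushed at the head); the new llist_reverse_order() flips a detached
snapshot so it can be processed in insertion order. A hypothetical
drain loop:

    #include <linux/llist.h>

    static void demo_drain_fifo(struct llist_head *head)
    {
            struct llist_node *n = llist_del_all(head);     /* LIFO */

            n = llist_reverse_order(n);             /* oldest first */
            while (n) {
                    struct llist_node *next = n->next;

                    /* ... process the entry embedding n ... */
                    n = next;
            }
    }
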
index 13dfd36..c8929c3 100644 (file)
  */
 
 #include <linux/spinlock.h>
+#include <generated/bounds.h>
+
+#define USE_CMPXCHG_LOCKREF \
+       (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \
+        IS_ENABLED(CONFIG_SMP) && !BLOATED_SPINLOCKS)
 
 struct lockref {
        union {
-#ifdef CONFIG_CMPXCHG_LOCKREF
+#if USE_CMPXCHG_LOCKREF
                aligned_u64 lock_count;
 #endif
                struct {
index 42a35d9..0548eb2 100644 (file)
@@ -1316,32 +1316,85 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
-#if USE_SPLIT_PTLOCKS
-/*
- * We tuck a spinlock to guard each pagetable page into its struct page,
- * at page->private, with BUILD_BUG_ON to make sure that this will not
- * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
- * When freeing, reset page->mapping so free_pages_check won't complain.
- */
-#define __pte_lockptr(page)    &((page)->ptl)
-#define pte_lock_init(_page)   do {                                    \
-       spin_lock_init(__pte_lockptr(_page));                           \
-} while (0)
-#define pte_lock_deinit(page)  ((page)->mapping = NULL)
-#define pte_lockptr(mm, pmd)   ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
-#else  /* !USE_SPLIT_PTLOCKS */
+#if USE_SPLIT_PTE_PTLOCKS
+#if BLOATED_SPINLOCKS
+void __init ptlock_cache_init(void);
+extern bool ptlock_alloc(struct page *page);
+extern void ptlock_free(struct page *page);
+
+static inline spinlock_t *ptlock_ptr(struct page *page)
+{
+       return page->ptl;
+}
+#else /* BLOATED_SPINLOCKS */
+static inline void ptlock_cache_init(void) {}
+static inline bool ptlock_alloc(struct page *page)
+{
+       return true;
+}
+
+static inline void ptlock_free(struct page *page)
+{
+}
+
+static inline spinlock_t *ptlock_ptr(struct page *page)
+{
+       return &page->ptl;
+}
+#endif /* BLOATED_SPINLOCKS */
+
+static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
+{
+       return ptlock_ptr(pmd_page(*pmd));
+}
+
+static inline bool ptlock_init(struct page *page)
+{
+       /*
+        * prep_new_page() initializes page->private (and therefore page->ptl)
+        * with 0. Make sure nobody took it into use in between.
+        *
+        * That can happen if an arch tries to use slab for page table
+        * allocation: slab code uses page->slab_cache and page->first_page
+        * (for tail pages), which share storage with page->ptl.
+        */
+       VM_BUG_ON(*(unsigned long *)&page->ptl);
+       if (!ptlock_alloc(page))
+               return false;
+       spin_lock_init(ptlock_ptr(page));
+       return true;
+}
+
+/* Reset page->mapping so free_pages_check won't complain. */
+static inline void pte_lock_deinit(struct page *page)
+{
+       page->mapping = NULL;
+       ptlock_free(page);
+}
+
+#else  /* !USE_SPLIT_PTE_PTLOCKS */
 /*
  * We use mm->page_table_lock to guard all pagetable pages of the mm.
  */
-#define pte_lock_init(page)    do {} while (0)
-#define pte_lock_deinit(page)  do {} while (0)
-#define pte_lockptr(mm, pmd)   ({(void)(pmd); &(mm)->page_table_lock;})
-#endif /* USE_SPLIT_PTLOCKS */
+static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
+{
+       return &mm->page_table_lock;
+}
+static inline void ptlock_cache_init(void) {}
+static inline bool ptlock_init(struct page *page) { return true; }
+static inline void pte_lock_deinit(struct page *page) {}
+#endif /* USE_SPLIT_PTE_PTLOCKS */
+
+static inline void pgtable_init(void)
+{
+       ptlock_cache_init();
+       pgtable_cache_init();
+}
 
-static inline void pgtable_page_ctor(struct page *page)
+static inline bool pgtable_page_ctor(struct page *page)
 {
-       pte_lock_init(page);
        inc_zone_page_state(page, NR_PAGETABLE);
+       return ptlock_init(page);
 }
 
 static inline void pgtable_page_dtor(struct page *page)
@@ -1378,6 +1431,52 @@ static inline void pgtable_page_dtor(struct page *page)
        ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
                NULL: pte_offset_kernel(pmd, address))
 
+#if USE_SPLIT_PMD_PTLOCKS
+
+static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
+{
+       return ptlock_ptr(virt_to_page(pmd));
+}
+
+static inline bool pgtable_pmd_page_ctor(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       page->pmd_huge_pte = NULL;
+#endif
+       return ptlock_init(page);
+}
+
+static inline void pgtable_pmd_page_dtor(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       VM_BUG_ON(page->pmd_huge_pte);
+#endif
+       ptlock_free(page);
+}
+
+#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte)
+
+#else
+
+static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
+{
+       return &mm->page_table_lock;
+}
+
+static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; }
+static inline void pgtable_pmd_page_dtor(struct page *page) {}
+
+#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
+
+#endif
+
+static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
+{
+       spinlock_t *ptl = pmd_lockptr(mm, pmd);
+       spin_lock(ptl);
+       return ptl;
+}
+
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
                unsigned long zone_start_pfn, unsigned long *zholes_size);
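
Note: pmd_lock(), added above, resolves the correct spinlock for a pmd
(the split per-page lock when USE_SPLIT_PMD_PTLOCKS, otherwise
mm->page_table_lock) and acquires it, so callers never name the lock
directly. A hypothetical caller:

    static void demo_update_pmd(struct mm_struct *mm, pmd_t *pmd)
    {
            spinlock_t *ptl = pmd_lock(mm, pmd);

            /* ... modify the pmd entry under the proper lock ... */
            spin_unlock(ptl);
    }
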
index a3198e5..10f5a72 100644 (file)
@@ -23,7 +23,9 @@
 
 struct address_space;
 
-#define USE_SPLIT_PTLOCKS      (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
+#define USE_SPLIT_PTE_PTLOCKS  (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
+#define USE_SPLIT_PMD_PTLOCKS  (USE_SPLIT_PTE_PTLOCKS && \
+               IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
 
 /*
  * Each physical page in the system has a struct page associated with
@@ -63,6 +65,9 @@ struct page {
                                                 * this page is only used to
                                                 * free other pages.
                                                 */
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
+               pgtable_t pmd_huge_pte; /* protected by page->ptl */
+#endif
                };
 
                union {
@@ -141,8 +146,12 @@ struct page {
                                                 * indicates order in the buddy
                                                 * system if PG_buddy is set.
                                                 */
-#if USE_SPLIT_PTLOCKS
+#if USE_SPLIT_PTE_PTLOCKS
+#if BLOATED_SPINLOCKS
+               spinlock_t *ptl;
+#else
                spinlock_t ptl;
+#endif
 #endif
                struct kmem_cache *slab_cache;  /* SL[AU]B: Pointer to slab */
                struct page *first_page;        /* Compound tail pages */
@@ -309,14 +318,14 @@ enum {
        NR_MM_COUNTERS
 };
 
-#if USE_SPLIT_PTLOCKS && defined(CONFIG_MMU)
+#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
 #define SPLIT_RSS_COUNTING
 /* per-thread cached information, */
 struct task_rss_stat {
        int events;     /* for synchronization threshold */
        int count[NR_MM_COUNTERS];
 };
-#endif /* USE_SPLIT_PTLOCKS */
+#endif /* USE_SPLIT_PTE_PTLOCKS */
 
 struct mm_rss_stat {
        atomic_long_t count[NR_MM_COUNTERS];
@@ -339,6 +348,7 @@ struct mm_struct {
        pgd_t * pgd;
        atomic_t mm_users;                      /* How many users with user space? */
        atomic_t mm_count;                      /* How many references to "struct mm_struct" (users count as 1) */
+       atomic_long_t nr_ptes;                  /* Page table pages */
        int map_count;                          /* number of VMAs */
 
        spinlock_t page_table_lock;             /* Protects page tables and some counters */
@@ -360,7 +370,6 @@ struct mm_struct {
        unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE */
        unsigned long stack_vm;         /* VM_GROWSUP/DOWN */
        unsigned long def_flags;
-       unsigned long nr_ptes;          /* Page table pages */
        unsigned long start_code, end_code, start_data, end_data;
        unsigned long start_brk, brk, start_stack;
        unsigned long arg_start, arg_end, env_start, env_end;
@@ -406,7 +415,7 @@ struct mm_struct {
 #ifdef CONFIG_MMU_NOTIFIER
        struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
index 05f2447..15cd6b1 100644 (file)
@@ -367,9 +367,6 @@ struct module
        /* What modules do I depend on? */
        struct list_head target_list;
 
-       /* Who is waiting for us to be unloaded */
-       struct task_struct *waiter;
-
        /* Destruction function. */
        void (*exit)(void);
 
index f7efc86..6f7ffa4 100644 (file)
@@ -286,6 +286,14 @@ static inline void lockup_detector_init(void)
 }
 #endif
 
+#ifdef CONFIG_DETECT_HUNG_TASK
+void reset_hung_task_detector(void);
+#else
+static inline void reset_hung_task_detector(void)
+{
+}
+#endif
+
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched                __attribute__((__section__(".sched.text")))
 
index 4e32edc..52e0097 100644 (file)
@@ -20,6 +20,7 @@ struct seq_file {
        size_t size;
        size_t from;
        size_t count;
+       size_t pad_until;
        loff_t index;
        loff_t read_pos;
        u64 version;
@@ -79,6 +80,20 @@ static inline void seq_commit(struct seq_file *m, int num)
        }
 }
 
+/**
+ * seq_setwidth - set padding width
+ * @m: the seq_file handle
+ * @size: the max number of bytes to pad.
+ *
+ * Call seq_setwidth() to set the maximum width, then call seq_printf() etc.,
+ * and finally call seq_pad() to pad the remaining bytes.
+ */
+static inline void seq_setwidth(struct seq_file *m, size_t size)
+{
+       m->pad_until = m->count + size;
+}
+void seq_pad(struct seq_file *m, char c);
+
 char *mangle_path(char *s, const char *p, const char *esc);
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
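
Note: this is the protocol that replaces the removed "%n plus manual
space arithmetic" pattern in the fs/proc hunks above. seq_setwidth(m,
21 - 1) sets a 20-byte pad target; seq_pad(m, ' ') space-fills up to it
and then appends one more ' ', yielding a 21-column field. A
hypothetical show routine:

    static int demo_show(struct seq_file *m, void *v)
    {
            seq_setwidth(m, 21 - 1);
            seq_printf(m, "%s%d", "ttyS", 0);       /* variable-width part */
            seq_pad(m, ' ');                        /* pad, then one ' ' */
            seq_puts(m, "-W- (4:64)\n");            /* fixed columns */
            return 0;
    }
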
index 731f523..5da22ee 100644 (file)
@@ -49,6 +49,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
                smp_call_func_t func, void *info, bool wait,
                gfp_t gfp_flags);
 
+void __smp_call_function_single(int cpuid, struct call_single_data *data,
+                               int wait);
+
 #ifdef CONFIG_SMP
 
 #include <linux/preempt.h>
@@ -95,9 +98,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait);
 void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait);
 
-void __smp_call_function_single(int cpuid, struct call_single_data *data,
-                               int wait);
-
 int smp_call_function_any(const struct cpumask *mask,
                          smp_call_func_t func, void *info, int wait);
 
@@ -106,14 +106,10 @@ void kick_all_cpus_sync(void);
 /*
  * Generic and arch helpers
  */
-#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 void __init call_function_init(void);
 void generic_smp_call_function_single_interrupt(void);
 #define generic_smp_call_function_interrupt \
        generic_smp_call_function_single_interrupt
-#else
-static inline void call_function_init(void) { }
-#endif
 
 /*
  * Mark the boot cpu "online" so that it can call console drivers in
@@ -155,12 +151,6 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
 
 static inline void kick_all_cpus_sync(void) {  }
 
-static inline void __smp_call_function_single(int cpuid,
-               struct call_single_data *data, int wait)
-{
-       on_each_cpu(data->func, data->info, wait);
-}
-
 #endif /* !SMP */
 
 /*
index c114614..9b058ee 100644 (file)
@@ -237,4 +237,18 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
        __srcu_read_unlock(sp, idx);
 }
 
+/**
+ * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
+ *
+ * Converts the preceding srcu_read_unlock into a two-way memory barrier.
+ *
+ * Call this after srcu_read_unlock() to guarantee that all memory operations
+ * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
+ * the preceding srcu_read_unlock.
+ */
+static inline void smp_mb__after_srcu_read_unlock(void)
+{
+       /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
+}
+
 #endif
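
Note: a usage sketch for the new barrier helper (the reader body is
hypothetical): call it immediately after srcu_read_unlock() when later
memory operations must not be reordered before the unlock.

    static void demo_srcu_reader(struct srcu_struct *sp)
    {
            int idx = srcu_read_lock(sp);

            /* ... read-side critical section ... */
            srcu_read_unlock(sp, idx);
            smp_mb__after_srcu_read_unlock();
            /* stores from here on are ordered after the unlock */
    }
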
index 8d4fa82..c0f7526 100644 (file)
@@ -139,7 +139,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry)
 
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address);
-extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte);
+extern void migration_entry_wait_huge(struct vm_area_struct *vma,
+               struct mm_struct *mm, pte_t *pte);
 #else
 
 #define make_migration_entry(page, write) swp_entry(0, 0)
@@ -151,8 +152,8 @@ static inline int is_migration_entry(swp_entry_t swp)
 static inline void make_migration_entry_read(swp_entry_t *entryp) { }
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                         unsigned long address) { }
-static inline void migration_entry_wait_huge(struct mm_struct *mm,
-                                       pte_t *pte) { }
+static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
+               struct mm_struct *mm, pte_t *pte) { }
 static inline int is_write_migration_entry(swp_entry_t entry)
 {
        return 0;
index 36d36cc..e4abb84 100644 (file)
@@ -51,11 +51,11 @@ int virtqueue_add_sgs(struct virtqueue *vq,
                      void *data,
                      gfp_t gfp);
 
-void virtqueue_kick(struct virtqueue *vq);
+bool virtqueue_kick(struct virtqueue *vq);
 
 bool virtqueue_kick_prepare(struct virtqueue *vq);
 
-void virtqueue_notify(struct virtqueue *vq);
+bool virtqueue_notify(struct virtqueue *vq);
 
 void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len);
 
@@ -73,6 +73,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *vq);
 
 unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
 
+bool virtqueue_is_broken(struct virtqueue *vq);
+
 /**
  * virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
index 29b9104..e8f8f71 100644 (file)
@@ -96,33 +96,6 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
        return test_bit(fbit, vdev->features);
 }
 
-/**
- * virtio_config_val - look for a feature and get a virtio config entry.
- * @vdev: the virtio device
- * @fbit: the feature bit
- * @offset: the type to search for.
- * @v: a pointer to the value to fill in.
- *
- * The return value is -ENOENT if the feature doesn't exist.  Otherwise
- * the config value is copied into whatever is pointed to by v. */
-#define virtio_config_val(vdev, fbit, offset, v) \
-       virtio_config_buf((vdev), (fbit), (offset), (v), sizeof(*v))
-
-#define virtio_config_val_len(vdev, fbit, offset, v, len) \
-       virtio_config_buf((vdev), (fbit), (offset), (v), (len))
-
-static inline int virtio_config_buf(struct virtio_device *vdev,
-                                   unsigned int fbit,
-                                   unsigned int offset,
-                                   void *buf, unsigned len)
-{
-       if (!virtio_has_feature(vdev, fbit))
-               return -ENOENT;
-
-       vdev->config->get(vdev, offset, buf, len);
-       return 0;
-}
-
 static inline
 struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
                                        vq_callback_t *c, const char *n)
@@ -162,5 +135,139 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
        return 0;
 }
 
+/* Config space accessors. */
+#define virtio_cread(vdev, structname, member, ptr)                    \
+       do {                                                            \
+               /* Must match the member's type, and be integer */      \
+               if (!typecheck(typeof((((structname*)0)->member)), *(ptr))) \
+                       (*ptr) = 1;                                     \
+                                                                       \
+               switch (sizeof(*ptr)) {                                 \
+               case 1:                                                 \
+                       *(ptr) = virtio_cread8(vdev,                    \
+                                              offsetof(structname, member)); \
+                       break;                                          \
+               case 2:                                                 \
+                       *(ptr) = virtio_cread16(vdev,                   \
+                                               offsetof(structname, member)); \
+                       break;                                          \
+               case 4:                                                 \
+                       *(ptr) = virtio_cread32(vdev,                   \
+                                               offsetof(structname, member)); \
+                       break;                                          \
+               case 8:                                                 \
+                       *(ptr) = virtio_cread64(vdev,                   \
+                                               offsetof(structname, member)); \
+                       break;                                          \
+               default:                                                \
+                       BUG();                                          \
+               }                                                       \
+       } while (0)
+
+/* Config space write accessors. */
+#define virtio_cwrite(vdev, structname, member, ptr)                   \
+       do {                                                            \
+               /* Must match the member's type, and be integer */      \
+               if (!typecheck(typeof((((structname*)0)->member)), *(ptr))) \
+                       BUG_ON((*ptr) == 1);                            \
+                                                                       \
+               switch (sizeof(*ptr)) {                                 \
+               case 1:                                                 \
+                       virtio_cwrite8(vdev,                            \
+                                      offsetof(structname, member),    \
+                                      *(ptr));                         \
+                       break;                                          \
+               case 2:                                                 \
+                       virtio_cwrite16(vdev,                           \
+                                       offsetof(structname, member),   \
+                                       *(ptr));                        \
+                       break;                                          \
+               case 4:                                                 \
+                       virtio_cwrite32(vdev,                           \
+                                       offsetof(structname, member),   \
+                                       *(ptr));                        \
+                       break;                                          \
+               case 8:                                                 \
+                       virtio_cwrite64(vdev,                           \
+                                       offsetof(structname, member),   \
+                                       *(ptr));                        \
+                       break;                                          \
+               default:                                                \
+                       BUG();                                          \
+               }                                                       \
+       } while (0)
+
+static inline u8 virtio_cread8(struct virtio_device *vdev, unsigned int offset)
+{
+       u8 ret;
+       vdev->config->get(vdev, offset, &ret, sizeof(ret));
+       return ret;
+}
+
+static inline void virtio_cread_bytes(struct virtio_device *vdev,
+                                     unsigned int offset,
+                                     void *buf, size_t len)
+{
+       vdev->config->get(vdev, offset, buf, len);
+}
+
+static inline void virtio_cwrite8(struct virtio_device *vdev,
+                                 unsigned int offset, u8 val)
+{
+       vdev->config->set(vdev, offset, &val, sizeof(val));
+}
+
+static inline u16 virtio_cread16(struct virtio_device *vdev,
+                                unsigned int offset)
+{
+       u16 ret;
+       vdev->config->get(vdev, offset, &ret, sizeof(ret));
+       return ret;
+}
+
+static inline void virtio_cwrite16(struct virtio_device *vdev,
+                                  unsigned int offset, u16 val)
+{
+       vdev->config->set(vdev, offset, &val, sizeof(val));
+}
+
+static inline u32 virtio_cread32(struct virtio_device *vdev,
+                                unsigned int offset)
+{
+       u32 ret;
+       vdev->config->get(vdev, offset, &ret, sizeof(ret));
+       return ret;
+}
+
+static inline void virtio_cwrite32(struct virtio_device *vdev,
+                                  unsigned int offset, u32 val)
+{
+       vdev->config->set(vdev, offset, &val, sizeof(val));
+}
+
+static inline u64 virtio_cread64(struct virtio_device *vdev,
+                                unsigned int offset)
+{
+       u64 ret;
+       vdev->config->get(vdev, offset, &ret, sizeof(ret));
+       return ret;
+}
+
+static inline void virtio_cwrite64(struct virtio_device *vdev,
+                                  unsigned int offset, u64 val)
+{
+       vdev->config->set(vdev, offset, &val, sizeof(val));
+}
+
+/* Conditional config space accessors. */
+#define virtio_cread_feature(vdev, fbit, structname, member, ptr)      \
+       ({                                                              \
+               int _r = 0;                                             \
+               if (!virtio_has_feature(vdev, fbit))                    \
+                       _r = -ENOENT;                                   \
+               else                                                    \
+                       virtio_cread((vdev), structname, member, ptr);  \
+               _r;                                                     \
+       })
 
 #endif /* _LINUX_VIRTIO_CONFIG_H */
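
A hedged usage sketch of the typed accessors; struct virtio_blk_config and VIRTIO_BLK_F_BLK_SIZE are existing in-tree names, but this probe fragment is illustrative:

	u64 capacity;
	u32 blk_size;
	int err;

	/* Typechecked read of a config space field. */
	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);

	/* Read blk_size only if the device offers the feature. */
	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
				   struct virtio_blk_config, blk_size,
				   &blk_size);
	if (err)
		blk_size = 512;		/* illustrative fallback */
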
index b300787..67e06fe 100644 (file)
@@ -71,7 +71,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
                                      struct virtio_device *vdev,
                                      bool weak_barriers,
                                      void *pages,
-                                     void (*notify)(struct virtqueue *vq),
+                                     bool (*notify)(struct virtqueue *vq),
                                      void (*callback)(struct virtqueue *vq),
                                      const char *name);
 void vring_del_virtqueue(struct virtqueue *vq);
diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h
new file mode 100644 (file)
index 0000000..a8f5c32
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+ * iommu trace points
+ *
+ * Copyright (C) 2013 Shuah Khan <shuah.kh@samsung.com>
+ *
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iommu
+
+#if !defined(_TRACE_IOMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IOMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/pci.h>
+
+struct device;
+
+DECLARE_EVENT_CLASS(iommu_group_event,
+
+       TP_PROTO(int group_id, struct device *dev),
+
+       TP_ARGS(group_id, dev),
+
+       TP_STRUCT__entry(
+               __field(int, gid)
+               __string(device, dev_name(dev))
+       ),
+
+       TP_fast_assign(
+               __entry->gid = group_id;
+               __assign_str(device, dev_name(dev));
+       ),
+
+       TP_printk("IOMMU: groupID=%d device=%s",
+                       __entry->gid, __get_str(device)
+       )
+);
+
+DEFINE_EVENT(iommu_group_event, add_device_to_group,
+
+       TP_PROTO(int group_id, struct device *dev),
+
+       TP_ARGS(group_id, dev)
+
+);
+
+DEFINE_EVENT(iommu_group_event, remove_device_from_group,
+
+       TP_PROTO(int group_id, struct device *dev),
+
+       TP_ARGS(group_id, dev)
+);
+
+DECLARE_EVENT_CLASS(iommu_device_event,
+
+       TP_PROTO(struct device *dev),
+
+       TP_ARGS(dev),
+
+       TP_STRUCT__entry(
+               __string(device, dev_name(dev))
+       ),
+
+       TP_fast_assign(
+               __assign_str(device, dev_name(dev));
+       ),
+
+       TP_printk("IOMMU: device=%s", __get_str(device)
+       )
+);
+
+DEFINE_EVENT(iommu_device_event, attach_device_to_domain,
+
+       TP_PROTO(struct device *dev),
+
+       TP_ARGS(dev)
+);
+
+DEFINE_EVENT(iommu_device_event, detach_device_from_domain,
+
+       TP_PROTO(struct device *dev),
+
+       TP_ARGS(dev)
+);
+
+DECLARE_EVENT_CLASS(iommu_map_unmap,
+
+       TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size),
+
+       TP_ARGS(iova, paddr, size),
+
+       TP_STRUCT__entry(
+               __field(u64, iova)
+               __field(u64, paddr)
+               __field(int, size)
+       ),
+
+       TP_fast_assign(
+               __entry->iova = iova;
+               __entry->paddr = paddr;
+               __entry->size = size;
+       ),
+
+       TP_printk("IOMMU: iova=0x%016llx paddr=0x%016llx size=0x%x",
+                       __entry->iova, __entry->paddr, __entry->size
+       )
+);
+
+DEFINE_EVENT(iommu_map_unmap, map,
+
+       TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size),
+
+       TP_ARGS(iova, paddr, size)
+);
+
+DEFINE_EVENT_PRINT(iommu_map_unmap, unmap,
+
+       TP_PROTO(unsigned long iova, phys_addr_t paddr, size_t size),
+
+       TP_ARGS(iova, paddr, size),
+
+       TP_printk("IOMMU: iova=0x%016llx size=0x%x",
+                       __entry->iova, __entry->size
+       )
+);
+
+DECLARE_EVENT_CLASS(iommu_error,
+
+       TP_PROTO(struct device *dev, unsigned long iova, int flags),
+
+       TP_ARGS(dev, iova, flags),
+
+       TP_STRUCT__entry(
+               __string(device, dev_name(dev))
+               __string(driver, dev_driver_string(dev))
+               __field(u64, iova)
+               __field(int, flags)
+       ),
+
+       TP_fast_assign(
+               __assign_str(device, dev_name(dev));
+               __assign_str(driver, dev_driver_string(dev));
+               __entry->iova = iova;
+               __entry->flags = flags;
+       ),
+
+       TP_printk("IOMMU:%s %s iova=0x%016llx flags=0x%04x",
+                       __get_str(driver), __get_str(device),
+                       __entry->iova, __entry->flags
+       )
+);
+
+DEFINE_EVENT(iommu_error, io_page_fault,
+
+       TP_PROTO(struct device *dev, unsigned long iova, int flags),
+
+       TP_ARGS(dev, iova, flags)
+);
+#endif /* _TRACE_IOMMU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
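
Each DEFINE_EVENT() above generates a trace_<name>() call matching its TP_PROTO. A hedged sketch of the call sites (the real hooks land in the IOMMU core; the variables here are illustrative):

	trace_add_device_to_group(group_id, dev);
	trace_attach_device_to_domain(dev);
	trace_map(iova, paddr, size);
	trace_io_page_fault(dev, iova, flags);
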
index 7005d11..131a0bd 100644 (file)
@@ -296,23 +296,21 @@ DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready,
 
 TRACE_EVENT(
        kvm_async_pf_completed,
-       TP_PROTO(unsigned long address, struct page *page, u64 gva),
-       TP_ARGS(address, page, gva),
+       TP_PROTO(unsigned long address, u64 gva),
+       TP_ARGS(address, gva),
 
        TP_STRUCT__entry(
                __field(unsigned long, address)
-               __field(pfn_t, pfn)
                __field(u64, gva)
                ),
 
        TP_fast_assign(
                __entry->address = address;
-               __entry->pfn = page ? page_to_pfn(page) : 0;
                __entry->gva = gva;
                ),
 
-       TP_printk("gva %#llx address %#lx pfn %#llx",  __entry->gva,
-                 __entry->address, __entry->pfn)
+       TP_printk("gva %#llx address %#lx",  __entry->gva,
+                 __entry->address)
 );
 
 #endif
diff --git a/include/trace/events/swiotlb.h b/include/trace/events/swiotlb.h
new file mode 100644 (file)
index 0000000..7ea4c5e
--- /dev/null
@@ -0,0 +1,46 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM swiotlb
+
+#if !defined(_TRACE_SWIOTLB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SWIOTLB_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(swiotlb_bounced,
+
+       TP_PROTO(struct device *dev,
+                dma_addr_t dev_addr,
+                size_t size,
+                int swiotlb_force),
+
+       TP_ARGS(dev, dev_addr, size, swiotlb_force),
+
+       TP_STRUCT__entry(
+               __string(       dev_name,       dev_name(dev)   )
+               __field(        u64,    dma_mask                )
+               __field(        dma_addr_t,     dev_addr        )
+               __field(        size_t, size                    )
+               __field(        int,    swiotlb_force           )
+       ),
+
+       TP_fast_assign(
+               __assign_str(dev_name, dev_name(dev));
+               __entry->dma_mask = (dev->dma_mask ? *dev->dma_mask : 0);
+               __entry->dev_addr = dev_addr;
+               __entry->size = size;
+               __entry->swiotlb_force = swiotlb_force;
+       ),
+
+       TP_printk("dev_name: %s dma_mask=%llx dev_addr=%llx "
+               "size=%zu %s",
+               __get_str(dev_name),
+               __entry->dma_mask,
+               (unsigned long long)__entry->dev_addr,
+               __entry->size,
+               __entry->swiotlb_force ? "swiotlb_force" : "" )
+);
+
+#endif /*  _TRACE_SWIOTLB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index 99c2533..902f124 100644 (file)
@@ -518,6 +518,10 @@ struct kvm_ppc_smmu_info {
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
 #define KVM_VM_S390_UCONTROL   1
 
+/* on ppc, 0 indicates the default, 1 forces HV and 2 forces PR */
+#define KVM_VM_PPC_HV 1
+#define KVM_VM_PPC_PR 2
+
 #define KVM_S390_SIE_PAGE_OFFSET 1
 
 /*
@@ -541,6 +545,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_TRACE_ENABLE          __KVM_DEPRECATED_MAIN_W_0x06
 #define KVM_TRACE_PAUSE           __KVM_DEPRECATED_MAIN_0x07
 #define KVM_TRACE_DISABLE         __KVM_DEPRECATED_MAIN_0x08
+#define KVM_GET_EMULATED_CPUID   _IOWR(KVMIO, 0x09, struct kvm_cpuid2)
 
 /*
  * Extension capability list.
@@ -668,6 +673,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_ARM_EL1_32BIT 93
 #define KVM_CAP_SPAPR_MULTITCE 94
+#define KVM_CAP_EXT_EMUL_CPUID 95
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -843,6 +849,10 @@ struct kvm_device_attr {
 #define KVM_DEV_TYPE_FSL_MPIC_20       1
 #define KVM_DEV_TYPE_FSL_MPIC_42       2
 #define KVM_DEV_TYPE_XICS              3
+#define KVM_DEV_TYPE_VFIO              4
+#define  KVM_DEV_VFIO_GROUP                    1
+#define   KVM_DEV_VFIO_GROUP_ADD                       1
+#define   KVM_DEV_VFIO_GROUP_DEL                       2
 
 /*
  * ioctls for VM fds
@@ -1012,6 +1022,7 @@ struct kvm_s390_ucas_mapping {
 /* VM is being stopped by host */
 #define KVM_KVMCLOCK_CTRL        _IO(KVMIO,   0xad)
 #define KVM_ARM_VCPU_INIT        _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
+#define KVM_ARM_PREFERRED_TARGET  _IOR(KVMIO,  0xaf, struct kvm_vcpu_init)
 #define KVM_GET_REG_LIST         _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
index 2944278..77c6031 100644 (file)
@@ -71,6 +71,6 @@
 #define USBDEVICE_SUPER_MAGIC  0x9fa2
 #define MTD_INODE_FS_MAGIC      0x11307854
 #define ANON_INODE_FS_MAGIC    0x09041934
-
+#define BTRFS_TEST_MAGIC       0x73727279
 
 #endif /* __LINUX_MAGIC_H__ */
index 7000bb1..42721d1 100644 (file)
@@ -231,6 +231,17 @@ struct physdev_get_free_pirq {
 #define XEN_PCI_DEV_VIRTFN             0x2
 #define XEN_PCI_DEV_PXM                0x4
 
+#define XEN_PCI_MMCFG_RESERVED         0x1
+
+#define PHYSDEVOP_pci_mmcfg_reserved    24
+struct physdev_pci_mmcfg_reserved {
+    uint64_t address;
+    uint16_t segment;
+    uint8_t start_bus;
+    uint8_t end_bus;
+    uint32_t flags;
+};
+
 #define PHYSDEVOP_pci_device_add        25
 struct physdev_pci_device_add {
     /* IN */
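
A hedged sketch of a caller reserving an MMCFG (ECAM) region via the new op; HYPERVISOR_physdev_op() is the existing hypercall wrapper, and the field values are illustrative:

	struct physdev_pci_mmcfg_reserved r = {
		.address   = mcfg_addr,		/* illustrative ECAM base */
		.segment   = seg,
		.start_bus = 0,
		.end_bus   = 0xff,
		.flags     = XEN_PCI_MMCFG_RESERVED,
	};
	int rc;

	rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
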
index de8bcc6..8b2eb93 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef __LINUX_SWIOTLB_XEN_H
 #define __LINUX_SWIOTLB_XEN_H
 
+#include <linux/dma-direction.h>
 #include <linux/swiotlb.h>
 
 extern int xen_swiotlb_init(int verbose, bool early);
@@ -55,4 +56,6 @@ xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
 extern int
 xen_swiotlb_dma_supported(struct device *hwdev, u64 mask);
 
+extern int
+xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask);
 #endif /* __LINUX_SWIOTLB_XEN_H */
index d6fe062..fb2ea8f 100644 (file)
@@ -19,10 +19,11 @@ void xen_arch_resume(void);
 int xen_setup_shutdown_event(void);
 
 extern unsigned long *xen_contiguous_bitmap;
-int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
-                               unsigned int address_bits);
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+                               unsigned int address_bits,
+                               dma_addr_t *dma_handle);
 
-void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
 
 struct vm_area_struct;
 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
index 6ad1a53..01573fd 100644 (file)
@@ -131,6 +131,8 @@ char __initdata boot_command_line[COMMAND_LINE_SIZE];
 char *saved_command_line;
 /* Command line for parameter parsing */
 static char *static_command_line;
+/* Command line for per-initcall parameter parsing */
+static char *initcall_command_line;
 
 static char *execute_command;
 static char *ramdisk_execute_command;
@@ -354,6 +356,7 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 static void __init setup_command_line(char *command_line)
 {
        saved_command_line = alloc_bootmem(strlen (boot_command_line)+1);
+       initcall_command_line = alloc_bootmem(strlen (boot_command_line)+1);
        static_command_line = alloc_bootmem(strlen (command_line)+1);
        strcpy (saved_command_line, boot_command_line);
        strcpy (static_command_line, command_line);
@@ -473,7 +476,7 @@ static void __init mm_init(void)
        mem_init();
        kmem_cache_init();
        percpu_init_late();
-       pgtable_cache_init();
+       pgtable_init();
        vmalloc_init();
 }
 
@@ -751,9 +754,9 @@ static void __init do_initcall_level(int level)
        extern const struct kernel_param __start___param[], __stop___param[];
        initcall_t *fn;
 
-       strcpy(static_command_line, saved_command_line);
+       strcpy(initcall_command_line, saved_command_line);
        parse_args(initcall_level_names[level],
-                  static_command_line, __start___param,
+                  initcall_command_line, __start___param,
                   __stop___param - __start___param,
                   level, level,
                   &repair_env_string);
index 94fabd5..2a202a8 100644 (file)
@@ -55,4 +55,4 @@ config HZ
        default 1000 if HZ_1000
 
 config SCHED_HRTICK
-       def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+       def_bool HIGH_RES_TIMERS
index e8ca97b..578782e 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kbuild.h>
 #include <linux/page_cgroup.h>
 #include <linux/log2.h>
+#include <linux/spinlock.h>
 
 void foo(void)
 {
@@ -21,5 +22,6 @@ void foo(void)
 #ifdef CONFIG_SMP
        DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
 #endif
+       DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int));
        /* End of constants */
 }
index f6d11fc..728d5be 100644 (file)
@@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
        mm->flags = (current->mm) ?
                (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
        mm->core_state = NULL;
-       mm->nr_ptes = 0;
+       atomic_long_set(&mm->nr_ptes, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
        mm_init_aio(mm);
@@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm)
                                          "mm:%p idx:%d val:%ld\n", mm, i, x);
        }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        VM_BUG_ON(mm->pmd_huge_pte);
 #endif
 }
@@ -814,7 +814,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        memcpy(mm, oldmm, sizeof(*mm));
        mm_init_cpumask(mm);
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
 #endif
        if (!mm_init(mm, tsk))
index 8807061..9328b80 100644 (file)
@@ -207,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
        return ret;
 }
 
+static atomic_t reset_hung_task = ATOMIC_INIT(0);
+
+void reset_hung_task_detector(void)
+{
+       atomic_set(&reset_hung_task, 1);
+}
+EXPORT_SYMBOL_GPL(reset_hung_task_detector);
+
 /*
  * kthread which checks for tasks stuck in D state
  */
@@ -220,6 +228,9 @@ static int watchdog(void *dummy)
                while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
                        timeout = sysctl_hung_task_timeout_secs;
 
+               if (atomic_xchg(&reset_hung_task, 0))
+                       continue;
+
                check_hung_uninterruptible_tasks(timeout);
        }
 
index af5ebd2..f5a3b1e 100644 (file)
@@ -641,8 +641,6 @@ static int module_unload_init(struct module *mod)
 
        /* Hold reference count during initialization. */
        __this_cpu_write(mod->refptr->incs, 1);
-       /* Backwards compatibility macros put refcount during init. */
-       mod->waiter = current;
 
        return 0;
 }
@@ -768,16 +766,9 @@ static int __try_stop_module(void *_sref)
 
 static int try_stop_module(struct module *mod, int flags, int *forced)
 {
-       if (flags & O_NONBLOCK) {
-               struct stopref sref = { mod, flags, forced };
+       struct stopref sref = { mod, flags, forced };
 
-               return stop_machine(__try_stop_module, &sref, NULL);
-       } else {
-               /* We don't need to stop the machine for this. */
-               mod->state = MODULE_STATE_GOING;
-               synchronize_sched();
-               return 0;
-       }
+       return stop_machine(__try_stop_module, &sref, NULL);
 }
 
 unsigned long module_refcount(struct module *mod)
@@ -810,21 +801,6 @@ EXPORT_SYMBOL(module_refcount);
 /* This exists whether we can unload or not */
 static void free_module(struct module *mod);
 
-static void wait_for_zero_refcount(struct module *mod)
-{
-       /* Since we might sleep for some time, release the mutex first */
-       mutex_unlock(&module_mutex);
-       for (;;) {
-               pr_debug("Looking at refcount...\n");
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               if (module_refcount(mod) == 0)
-                       break;
-               schedule();
-       }
-       current->state = TASK_RUNNING;
-       mutex_lock(&module_mutex);
-}
-
 SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                unsigned int, flags)
 {
@@ -839,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                return -EFAULT;
        name[MODULE_NAME_LEN-1] = '\0';
 
+       if (!(flags & O_NONBLOCK)) {
+               printk(KERN_WARNING
+                      "waiting module removal not supported: please upgrade\n");
+       }
+
        if (mutex_lock_interruptible(&module_mutex) != 0)
                return -EINTR;
 
@@ -856,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 
        /* Doing init or already dying? */
        if (mod->state != MODULE_STATE_LIVE) {
-               /* FIXME: if (force), slam module count and wake up
-                   waiter --RR */
+               /* FIXME: if (force), slam module count damn the torpedoes */
                pr_debug("%s already dying\n", mod->name);
                ret = -EBUSY;
                goto out;
@@ -873,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                }
        }
 
-       /* Set this up before setting mod->state */
-       mod->waiter = current;
-
        /* Stop the machine so refcounts can't move and disable module. */
        ret = try_stop_module(mod, flags, &forced);
        if (ret != 0)
                goto out;
 
-       /* Never wait if forced. */
-       if (!forced && module_refcount(mod) != 0)
-               wait_for_zero_refcount(mod);
-
        mutex_unlock(&module_mutex);
        /* Final destruction now no one is using it. */
        if (mod->exit != NULL)
@@ -1002,9 +975,6 @@ void module_put(struct module *module)
                __this_cpu_inc(module->refptr->decs);
 
                trace_module_put(module, _RET_IP_);
-               /* Maybe they're waiting for us to drop reference? */
-               if (unlikely(!module_is_live(module)))
-                       wake_up_process(module->waiter);
                preempt_enable();
        }
 }
@@ -2728,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
        return 0;
 }
 
-static void find_module_sections(struct module *mod, struct load_info *info)
+static int find_module_sections(struct module *mod, struct load_info *info)
 {
        mod->kp = section_objs(info, "__param",
                               sizeof(*mod->kp), &mod->num_kp);
@@ -2758,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
 #ifdef CONFIG_CONSTRUCTORS
        mod->ctors = section_objs(info, ".ctors",
                                  sizeof(*mod->ctors), &mod->num_ctors);
+       if (!mod->ctors)
+               mod->ctors = section_objs(info, ".init_array",
+                               sizeof(*mod->ctors), &mod->num_ctors);
+       else if (find_sec(info, ".init_array")) {
+               /*
+                * This shouldn't happen with same compiler and binutils
+                * building all parts of the module.
+                */
+               printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
+                      mod->name);
+               return -EINVAL;
+       }
 #endif
 
 #ifdef CONFIG_TRACEPOINTS
@@ -2795,6 +2777,8 @@ static void find_module_sections(struct module *mod, struct load_info *info)
 
        info->debug = section_objs(info, "__verbose",
                                   sizeof(*info->debug), &info->num_debug);
+
+       return 0;
 }
 
 static int move_module(struct module *mod, struct load_info *info)
@@ -3248,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
        /* Now we've got everything in the final locations, we can
         * find optional sections. */
-       find_module_sections(mod, info);
+       err = find_module_sections(mod, info);
+       if (err)
+               goto free_unload;
 
        err = check_module_license_and_versions(mod);
        if (err)
index 4611610..bd9f940 100644 (file)
@@ -15,7 +15,6 @@
 
 #include "smpboot.h"
 
-#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 enum {
        CSD_FLAG_LOCK           = 0x01,
        CSD_FLAG_WAIT           = 0x02,
@@ -140,8 +139,7 @@ static void csd_unlock(struct call_single_data *csd)
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static
-void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
+static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
 {
        struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
        unsigned long flags;
@@ -464,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
        return 0;
 }
 EXPORT_SYMBOL(smp_call_function);
-#endif /* USE_GENERIC_SMP_HELPERS */
 
 /* Setup configured maximum number of CPUs to activate */
 unsigned int setup_max_cpus = NR_CPUS;
index b249883..11025cc 100644 (file)
@@ -6,8 +6,6 @@
  *     Distribute under GPLv2.
  *
  *     Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
- *
- *     Remote softirq infrastructure is by Jens Axboe.
  */
 
 #include <linux/export.h>
@@ -627,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
 }
 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
 
-/*
- * Remote softirq bits
- */
-
-DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
-EXPORT_PER_CPU_SYMBOL(softirq_work_list);
-
-static void __local_trigger(struct call_single_data *cp, int softirq)
-{
-       struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
-
-       list_add_tail(&cp->list, head);
-
-       /* Trigger the softirq only if the list was previously empty.  */
-       if (head->next == &cp->list)
-               raise_softirq_irqoff(softirq);
-}
-
-#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static void remote_softirq_receive(void *data)
-{
-       struct call_single_data *cp = data;
-       unsigned long flags;
-       int softirq;
-
-       softirq = *(int *)cp->info;
-       local_irq_save(flags);
-       __local_trigger(cp, softirq);
-       local_irq_restore(flags);
-}
-
-static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
-{
-       if (cpu_online(cpu)) {
-               cp->func = remote_softirq_receive;
-               cp->info = &softirq;
-               cp->flags = 0;
-
-               __smp_call_function_single(cpu, cp, 0);
-               return 0;
-       }
-       return 1;
-}
-#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
-static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
-{
-       return 1;
-}
-#endif
-
-/**
- * __send_remote_softirq - try to schedule softirq work on a remote cpu
- * @cp: private SMP call function data area
- * @cpu: the remote cpu
- * @this_cpu: the currently executing cpu
- * @softirq: the softirq for the work
- *
- * Attempt to schedule softirq work on a remote cpu.  If this cannot be
- * done, the work is instead queued up on the local cpu.
- *
- * Interrupts must be disabled.
- */
-void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
-{
-       if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
-               __local_trigger(cp, softirq);
-}
-EXPORT_SYMBOL(__send_remote_softirq);
-
-/**
- * send_remote_softirq - try to schedule softirq work on a remote cpu
- * @cp: private SMP call function data area
- * @cpu: the remote cpu
- * @softirq: the softirq for the work
- *
- * Like __send_remote_softirq except that disabling interrupts and
- * computing the current cpu is done for the caller.
- */
-void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
-{
-       unsigned long flags;
-       int this_cpu;
-
-       local_irq_save(flags);
-       this_cpu = smp_processor_id();
-       __send_remote_softirq(cp, cpu, this_cpu, softirq);
-       local_irq_restore(flags);
-}
-EXPORT_SYMBOL(send_remote_softirq);
-
-static int remote_softirq_cpu_notify(struct notifier_block *self,
-                                              unsigned long action, void *hcpu)
-{
-       /*
-        * If a CPU goes away, splice its entries to the current CPU
-        * and trigger a run of the softirq
-        */
-       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-               int cpu = (unsigned long) hcpu;
-               int i;
-
-               local_irq_disable();
-               for (i = 0; i < NR_SOFTIRQS; i++) {
-                       struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
-                       struct list_head *local_head;
-
-                       if (list_empty(head))
-                               continue;
-
-                       local_head = &__get_cpu_var(softirq_work_list[i]);
-                       list_splice_init(head, local_head);
-                       raise_softirq_irqoff(i);
-               }
-               local_irq_enable();
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block remote_softirq_cpu_notifier = {
-       .notifier_call  = remote_softirq_cpu_notify,
-};
-
 void __init softirq_init(void)
 {
        int cpu;
 
        for_each_possible_cpu(cpu) {
-               int i;
-
                per_cpu(tasklet_vec, cpu).tail =
                        &per_cpu(tasklet_vec, cpu).head;
                per_cpu(tasklet_hi_vec, cpu).tail =
                        &per_cpu(tasklet_hi_vec, cpu).head;
-               for (i = 0; i < NR_SOFTIRQS; i++)
-                       INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
        }
 
-       register_hotcpu_notifier(&remote_softirq_cpu_notifier);
-
        open_softirq(TASKLET_SOFTIRQ, tasklet_action);
        open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
index 630d72b..509403e 100644 (file)
@@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
+void __smp_call_function_single(int cpu, struct call_single_data *csd,
+                               int wait)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       csd->func(csd->info);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__smp_call_function_single);
+
 int on_each_cpu(smp_call_func_t func, void *info, int wait)
 {
        unsigned long flags;
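
On UP the new stub simply runs the callback locally with interrupts disabled, so callers keep a single code path across SMP and UP. A hedged sketch (my_func and my_data are illustrative):

	struct call_single_data csd = {
		.func = my_func,	/* illustrative handler */
		.info = &my_data,	/* illustrative argument */
	};

	__smp_call_function_single(0, &csd, 0);	/* runs my_func(&my_data) */
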
index 75485e1..06dc742 100644 (file)
@@ -51,13 +51,6 @@ config PERCPU_RWSEM
 config ARCH_USE_CMPXCHG_LOCKREF
        bool
 
-config CMPXCHG_LOCKREF
-       def_bool y if ARCH_USE_CMPXCHG_LOCKREF
-       depends on SMP
-       depends on !GENERIC_LOCKBREAK
-       depends on !DEBUG_SPINLOCK
-       depends on !DEBUG_LOCK_ALLOC
-
 config CRC_CCITT
        tristate "CRC-CCITT functions"
        help
index 7b7f830..d79b9d2 100644 (file)
@@ -215,7 +215,7 @@ static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
         * incrementing the fifo->in index counter
         */
        smp_wmb();
-       *copied = len - ret;
+       *copied = len - ret * esize;
        /* return the number of elements which are not copied */
        return ret;
 }
@@ -275,7 +275,7 @@ static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
         * incrementing the fifo->out index counter
         */
        smp_wmb();
-       *copied = len - ret;
+       *copied = len - ret * esize;
        /* return the number of elements which are not copied */
        return ret;
 }
index 4a70d12..f76196d 100644 (file)
@@ -81,3 +81,25 @@ struct llist_node *llist_del_first(struct llist_head *head)
        return entry;
 }
 EXPORT_SYMBOL_GPL(llist_del_first);
+
+/**
+ * llist_reverse_order - reverse order of a llist chain
+ * @head:      first item of the list to be reversed
+ *
+ * Reverse the order of a chain of llist entries and return the
+ * new first entry.
+ */
+struct llist_node *llist_reverse_order(struct llist_node *head)
+{
+       struct llist_node *new_head = NULL;
+
+       while (head) {
+               struct llist_node *tmp = head;
+               head = head->next;
+               tmp->next = new_head;
+               new_head = tmp;
+       }
+
+       return new_head;
+}
+EXPORT_SYMBOL_GPL(llist_reverse_order);
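
A hedged consumer sketch: llist_del_all() returns entries newest-first, so reversing restores submission order (my_list and struct my_item are illustrative):

	struct llist_node *node = llist_del_all(&my_list);

	node = llist_reverse_order(node);	/* oldest entry first */
	while (node) {
		struct my_item *item = llist_entry(node, struct my_item,
						   llnode);

		node = node->next;	/* fetch next before item is consumed */
		process(item);		/* illustrative consumer */
	}
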
index af6e95d..d2b123f 100644 (file)
@@ -1,7 +1,7 @@
 #include <linux/export.h>
 #include <linux/lockref.h>
 
-#ifdef CONFIG_CMPXCHG_LOCKREF
+#if USE_CMPXCHG_LOCKREF
 
 /*
  * Allow weakly-ordered memory architectures to provide barrier-less
index 4e8686c..e4399fa 100644 (file)
@@ -38,6 +38,9 @@
 #include <linux/bootmem.h>
 #include <linux/iommu-helper.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/swiotlb.h>
+
 #define OFFSET(val,align) ((unsigned long)     \
                           ( (val) & ( (align) - 1)))
 
@@ -502,6 +505,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 
 not_found:
        spin_unlock_irqrestore(&io_tlb_lock, flags);
+       dev_warn(hwdev, "swiotlb buffer is full\n");
        return SWIOTLB_MAP_ERROR;
 found:
        spin_unlock_irqrestore(&io_tlb_lock, flags);
@@ -726,6 +730,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
        if (dma_capable(dev, dev_addr, size) && !swiotlb_force)
                return dev_addr;
 
+       trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
+
        /* Oh well, have to allocate and map a bounce buffer. */
        map = map_single(dev, phys, size, dir);
        if (map == SWIOTLB_MAP_ERROR) {
index 48586ac..10909c5 100644 (file)
@@ -1712,18 +1712,16 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
                        break;
 
                case FORMAT_TYPE_NRCHARS: {
-                       u8 qualifier = spec.qualifier;
+                       /*
+                        * Since %n poses a greater security risk than
+                        * utility, ignore %n and skip its argument.
+                        */
+                       void *skip_arg;
 
-                       if (qualifier == 'l') {
-                               long *ip = va_arg(args, long *);
-                               *ip = (str - buf);
-                       } else if (_tolower(qualifier) == 'z') {
-                               size_t *ip = va_arg(args, size_t *);
-                               *ip = (str - buf);
-                       } else {
-                               int *ip = va_arg(args, int *);
-                               *ip = (str - buf);
-                       }
+                       WARN_ONCE(1, "Please remove ignored %%n in '%s'\n",
+                                       old_fmt);
+
+                       skip_arg = va_arg(args, void *);
                        break;
                }
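
To illustrate the behavioural change (a hedged sketch, not from the patch): a %n in a kernel format string now warns once, and its argument is consumed but never written:

	int pos = -1;
	char buf[16];

	snprintf(buf, sizeof(buf), "abc%n", &pos);
	/* buf contains "abc"; pos is still -1 because %n is ignored. */
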
 
index 3f4ffda..de31af2 100644 (file)
@@ -218,9 +218,11 @@ config SPLIT_PTLOCK_CPUS
        int
        default "999999" if ARM && !CPU_CACHE_VIPT
        default "999999" if PARISC && !PA20
-       default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
        default "4"
 
+config ARCH_ENABLE_SPLIT_PMD_PTLOCK
+       boolean
+
 #
 # support for memory balloon compaction
 config BALLOON_COMPACTION
index ae4846f..b7749a9 100644 (file)
@@ -1090,7 +1090,6 @@ static void shrink_readahead_size_eio(struct file *filp,
  * @filp:      the file to read
  * @ppos:      current file position
  * @desc:      read_descriptor
- * @actor:     read method
  *
  * This is a generic file read routine, and uses the
  * mapping->a_ops->readpage() function for the actual low-level stuff.
@@ -1099,7 +1098,7 @@ static void shrink_readahead_size_eio(struct file *filp,
  * of the logic when it comes to error handling etc.
  */
 static void do_generic_file_read(struct file *filp, loff_t *ppos,
-               read_descriptor_t *desc, read_actor_t actor)
+               read_descriptor_t *desc)
 {
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
@@ -1200,13 +1199,14 @@ page_ok:
                 * Ok, we have the page, and it's up-to-date, so
                 * now we can copy it to user space...
                 *
-                * The actor routine returns how many bytes were actually used..
+                * The file_read_actor routine returns how many bytes were
+                * actually used..
                 * NOTE! This may not be the same as how much of a user buffer
                 * we filled up (we may be padding etc), so we can only update
                 * "pos" here (the actor routine has to update the user buffer
                 * pointers and the remaining count).
                 */
-               ret = actor(desc, page, offset, nr);
+               ret = file_read_actor(desc, page, offset, nr);
                offset += ret;
                index += offset >> PAGE_CACHE_SHIFT;
                offset &= ~PAGE_CACHE_MASK;
@@ -1479,7 +1479,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                if (desc.count == 0)
                        continue;
                desc.error = 0;
-               do_generic_file_read(filp, ppos, &desc, file_read_actor);
+               do_generic_file_read(filp, ppos, &desc);
                retval += desc.written;
                if (desc.error) {
                        retval = retval ?: desc.error;
index 0556c6a..bccd5a6 100644 (file)
@@ -710,6 +710,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct page *page)
 {
        pgtable_t pgtable;
+       spinlock_t *ptl;
 
        VM_BUG_ON(!PageCompound(page));
        pgtable = pte_alloc_one(mm, haddr);
@@ -724,9 +725,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
         */
        __SetPageUptodate(page);
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_none(*pmd))) {
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
                mem_cgroup_uncharge_page(page);
                put_page(page);
                pte_free(mm, pgtable);
@@ -738,8 +739,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                set_pmd_at(mm, haddr, pmd, entry);
                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-               mm->nr_ptes++;
-               spin_unlock(&mm->page_table_lock);
+               atomic_long_inc(&mm->nr_ptes);
+               spin_unlock(ptl);
        }
 
        return 0;
@@ -759,6 +760,7 @@ static inline struct page *alloc_hugepage_vma(int defrag,
                               HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
+/* Caller must hold page table lock. */
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
                struct page *zero_page)
@@ -771,7 +773,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
        entry = pmd_mkhuge(entry);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
-       mm->nr_ptes++;
+       atomic_long_inc(&mm->nr_ptes);
        return true;
 }
 
@@ -790,6 +792,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                return VM_FAULT_OOM;
        if (!(flags & FAULT_FLAG_WRITE) &&
                        transparent_hugepage_use_zero_page()) {
+               spinlock_t *ptl;
                pgtable_t pgtable;
                struct page *zero_page;
                bool set;
@@ -802,10 +805,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        count_vm_event(THP_FAULT_FALLBACK);
                        return VM_FAULT_FALLBACK;
                }
-               spin_lock(&mm->page_table_lock);
+               ptl = pmd_lock(mm, pmd);
                set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
                                zero_page);
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
                if (!set) {
                        pte_free(mm, pgtable);
                        put_huge_zero_page();
@@ -838,6 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *vma)
 {
+       spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
        pmd_t pmd;
        pgtable_t pgtable;
@@ -848,8 +852,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        if (unlikely(!pgtable))
                goto out;
 
-       spin_lock(&dst_mm->page_table_lock);
-       spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+       dst_ptl = pmd_lock(dst_mm, dst_pmd);
+       src_ptl = pmd_lockptr(src_mm, src_pmd);
+       spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 
        ret = -EAGAIN;
        pmd = *src_pmd;
@@ -858,7 +863,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                goto out_unlock;
        }
        /*
-        * mm->page_table_lock is enough to be sure that huge zero pmd is not
+        * When page table lock is held, the huge zero pmd should not be
         * under splitting since we don't split the page itself, only pmd to
         * a page table.
         */
@@ -879,8 +884,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        }
        if (unlikely(pmd_trans_splitting(pmd))) {
                /* split huge page running from under us */
-               spin_unlock(&src_mm->page_table_lock);
-               spin_unlock(&dst_mm->page_table_lock);
+               spin_unlock(src_ptl);
+               spin_unlock(dst_ptl);
                pte_free(dst_mm, pgtable);
 
                wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
@@ -896,12 +901,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        pmd = pmd_mkold(pmd_wrprotect(pmd));
        pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-       dst_mm->nr_ptes++;
+       atomic_long_inc(&dst_mm->nr_ptes);
 
        ret = 0;
 out_unlock:
-       spin_unlock(&src_mm->page_table_lock);
-       spin_unlock(&dst_mm->page_table_lock);
+       spin_unlock(src_ptl);
+       spin_unlock(dst_ptl);
 out:
        return ret;
 }
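
The locking conversion in this file follows one pattern: take the per-pmd lock, which falls back to mm->page_table_lock when split PMD ptlocks are not enabled, instead of the global lock. A minimal hedged sketch:

	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);	/* per-pmd ptl, or page_table_lock */
	if (pmd_trans_huge(*pmd)) {
		/* ... operate on the huge pmd ... */
	}
	spin_unlock(ptl);
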
@@ -912,10 +917,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
                           pmd_t *pmd, pmd_t orig_pmd,
                           int dirty)
 {
+       spinlock_t *ptl;
        pmd_t entry;
        unsigned long haddr;
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
                goto unlock;
 
@@ -925,13 +931,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
                update_mmu_cache_pmd(vma, address, pmd);
 
 unlock:
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
 }
 
 static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
 {
+       spinlock_t *ptl;
        pgtable_t pgtable;
        pmd_t _pmd;
        struct page *page;
@@ -958,7 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
        mmun_end   = haddr + HPAGE_PMD_SIZE;
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
                goto out_free_page;
 
@@ -985,7 +992,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
        }
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        put_huge_zero_page();
        inc_mm_counter(mm, MM_ANONPAGES);
 
@@ -995,7 +1002,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 out:
        return ret;
 out_free_page:
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        mem_cgroup_uncharge_page(page);
        put_page(page);
@@ -1009,6 +1016,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct page *page,
                                        unsigned long haddr)
 {
+       spinlock_t *ptl;
        pgtable_t pgtable;
        pmd_t _pmd;
        int ret = 0, i;
@@ -1055,7 +1063,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        mmun_end   = haddr + HPAGE_PMD_SIZE;
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
                goto out_free_pages;
        VM_BUG_ON(!PageHead(page));
@@ -1081,7 +1089,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
        page_remove_rmap(page);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
 
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
@@ -1092,7 +1100,7 @@ out:
        return ret;
 
 out_free_pages:
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        mem_cgroup_uncharge_start();
        for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1107,17 +1115,19 @@ out_free_pages:
 int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
 {
+       spinlock_t *ptl;
        int ret = 0;
        struct page *page = NULL, *new_page;
        unsigned long haddr;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
 
+       ptl = pmd_lockptr(mm, pmd);
        VM_BUG_ON(!vma->anon_vma);
        haddr = address & HPAGE_PMD_MASK;
        if (is_huge_zero_pmd(orig_pmd))
                goto alloc;
-       spin_lock(&mm->page_table_lock);
+       spin_lock(ptl);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
                goto out_unlock;
 
@@ -1133,7 +1143,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_unlock;
        }
        get_page(page);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
 alloc:
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow())
@@ -1180,11 +1190,11 @@ alloc:
        mmun_end   = haddr + HPAGE_PMD_SIZE;
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
-       spin_lock(&mm->page_table_lock);
+       spin_lock(ptl);
        if (page)
                put_page(page);
        if (unlikely(!pmd_same(*pmd, orig_pmd))) {
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
                mem_cgroup_uncharge_page(new_page);
                put_page(new_page);
                goto out_mn;
@@ -1206,13 +1216,13 @@ alloc:
                }
                ret |= VM_FAULT_WRITE;
        }
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
 out_mn:
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out:
        return ret;
 out_unlock:
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        return ret;
 }
 
@@ -1224,7 +1234,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
        struct mm_struct *mm = vma->vm_mm;
        struct page *page = NULL;
 
-       assert_spin_locked(&mm->page_table_lock);
+       assert_spin_locked(pmd_lockptr(mm, pmd));
 
        if (flags & FOLL_WRITE && !pmd_write(*pmd))
                goto out;
@@ -1271,6 +1281,7 @@ out:
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
+       spinlock_t *ptl;
        struct anon_vma *anon_vma = NULL;
        struct page *page;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
@@ -1280,7 +1291,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        bool migrated = false;
        int flags = 0;
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmdp);
        if (unlikely(!pmd_same(pmd, *pmdp)))
                goto out_unlock;
 
@@ -1318,7 +1329,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * relock and check_same as the page may no longer be mapped.
                 * As the fault is being retried, do not account for it.
                 */
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
                wait_on_page_locked(page);
                page_nid = -1;
                goto out;
@@ -1326,13 +1337,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /* Page is misplaced, serialise migrations and parallel THP splits */
        get_page(page);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        if (!page_locked)
                lock_page(page);
        anon_vma = page_lock_anon_vma_read(page);
 
        /* Confirm the PMD did not change while page_table_lock was released */
-       spin_lock(&mm->page_table_lock);
+       spin_lock(ptl);
        if (unlikely(!pmd_same(pmd, *pmdp))) {
                unlock_page(page);
                put_page(page);
@@ -1344,7 +1355,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         * Migrate the THP to the requested node, returns with page unlocked
         * and pmd_numa cleared.
         */
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        migrated = migrate_misplaced_transhuge_page(mm, vma,
                                pmdp, pmd, addr, page, target_nid);
        if (migrated) {
@@ -1361,7 +1372,7 @@ clear_pmdnuma:
        update_mmu_cache_pmd(vma, addr, pmdp);
        unlock_page(page);
 out_unlock:
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
 
 out:
        if (anon_vma)
@@ -1376,9 +1387,10 @@ out:
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
 {
+       spinlock_t *ptl;
        int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                struct page *page;
                pgtable_t pgtable;
                pmd_t orig_pmd;
@@ -1392,8 +1404,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
                if (is_huge_zero_pmd(orig_pmd)) {
-                       tlb->mm->nr_ptes--;
-                       spin_unlock(&tlb->mm->page_table_lock);
+                       atomic_long_dec(&tlb->mm->nr_ptes);
+                       spin_unlock(ptl);
                        put_huge_zero_page();
                } else {
                        page = pmd_page(orig_pmd);
@@ -1401,8 +1413,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                        VM_BUG_ON(page_mapcount(page) < 0);
                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                        VM_BUG_ON(!PageHead(page));
-                       tlb->mm->nr_ptes--;
-                       spin_unlock(&tlb->mm->page_table_lock);
+                       atomic_long_dec(&tlb->mm->nr_ptes);
+                       spin_unlock(ptl);
                        tlb_remove_page(tlb, page);
                }
                pte_free(tlb->mm, pgtable);
@@ -1415,14 +1427,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end,
                unsigned char *vec)
 {
+       spinlock_t *ptl;
        int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                /*
                 * All logical pages in the range are present
                 * if backed by a huge page.
                 */
-               spin_unlock(&vma->vm_mm->page_table_lock);
+               spin_unlock(ptl);
                memset(vec, 1, (end - addr) >> PAGE_SHIFT);
                ret = 1;
        }
@@ -1435,6 +1448,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                  unsigned long new_addr, unsigned long old_end,
                  pmd_t *old_pmd, pmd_t *new_pmd)
 {
+       spinlock_t *old_ptl, *new_ptl;
        int ret = 0;
        pmd_t pmd;
 
@@ -1455,12 +1469,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                goto out;
        }
 
-       ret = __pmd_trans_huge_lock(old_pmd, vma);
+       /*
+        * We don't have to worry about the ordering of src and dst
+        * ptlocks because exclusive mmap_sem prevents deadlock.
+        */
+       ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
        if (ret == 1) {
+               new_ptl = pmd_lockptr(mm, new_pmd);
+               if (new_ptl != old_ptl)
+                       spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
                pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
                VM_BUG_ON(!pmd_none(*new_pmd));
                set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-               spin_unlock(&mm->page_table_lock);
+               if (new_ptl != old_ptl)
+                       spin_unlock(new_ptl);
+               spin_unlock(old_ptl);
        }
 out:
        return ret;
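
The hunk above is the one place in this series where two split-PMD locks are held at once. Both locks share a lock class, so lockdep needs the second acquisition annotated with spin_lock_nested(); a minimal sketch of the idiom, assuming old_ptl and new_ptl come from pmd_lockptr():

    old_ptl = pmd_lockptr(mm, old_pmd);
    spin_lock(old_ptl);
    new_ptl = pmd_lockptr(mm, new_pmd);
    if (new_ptl != old_ptl)         /* both pmds can map to the same lock */
            spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    /* ... move the entry from old_pmd to new_pmd ... */
    if (new_ptl != old_ptl)
            spin_unlock(new_ptl);
    spin_unlock(old_ptl);

As the added comment notes, the exclusive mmap_sem makes the acquisition order irrelevant here, so no address-based lock ordering is needed.
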
@@ -1476,9 +1499,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, pgprot_t newprot, int prot_numa)
 {
        struct mm_struct *mm = vma->vm_mm;
+       spinlock_t *ptl;
        int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                pmd_t entry;
                ret = 1;
                if (!prot_numa) {
@@ -1507,7 +1531,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                if (ret == HPAGE_PMD_NR)
                        set_pmd_at(mm, addr, pmd, entry);
 
-               spin_unlock(&vma->vm_mm->page_table_lock);
+               spin_unlock(ptl);
        }
 
        return ret;
@@ -1520,12 +1544,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  * Note that if it returns 1, this routine returns without unlocking page
  * table locks. So callers must unlock them.
  */
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+               spinlock_t **ptl)
 {
-       spin_lock(&vma->vm_mm->page_table_lock);
+       *ptl = pmd_lock(vma->vm_mm, pmd);
        if (likely(pmd_trans_huge(*pmd))) {
                if (unlikely(pmd_trans_splitting(*pmd))) {
-                       spin_unlock(&vma->vm_mm->page_table_lock);
+                       spin_unlock(*ptl);
                        wait_split_huge_page(vma->anon_vma, pmd);
                        return -1;
                } else {
@@ -1534,27 +1559,37 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
                        return 1;
                }
        }
-       spin_unlock(&vma->vm_mm->page_table_lock);
+       spin_unlock(*ptl);
        return 0;
 }
 
+/*
+ * This function returns whether a given @page is mapped onto the @address
+ * in the virtual space of @mm.
+ *
+ * When it's true, this function returns the pmd with the page table lock
+ * held, passing the lock back to the caller via @ptl.
+ * When it's false, it returns NULL without holding the page table lock.
+ */
 pmd_t *page_check_address_pmd(struct page *page,
                              struct mm_struct *mm,
                              unsigned long address,
-                             enum page_check_address_pmd_flag flag)
+                             enum page_check_address_pmd_flag flag,
+                             spinlock_t **ptl)
 {
-       pmd_t *pmd, *ret = NULL;
+       pmd_t *pmd;
 
        if (address & ~HPAGE_PMD_MASK)
-               goto out;
+               return NULL;
 
        pmd = mm_find_pmd(mm, address);
        if (!pmd)
-               goto out;
+               return NULL;
+       *ptl = pmd_lock(mm, pmd);
        if (pmd_none(*pmd))
-               goto out;
+               goto unlock;
        if (pmd_page(*pmd) != page)
-               goto out;
+               goto unlock;
        /*
         * split_vma() may create temporary aliased mappings. There is
         * no risk as long as all huge pmd are found and have their
@@ -1564,14 +1599,15 @@ pmd_t *page_check_address_pmd(struct page *page,
         */
        if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
            pmd_trans_splitting(*pmd))
-               goto out;
+               goto unlock;
        if (pmd_trans_huge(*pmd)) {
                VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
                          !pmd_trans_splitting(*pmd));
-               ret = pmd;
+               return pmd;
        }
-out:
-       return ret;
+unlock:
+       spin_unlock(*ptl);
+       return NULL;
 }
 
 static int __split_huge_page_splitting(struct page *page,
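
Both helpers above now hand their lock back to the caller instead of leaving it implicit in mm->page_table_lock. A minimal caller sketch of the new contracts (when __pmd_trans_huge_lock() returns 1, or page_check_address_pmd() returns non-NULL, *ptl is held and the caller must drop it):

    spinlock_t *ptl;

    if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
            /* *pmd is a stable huge pmd here */
            spin_unlock(ptl);
    }

    pmd = page_check_address_pmd(page, mm, address,
                                 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
    if (pmd) {
            /* page is mapped at address by this pmd; *ptl is held */
            spin_unlock(ptl);
    }
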
@@ -1579,6 +1615,7 @@ static int __split_huge_page_splitting(struct page *page,
                                       unsigned long address)
 {
        struct mm_struct *mm = vma->vm_mm;
+       spinlock_t *ptl;
        pmd_t *pmd;
        int ret = 0;
        /* For mmu_notifiers */
@@ -1586,9 +1623,8 @@ static int __split_huge_page_splitting(struct page *page,
        const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
 
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       spin_lock(&mm->page_table_lock);
        pmd = page_check_address_pmd(page, mm, address,
-                                    PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+                       PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
        if (pmd) {
                /*
                 * We can't temporarily set the pmd to null in order
@@ -1599,8 +1635,8 @@ static int __split_huge_page_splitting(struct page *page,
                 */
                pmdp_splitting_flush(vma, address, pmd);
                ret = 1;
+               spin_unlock(ptl);
        }
-       spin_unlock(&mm->page_table_lock);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
        return ret;
@@ -1731,14 +1767,14 @@ static int __split_huge_page_map(struct page *page,
                                 unsigned long address)
 {
        struct mm_struct *mm = vma->vm_mm;
+       spinlock_t *ptl;
        pmd_t *pmd, _pmd;
        int ret = 0, i;
        pgtable_t pgtable;
        unsigned long haddr;
 
-       spin_lock(&mm->page_table_lock);
        pmd = page_check_address_pmd(page, mm, address,
-                                    PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+                       PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
        if (pmd) {
                pgtable = pgtable_trans_huge_withdraw(mm, pmd);
                pmd_populate(mm, &_pmd, pgtable);
@@ -1793,8 +1829,8 @@ static int __split_huge_page_map(struct page *page,
                pmdp_invalidate(vma, address, pmd);
                pmd_populate(mm, pmd, pgtable);
                ret = 1;
+               spin_unlock(ptl);
        }
-       spin_unlock(&mm->page_table_lock);
 
        return ret;
 }
@@ -2346,7 +2382,7 @@ static void collapse_huge_page(struct mm_struct *mm,
        pte_t *pte;
        pgtable_t pgtable;
        struct page *new_page;
-       spinlock_t *ptl;
+       spinlock_t *pmd_ptl, *pte_ptl;
        int isolated;
        unsigned long hstart, hend;
        unsigned long mmun_start;       /* For mmu_notifiers */
@@ -2389,12 +2425,12 @@ static void collapse_huge_page(struct mm_struct *mm,
        anon_vma_lock_write(vma->anon_vma);
 
        pte = pte_offset_map(pmd, address);
-       ptl = pte_lockptr(mm, pmd);
+       pte_ptl = pte_lockptr(mm, pmd);
 
        mmun_start = address;
        mmun_end   = address + HPAGE_PMD_SIZE;
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       spin_lock(&mm->page_table_lock); /* probably unnecessary */
+       pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
        /*
         * After this gup_fast can't run anymore. This also removes
         * any huge TLB entry from the CPU so we won't allow
@@ -2402,16 +2438,16 @@ static void collapse_huge_page(struct mm_struct *mm,
         * to avoid the risk of CPU bugs in that area.
         */
        _pmd = pmdp_clear_flush(vma, address, pmd);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
-       spin_lock(ptl);
+       spin_lock(pte_ptl);
        isolated = __collapse_huge_page_isolate(vma, address, pte);
-       spin_unlock(ptl);
+       spin_unlock(pte_ptl);
 
        if (unlikely(!isolated)) {
                pte_unmap(pte);
-               spin_lock(&mm->page_table_lock);
+               spin_lock(pmd_ptl);
                BUG_ON(!pmd_none(*pmd));
                /*
                 * We can only use set_pmd_at when establishing
@@ -2419,7 +2455,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                 * points to regular pagetables. Use pmd_populate for that
                 */
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
                goto out;
        }
@@ -2430,7 +2466,7 @@ static void collapse_huge_page(struct mm_struct *mm,
         */
        anon_vma_unlock_write(vma->anon_vma);
 
-       __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+       __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
        pte_unmap(pte);
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
@@ -2445,13 +2481,13 @@ static void collapse_huge_page(struct mm_struct *mm,
         */
        smp_wmb();
 
-       spin_lock(&mm->page_table_lock);
+       spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
        page_add_new_anon_rmap(new_page, vma, address);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(pmd_ptl);
 
        *hpage = NULL;
 
@@ -2780,6 +2816,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd)
 {
+       spinlock_t *ptl;
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -2792,22 +2829,22 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
        mmun_end   = haddr + HPAGE_PMD_SIZE;
 again:
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_trans_huge(*pmd))) {
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
                return;
        }
        if (is_huge_zero_pmd(*pmd)) {
                __split_huge_zero_page_pmd(vma, haddr, pmd);
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
                return;
        }
        page = pmd_page(*pmd);
        VM_BUG_ON(!page_count(page));
        get_page(page);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
        split_huge_page(page);
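
All of these huge_memory.c conversions rest on two helpers from the split page table lock series. Their assumed shape (the real definitions live in include/linux/mm.h): pmd_lockptr() returns whichever spinlock guards the given pmd, the per-page-table-page lock when split PMD locks are configured and &mm->page_table_lock otherwise, while pmd_lock() acquires it:

    static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
    {
            spinlock_t *ptl = pmd_lockptr(mm, pmd);
            spin_lock(ptl);
            return ptl;
    }

Returning the lock pointer is what lets every unlock site above say spin_unlock(ptl) without caring which configuration is in effect.
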
index 0b7656e..7d57af2 100644
@@ -2376,6 +2376,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
        for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+               spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr);
                if (!src_pte)
                        continue;
@@ -2387,8 +2388,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                if (dst_pte == src_pte)
                        continue;
 
-               spin_lock(&dst->page_table_lock);
-               spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+               dst_ptl = huge_pte_lock(h, dst, dst_pte);
+               src_ptl = huge_pte_lockptr(h, src, src_pte);
+               spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                if (!huge_pte_none(huge_ptep_get(src_pte))) {
                        if (cow)
                                huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -2398,8 +2400,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                        page_dup_rmap(ptepage);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                }
-               spin_unlock(&src->page_table_lock);
-               spin_unlock(&dst->page_table_lock);
+               spin_unlock(src_ptl);
+               spin_unlock(dst_ptl);
        }
        return 0;
 
@@ -2442,6 +2444,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
+       spinlock_t *ptl;
        struct page *page;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
@@ -2455,25 +2458,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
        tlb_start_vma(tlb, vma);
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 again:
-       spin_lock(&mm->page_table_lock);
        for (address = start; address < end; address += sz) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;
 
+               ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, &address, ptep))
-                       continue;
+                       goto unlock;
 
                pte = huge_ptep_get(ptep);
                if (huge_pte_none(pte))
-                       continue;
+                       goto unlock;
 
                /*
                 * HWPoisoned hugepage is already unmapped and dropped reference
                 */
                if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
                        huge_pte_clear(mm, address, ptep);
-                       continue;
+                       goto unlock;
                }
 
                page = pte_page(pte);
@@ -2484,7 +2487,7 @@ again:
                 */
                if (ref_page) {
                        if (page != ref_page)
-                               continue;
+                               goto unlock;
 
                        /*
                         * Mark the VMA as having unmapped its page so that
@@ -2501,13 +2504,18 @@ again:
 
                page_remove_rmap(page);
                force_flush = !__tlb_remove_page(tlb, page);
-               if (force_flush)
+               if (force_flush) {
+                       spin_unlock(ptl);
                        break;
+               }
                /* Bail out after unmapping reference page if supplied */
-               if (ref_page)
+               if (ref_page) {
+                       spin_unlock(ptl);
                        break;
+               }
+unlock:
+               spin_unlock(ptl);
        }
-       spin_unlock(&mm->page_table_lock);
        /*
         * mmu_gather ran out of room to batch pages, we break out of
         * the PTE lock to avoid doing the potential expensive TLB invalidate
@@ -2613,7 +2621,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, pte_t pte,
-                       struct page *pagecache_page)
+                       struct page *pagecache_page, spinlock_t *ptl)
 {
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
@@ -2647,8 +2655,8 @@ retry_avoidcopy:
 
        page_cache_get(old_page);
 
-       /* Drop page_table_lock as buddy allocator may be called */
-       spin_unlock(&mm->page_table_lock);
+       /* Drop page table lock as buddy allocator may be called */
+       spin_unlock(ptl);
        new_page = alloc_huge_page(vma, address, outside_reserve);
 
        if (IS_ERR(new_page)) {
@@ -2666,13 +2674,13 @@ retry_avoidcopy:
                        BUG_ON(huge_pte_none(pte));
                        if (unmap_ref_private(mm, vma, old_page, address)) {
                                BUG_ON(huge_pte_none(pte));
-                               spin_lock(&mm->page_table_lock);
+                               spin_lock(ptl);
                                ptep = huge_pte_offset(mm, address & huge_page_mask(h));
                                if (likely(pte_same(huge_ptep_get(ptep), pte)))
                                        goto retry_avoidcopy;
                                /*
-                                * race occurs while re-acquiring page_table_lock, and
-                                * our job is done.
+                                * race occurs while re-acquiring page table
+                                * lock, and our job is done.
                                 */
                                return 0;
                        }
@@ -2680,7 +2688,7 @@ retry_avoidcopy:
                }
 
                /* Caller expects lock to be held */
-               spin_lock(&mm->page_table_lock);
+               spin_lock(ptl);
                if (err == -ENOMEM)
                        return VM_FAULT_OOM;
                else
@@ -2695,7 +2703,7 @@ retry_avoidcopy:
                page_cache_release(new_page);
                page_cache_release(old_page);
                /* Caller expects lock to be held */
-               spin_lock(&mm->page_table_lock);
+               spin_lock(ptl);
                return VM_FAULT_OOM;
        }
 
@@ -2707,10 +2715,10 @@ retry_avoidcopy:
        mmun_end = mmun_start + huge_page_size(h);
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        /*
-        * Retake the page_table_lock to check for racing updates
+        * Retake the page table lock to check for racing updates
         * before the page tables are altered
         */
-       spin_lock(&mm->page_table_lock);
+       spin_lock(ptl);
        ptep = huge_pte_offset(mm, address & huge_page_mask(h));
        if (likely(pte_same(huge_ptep_get(ptep), pte))) {
                ClearPagePrivate(new_page);
@@ -2724,13 +2732,13 @@ retry_avoidcopy:
                /* Make the old page be freed below */
                new_page = old_page;
        }
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        page_cache_release(new_page);
        page_cache_release(old_page);
 
        /* Caller expects lock to be held */
-       spin_lock(&mm->page_table_lock);
+       spin_lock(ptl);
        return 0;
 }
 
@@ -2778,6 +2786,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *page;
        struct address_space *mapping;
        pte_t new_pte;
+       spinlock_t *ptl;
 
        /*
         * Currently, we are forced to kill the process in the event the
@@ -2864,7 +2873,8 @@ retry:
                        goto backout_unlocked;
                }
 
-       spin_lock(&mm->page_table_lock);
+       ptl = huge_pte_lockptr(h, mm, ptep);
+       spin_lock(ptl);
        size = i_size_read(mapping->host) >> huge_page_shift(h);
        if (idx >= size)
                goto backout;
@@ -2885,16 +2895,16 @@ retry:
 
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
+               ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
        }
 
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        unlock_page(page);
 out:
        return ret;
 
 backout:
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
 backout_unlocked:
        unlock_page(page);
        put_page(page);
@@ -2906,6 +2916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        pte_t *ptep;
        pte_t entry;
+       spinlock_t *ptl;
        int ret;
        struct page *page = NULL;
        struct page *pagecache_page = NULL;
@@ -2918,7 +2929,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (ptep) {
                entry = huge_ptep_get(ptep);
                if (unlikely(is_hugetlb_entry_migration(entry))) {
-                       migration_entry_wait_huge(mm, ptep);
+                       migration_entry_wait_huge(vma, mm, ptep);
                        return 0;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                        return VM_FAULT_HWPOISON_LARGE |
@@ -2974,17 +2985,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (page != pagecache_page)
                lock_page(page);
 
-       spin_lock(&mm->page_table_lock);
+       ptl = huge_pte_lockptr(h, mm, ptep);
+       spin_lock(ptl);
        /* Check for a racing update before calling hugetlb_cow */
        if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
-               goto out_page_table_lock;
+               goto out_ptl;
 
 
        if (flags & FAULT_FLAG_WRITE) {
                if (!huge_pte_write(entry)) {
                        ret = hugetlb_cow(mm, vma, address, ptep, entry,
-                                                       pagecache_page);
-                       goto out_page_table_lock;
+                                       pagecache_page, ptl);
+                       goto out_ptl;
                }
                entry = huge_pte_mkdirty(entry);
        }
@@ -2993,8 +3005,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                                flags & FAULT_FLAG_WRITE))
                update_mmu_cache(vma, address, ptep);
 
-out_page_table_lock:
-       spin_unlock(&mm->page_table_lock);
+out_ptl:
+       spin_unlock(ptl);
 
        if (pagecache_page) {
                unlock_page(pagecache_page);
@@ -3020,9 +3032,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long remainder = *nr_pages;
        struct hstate *h = hstate_vma(vma);
 
-       spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
+               spinlock_t *ptl = NULL;
                int absent;
                struct page *page;
 
@@ -3030,8 +3042,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * Some archs (sparc64, sh*) have multiple pte_ts to
                 * each hugepage.  We have to make sure we get the
                 * first, for the page indexing below to work.
+                *
+                * Note that the page table lock is not held when pte is NULL.
                 */
                pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+               if (pte)
+                       ptl = huge_pte_lock(h, mm, pte);
                absent = !pte || huge_pte_none(huge_ptep_get(pte));
 
                /*
@@ -3043,6 +3059,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                if (absent && (flags & FOLL_DUMP) &&
                    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+                       if (pte)
+                               spin_unlock(ptl);
                        remainder = 0;
                        break;
                }
@@ -3062,10 +3080,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                      !huge_pte_write(huge_ptep_get(pte)))) {
                        int ret;
 
-                       spin_unlock(&mm->page_table_lock);
+                       if (pte)
+                               spin_unlock(ptl);
                        ret = hugetlb_fault(mm, vma, vaddr,
                                (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
-                       spin_lock(&mm->page_table_lock);
                        if (!(ret & VM_FAULT_ERROR))
                                continue;
 
@@ -3096,8 +3114,8 @@ same_page:
                         */
                        goto same_page;
                }
+               spin_unlock(ptl);
        }
-       spin_unlock(&mm->page_table_lock);
        *nr_pages = remainder;
        *position = vaddr;
 
@@ -3118,13 +3136,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
        flush_cache_range(vma, address, end);
 
        mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-       spin_lock(&mm->page_table_lock);
        for (; address < end; address += huge_page_size(h)) {
+               spinlock_t *ptl;
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;
+               ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, &address, ptep)) {
                        pages++;
+                       spin_unlock(ptl);
                        continue;
                }
                if (!huge_pte_none(huge_ptep_get(ptep))) {
@@ -3134,8 +3154,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                        set_huge_pte_at(mm, address, ptep, pte);
                        pages++;
                }
+               spin_unlock(ptl);
        }
-       spin_unlock(&mm->page_table_lock);
        /*
         * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
         * may have cleared our pud entry and done put_page on the page table:
@@ -3298,6 +3318,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        unsigned long saddr;
        pte_t *spte = NULL;
        pte_t *pte;
+       spinlock_t *ptl;
 
        if (!vma_shareable(vma, addr))
                return (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3320,13 +3341,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        if (!spte)
                goto out;
 
-       spin_lock(&mm->page_table_lock);
+       ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
+       spin_lock(ptl);
        if (pud_none(*pud))
                pud_populate(mm, pud,
                                (pmd_t *)((unsigned long)spte & PAGE_MASK));
        else
                put_page(virt_to_page(spte));
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
 out:
        pte = (pte_t *)pmd_alloc(mm, pud, addr);
        mutex_unlock(&mapping->i_mmap_mutex);
@@ -3340,7 +3362,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * called with vma->vm_mm->page_table_lock held.
+ * called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *         0 the underlying pte page is not shared, or it is the last user
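
The hugetlb conversions use the analogous huge-pte helpers. A sketch of the assumed relationship between the two (PMD-sized huge pages share the split pmd lock; larger page sizes still fall back to mm->page_table_lock):

    ptl = huge_pte_lockptr(h, mm, ptep);    /* which lock guards this huge pte */
    spin_lock(ptl);

    /* equivalent shorthand, as used in __unmap_hugepage_range() above: */
    ptl = huge_pte_lock(h, mm, ptep);
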
index e3cd40b..f1a0ae6 100644
@@ -6605,10 +6605,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (pmd_trans_huge_lock(pmd, vma) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
-               spin_unlock(&vma->vm_mm->page_table_lock);
+               spin_unlock(ptl);
                return 0;
        }
 
@@ -6797,9 +6797,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
         *    to be unlocked in __split_huge_page_splitting(), where the main
         *    part of thp split is not executed yet.
         */
-       if (pmd_trans_huge_lock(pmd, vma) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                if (mc.precharge < HPAGE_PMD_NR) {
-                       spin_unlock(&vma->vm_mm->page_table_lock);
+                       spin_unlock(ptl);
                        return 0;
                }
                target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6816,7 +6816,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                        }
                        put_page(page);
                }
-               spin_unlock(&vma->vm_mm->page_table_lock);
+               spin_unlock(ptl);
                return 0;
        }
 
index f9d78ec..b7c1716 100644
@@ -1269,7 +1269,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags)
 
        mf_cpu = &get_cpu_var(memory_failure_cpu);
        spin_lock_irqsave(&mf_cpu->lock, proc_flags);
-       if (kfifo_put(&mf_cpu->fifo, &entry))
+       if (kfifo_put(&mf_cpu->fifo, entry))
                schedule_work_on(smp_processor_id(), &mf_cpu->work);
        else
                pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
index bf86658..0409e8f 100644
@@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
        pgtable_t token = pmd_pgtable(*pmd);
        pmd_clear(pmd);
        pte_free_tlb(tlb, token, addr);
-       tlb->mm->nr_ptes--;
+       atomic_long_dec(&tlb->mm->nr_ptes);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -550,6 +550,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                pmd_t *pmd, unsigned long address)
 {
+       spinlock_t *ptl;
        pgtable_t new = pte_alloc_one(mm, address);
        int wait_split_huge_page;
        if (!new)
@@ -570,15 +571,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmd);
        wait_split_huge_page = 0;
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
-               mm->nr_ptes++;
+               atomic_long_inc(&mm->nr_ptes);
                pmd_populate(mm, pmd, new);
                new = NULL;
        } else if (unlikely(pmd_trans_splitting(*pmd)))
                wait_split_huge_page = 1;
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        if (new)
                pte_free(mm, new);
        if (wait_split_huge_page)
@@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
                        split_huge_page_pmd(vma, address, pmd);
                        goto split_fallthrough;
                }
-               spin_lock(&mm->page_table_lock);
+               ptl = pmd_lock(mm, pmd);
                if (likely(pmd_trans_huge(*pmd))) {
                        if (unlikely(pmd_trans_splitting(*pmd))) {
-                               spin_unlock(&mm->page_table_lock);
+                               spin_unlock(ptl);
                                wait_split_huge_page(vma->anon_vma, pmd);
                        } else {
                                page = follow_trans_huge_pmd(vma, address,
                                                             pmd, flags);
-                               spin_unlock(&mm->page_table_lock);
+                               spin_unlock(ptl);
                                *page_mask = HPAGE_PMD_NR - 1;
                                goto out;
                        }
                } else
-                       spin_unlock(&mm->page_table_lock);
+                       spin_unlock(ptl);
                /* fall through */
        }
 split_fallthrough:
@@ -4269,3 +4270,28 @@ void copy_user_huge_page(struct page *dst, struct page *src,
        }
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+
+#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
+static struct kmem_cache *page_ptl_cachep;
+void __init ptlock_cache_init(void)
+{
+       page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
+                       SLAB_PANIC, NULL);
+}
+
+bool ptlock_alloc(struct page *page)
+{
+       spinlock_t *ptl;
+
+       ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+       if (!ptl)
+               return false;
+       page->ptl = ptl;
+       return true;
+}
+
+void ptlock_free(struct page *page)
+{
+       kfree(page->ptl);
+}
+#endif
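
Note that ptlock_cache_init() sets up a dedicated slab cache but ptlock_alloc() above still uses plain kmalloc(). A natural refinement, sketched here and not part of this diff, would allocate from the cache so the "page->ptl" slab actually backs the locks:

    bool ptlock_alloc(struct page *page)
    {
            spinlock_t *ptl;

            ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
            if (!ptl)
                    return false;
            page->ptl = ptl;
            return true;
    }

    void ptlock_free(struct page *page)
    {
            kmem_cache_free(page_ptl_cachep, page->ptl);
    }

The dynamic allocation is only compiled in when BLOATED_SPINLOCKS is set, i.e. when a spinlock no longer fits in the struct page word and page->ptl becomes a pointer.
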
index 4cc19f6..c4403cd 100644
@@ -525,8 +525,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
 #ifdef CONFIG_HUGETLB_PAGE
        int nid;
        struct page *page;
+       spinlock_t *ptl;
 
-       spin_lock(&vma->vm_mm->page_table_lock);
+       ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
        page = pte_page(huge_ptep_get((pte_t *)pmd));
        nid = page_to_nid(page);
        if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -536,7 +537,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
                isolate_huge_page(page, private);
 unlock:
-       spin_unlock(&vma->vm_mm->page_table_lock);
+       spin_unlock(ptl);
 #else
        BUG();
 #endif
index dfc8300..316e720 100644
@@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                ptep = huge_pte_offset(mm, addr);
                if (!ptep)
                        goto out;
-               ptl = &mm->page_table_lock;
+               ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
        } else {
                pmd = mm_find_pmd(mm, addr);
                if (!pmd)
@@ -249,9 +249,10 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
        __migration_entry_wait(mm, ptep, ptl);
 }
 
-void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
+void migration_entry_wait_huge(struct vm_area_struct *vma,
+               struct mm_struct *mm, pte_t *pte)
 {
-       spinlock_t *ptl = &(mm)->page_table_lock;
+       spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
        __migration_entry_wait(mm, pte, ptl);
 }
 
@@ -1666,6 +1667,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                                unsigned long address,
                                struct page *page, int node)
 {
+       spinlock_t *ptl;
        unsigned long haddr = address & HPAGE_PMD_MASK;
        pg_data_t *pgdat = NODE_DATA(node);
        int isolated = 0;
@@ -1705,9 +1707,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        WARN_ON(PageLRU(new_page));
 
        /* Recheck the target PMD */
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_same(*pmd, entry))) {
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
 
                /* Reverse changes made by migrate_page_copy() */
                if (TestClearPageActive(new_page))
@@ -1752,7 +1754,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
         * before it's fully transferred to the new page.
         */
        mem_cgroup_end_migration(memcg, page, new_page, true);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
 
        unlock_page(new_page);
        unlock_page(page);
index 5a6badd..834b2d7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2724,7 +2724,8 @@ void exit_mmap(struct mm_struct *mm)
        }
        vm_unacct_memory(nr_accounted);
 
-       WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+       WARN_ON(atomic_long_read(&mm->nr_ptes) >
+                       (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
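
mm->nr_ptes is now an atomic_long_t, so page-table allocation and teardown can adjust it under the split locks without serialising on mm->page_table_lock. The access pattern across these hunks, for reference:

    atomic_long_inc(&mm->nr_ptes);              /* __pte_alloc() */
    atomic_long_dec(&mm->nr_ptes);              /* free_pte_range(), zap_huge_pmd() */
    long nr = atomic_long_read(&mm->nr_ptes);   /* exit_mmap(), oom_badness() */

Readers that print the field switch from %lu to %ld, since atomic_long_read() returns a signed long (see the dump_tasks() hunk below).
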
index 6738c47..1e4a600 100644
@@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
         * The baseline for the badness score is the proportion of RAM that each
         * task's rss, pagetable and swap space use.
         */
-       points = get_mm_rss(p->mm) + p->mm->nr_ptes +
+       points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
                 get_mm_counter(p->mm, MM_SWAPENTS);
        task_unlock(p);
 
@@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
                        continue;
                }
 
-               pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu         %5hd %s\n",
+               pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu         %5hd %s\n",
                        task->pid, from_kuid(&init_user_ns, task_uid(task)),
                        task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-                       task->mm->nr_ptes,
+                       atomic_long_read(&task->mm->nr_ptes),
                        get_mm_counter(task->mm, MM_SWAPENTS),
                        task->signal->oom_score_adj, task->comm);
                task_unlock(task);
index 3929a40..cbb3854 100644
@@ -151,14 +151,14 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
 {
-       assert_spin_locked(&mm->page_table_lock);
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
 
        /* FIFO */
-       if (!mm->pmd_huge_pte)
+       if (!pmd_huge_pte(mm, pmdp))
                INIT_LIST_HEAD(&pgtable->lru);
        else
-               list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
-       mm->pmd_huge_pte = pgtable;
+               list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
+       pmd_huge_pte(mm, pmdp) = pgtable;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
@@ -170,14 +170,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
        pgtable_t pgtable;
 
-       assert_spin_locked(&mm->page_table_lock);
+       assert_spin_locked(pmd_lockptr(mm, pmdp));
 
        /* FIFO */
-       pgtable = mm->pmd_huge_pte;
+       pgtable = pmd_huge_pte(mm, pmdp);
        if (list_empty(&pgtable->lru))
-               mm->pmd_huge_pte = NULL;
+               pmd_huge_pte(mm, pmdp) = NULL;
        else {
-               mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+               pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
                                              struct page, lru);
                list_del(&pgtable->lru);
        }
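
With per-pmd locks the deposited page-table FIFO can no longer live in a single mm-wide field, so pmd_huge_pte(mm, pmdp) abstracts its location. A sketch of the assumed definition (the real one is in include/linux/mm.h):

    #if USE_SPLIT_PMD_PTLOCKS
    #define pmd_huge_pte(mm, pmd)   (pmd_to_page(pmd)->pmd_huge_pte)
    #else
    #define pmd_huge_pte(mm, pmd)   ((mm)->pmd_huge_pte)
    #endif

Either way the list is protected by the lock pmd_lockptr() reports, which is exactly what the reworked assert_spin_locked() calls verify.
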
index fd3ee7a..55c8b8d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
 
        if (unlikely(PageHuge(page))) {
                pte = huge_pte_offset(mm, address);
-               ptl = &mm->page_table_lock;
+               ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
                goto check;
        }
 
@@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        unsigned long *vm_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
+       spinlock_t *ptl;
        int referenced = 0;
 
        if (unlikely(PageTransHuge(page))) {
                pmd_t *pmd;
 
-               spin_lock(&mm->page_table_lock);
                /*
                 * rmap might return false positives; we must filter
                 * these out using page_check_address_pmd().
                 */
                pmd = page_check_address_pmd(page, mm, address,
-                                            PAGE_CHECK_ADDRESS_PMD_FLAG);
-               if (!pmd) {
-                       spin_unlock(&mm->page_table_lock);
+                                            PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+               if (!pmd)
                        goto out;
-               }
 
                if (vma->vm_flags & VM_LOCKED) {
-                       spin_unlock(&mm->page_table_lock);
+                       spin_unlock(ptl);
                        *mapcount = 0;  /* break early from loop */
                        *vm_flags |= VM_LOCKED;
                        goto out;
@@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                /* go ahead even if the pmd is pmd_trans_splitting() */
                if (pmdp_clear_flush_young_notify(vma, address, pmd))
                        referenced++;
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
        } else {
                pte_t *pte;
-               spinlock_t *ptl;
 
                /*
                 * rmap might return false positives; we must filter
index 990afab..9c5a1aa 100644
@@ -544,9 +544,7 @@ static int p9_virtio_probe(struct virtio_device *vdev)
 
        chan->inuse = false;
        if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) {
-               vdev->config->get(vdev,
-                               offsetof(struct virtio_9p_config, tag_len),
-                               &tag_len, sizeof(tag_len));
+               virtio_cread(vdev, struct virtio_9p_config, tag_len, &tag_len);
        } else {
                err = -EINVAL;
                goto out_free_vq;
@@ -556,8 +554,9 @@ static int p9_virtio_probe(struct virtio_device *vdev)
                err = -ENOMEM;
                goto out_free_vq;
        }
-       vdev->config->get(vdev, offsetof(struct virtio_9p_config, tag),
-                       tag, tag_len);
+
+       virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag),
+                          tag, tag_len);
        chan->tag = tag;
        chan->tag_len = tag_len;
        err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
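
virtio_cread() is a typed convenience wrapper around the config-space get() hook; ignoring the endianness conversion the real helper also performs, the first call above expands to approximately this (a sketch, not the literal macro expansion):

    vdev->config->get(vdev, offsetof(struct virtio_9p_config, tag_len),
                      &tag_len, sizeof(tag_len));

virtio_cread_bytes() is the untyped variant kept for variable-length fields such as the tag string, which is why it still takes an explicit offset and length.
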
index ec9a9ef..5afeb5a 100644
@@ -2523,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
                        const struct fib_info *fi = fa->fa_info;
                        unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
-                       int len;
 
                        if (fa->fa_type == RTN_BROADCAST
                            || fa->fa_type == RTN_MULTICAST)
                                continue;
 
+                       seq_setwidth(seq, 127);
+
                        if (fi)
                                seq_printf(seq,
                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
-                                        "%d\t%08X\t%d\t%u\t%u%n",
+                                        "%d\t%08X\t%d\t%u\t%u",
                                         fi->fib_dev ? fi->fib_dev->name : "*",
                                         prefix,
                                         fi->fib_nh->nh_gw, flags, 0, 0,
@@ -2541,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
                                         (fi->fib_advmss ?
                                          fi->fib_advmss + 40 : 0),
                                         fi->fib_window,
-                                        fi->fib_rtt >> 3, &len);
+                                        fi->fib_rtt >> 3);
                        else
                                seq_printf(seq,
                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
-                                        "%d\t%08X\t%d\t%u\t%u%n",
+                                        "%d\t%08X\t%d\t%u\t%u",
                                         prefix, 0, flags, 0, 0, 0,
-                                        mask, 0, 0, 0, &len);
+                                        mask, 0, 0, 0);
 
-                       seq_printf(seq, "%*s\n", 127 - len, "");
+                       seq_pad(seq, '\n');
                }
        }
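
This hunk and the net/ hunks that follow all apply the same conversion: instead of counting emitted characters through the %n conversion and hand-padding with a "%*s" write, the show routine declares the target line width up front and pads once at the end. The pattern, in sketch form:

    seq_setwidth(seq, 127);        /* remember where the line starts; pad target 127 */
    seq_printf(seq, "...", ...);   /* ordinary output, no %n, no length out-param */
    seq_pad(seq, '\n');            /* space-fill to the declared width, then '\n' */

Besides dropping %n, which the kernel was phasing out of its printf implementation, this removes the int *len plumbing from every *_format_sock() helper.
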
 
index 9afbdb1..cbc85f6 100644
@@ -1076,7 +1076,7 @@ void ping_seq_stop(struct seq_file *seq, void *v)
 EXPORT_SYMBOL_GPL(ping_seq_stop);
 
 static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
-               int bucket, int *len)
+               int bucket)
 {
        struct inet_sock *inet = inet_sk(sp);
        __be32 dest = inet->inet_daddr;
@@ -1085,7 +1085,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
        __u16 srcp = ntohs(inet->inet_sport);
 
        seq_printf(f, "%5d: %08X:%04X %08X:%04X"
-               " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
+               " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
                bucket, src, srcp, dest, destp, sp->sk_state,
                sk_wmem_alloc_get(sp),
                sk_rmem_alloc_get(sp),
@@ -1093,23 +1093,22 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
                from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
                0, sock_i_ino(sp),
                atomic_read(&sp->sk_refcnt), sp,
-               atomic_read(&sp->sk_drops), len);
+               atomic_read(&sp->sk_drops));
 }
 
 static int ping_v4_seq_show(struct seq_file *seq, void *v)
 {
+       seq_setwidth(seq, 127);
        if (v == SEQ_START_TOKEN)
-               seq_printf(seq, "%-127s\n",
-                          "  sl  local_address rem_address   st tx_queue "
+               seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
                           "rx_queue tr tm->when retrnsmt   uid  timeout "
                           "inode ref pointer drops");
        else {
                struct ping_iter_state *state = seq->private;
-               int len;
 
-               ping_v4_format_sock(v, seq, state->bucket, &len);
-               seq_printf(seq, "%*s\n", 127 - len, "");
+               ping_v4_format_sock(v, seq, state->bucket);
        }
+       seq_pad(seq, '\n');
        return 0;
 }
 
index 14bba8a..59a6f8b 100644
@@ -2541,13 +2541,13 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
 EXPORT_SYMBOL(tcp_proc_unregister);
 
 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
-                        struct seq_file *f, int i, kuid_t uid, int *len)
+                        struct seq_file *f, int i, kuid_t uid)
 {
        const struct inet_request_sock *ireq = inet_rsk(req);
        long delta = req->expires - jiffies;
 
        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
-               " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
+               " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
                i,
                ireq->ir_loc_addr,
                ntohs(inet_sk(sk)->inet_sport),
@@ -2562,11 +2562,10 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
                0,  /* non standard timer */
                0, /* open_requests have no inode */
                atomic_read(&sk->sk_refcnt),
-               req,
-               len);
+               req);
 }
 
-static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 {
        int timer_active;
        unsigned long timer_expires;
@@ -2605,7 +2604,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
 
        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
-                       "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
+                       "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
                i, src, srcp, dest, destp, sk->sk_state,
                tp->write_seq - tp->snd_una,
                rx_queue,
@@ -2622,12 +2621,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
                tp->snd_cwnd,
                sk->sk_state == TCP_LISTEN ?
                    (fastopenq ? fastopenq->max_qlen : 0) :
-                   (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
-               len);
+                   (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
 }
 
 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
-                              struct seq_file *f, int i, int *len)
+                              struct seq_file *f, int i)
 {
        __be32 dest, src;
        __u16 destp, srcp;
@@ -2639,10 +2637,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
        srcp  = ntohs(tw->tw_sport);
 
        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
-               " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
+               " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
-               atomic_read(&tw->tw_refcnt), tw, len);
+               atomic_read(&tw->tw_refcnt), tw);
 }
 
 #define TMPSZ 150
@@ -2651,11 +2649,10 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 {
        struct tcp_iter_state *st;
        struct sock *sk = v;
-       int len;
 
+       seq_setwidth(seq, TMPSZ - 1);
        if (v == SEQ_START_TOKEN) {
-               seq_printf(seq, "%-*s\n", TMPSZ - 1,
-                          "  sl  local_address rem_address   st tx_queue "
+               seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
                           "rx_queue tr tm->when retrnsmt   uid  timeout "
                           "inode");
                goto out;
@@ -2666,16 +2663,16 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
        case TCP_SEQ_STATE_LISTENING:
        case TCP_SEQ_STATE_ESTABLISHED:
                if (sk->sk_state == TCP_TIME_WAIT)
-                       get_timewait4_sock(v, seq, st->num, &len);
+                       get_timewait4_sock(v, seq, st->num);
                else
-                       get_tcp4_sock(v, seq, st->num, &len);
+                       get_tcp4_sock(v, seq, st->num);
                break;
        case TCP_SEQ_STATE_OPENREQ:
-               get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
+               get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
                break;
        }
-       seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
 out:
+       seq_pad(seq, '\n');
        return 0;
 }
 
index 89909dd..de86e5b 100644
@@ -2331,7 +2331,7 @@ EXPORT_SYMBOL(udp_proc_unregister);
 
 /* ------------------------------------------------------------------------ */
 static void udp4_format_sock(struct sock *sp, struct seq_file *f,
-               int bucket, int *len)
+               int bucket)
 {
        struct inet_sock *inet = inet_sk(sp);
        __be32 dest = inet->inet_daddr;
@@ -2340,7 +2340,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
        __u16 srcp        = ntohs(inet->inet_sport);
 
        seq_printf(f, "%5d: %08X:%04X %08X:%04X"
-               " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
+               " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
                bucket, src, srcp, dest, destp, sp->sk_state,
                sk_wmem_alloc_get(sp),
                sk_rmem_alloc_get(sp),
@@ -2348,23 +2348,22 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
                from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
                0, sock_i_ino(sp),
                atomic_read(&sp->sk_refcnt), sp,
-               atomic_read(&sp->sk_drops), len);
+               atomic_read(&sp->sk_drops));
 }
 
 int udp4_seq_show(struct seq_file *seq, void *v)
 {
+       seq_setwidth(seq, 127);
        if (v == SEQ_START_TOKEN)
-               seq_printf(seq, "%-127s\n",
-                          "  sl  local_address rem_address   st tx_queue "
+               seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
                           "rx_queue tr tm->when retrnsmt   uid  timeout "
                           "inode ref pointer drops");
        else {
                struct udp_iter_state *state = seq->private;
-               int len;
 
-               udp4_format_sock(v, seq, state->bucket, &len);
-               seq_printf(seq, "%*s\n", 127 - len, "");
+               udp4_format_sock(v, seq, state->bucket);
        }
+       seq_pad(seq, '\n');
        return 0;
 }
 
index 77e38f7..008214a 100644
@@ -595,26 +595,25 @@ static void pn_sock_seq_stop(struct seq_file *seq, void *v)
 
 static int pn_sock_seq_show(struct seq_file *seq, void *v)
 {
-       int len;
-
+       seq_setwidth(seq, 127);
        if (v == SEQ_START_TOKEN)
-               seq_printf(seq, "%s%n", "pt  loc  rem rs st tx_queue rx_queue "
-                       "  uid inode ref pointer drops", &len);
+               seq_puts(seq, "pt  loc  rem rs st tx_queue rx_queue "
+                       "  uid inode ref pointer drops");
        else {
                struct sock *sk = v;
                struct pn_sock *pn = pn_sk(sk);
 
                seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu "
-                       "%d %pK %d%n",
+                       "%d %pK %d",
                        sk->sk_protocol, pn->sobject, pn->dobject,
                        pn->resource, sk->sk_state,
                        sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk),
                        from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
                        sock_i_ino(sk),
                        atomic_read(&sk->sk_refcnt), sk,
-                       atomic_read(&sk->sk_drops), &len);
+                       atomic_read(&sk->sk_drops));
        }
-       seq_printf(seq, "%*s\n", 127 - len, "");
+       seq_pad(seq, '\n');
        return 0;
 }
 
@@ -785,20 +784,19 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v)
 
 static int pn_res_seq_show(struct seq_file *seq, void *v)
 {
-       int len;
-
+       seq_setwidth(seq, 63);
        if (v == SEQ_START_TOKEN)
-               seq_printf(seq, "%s%n", "rs   uid inode", &len);
+               seq_puts(seq, "rs   uid inode");
        else {
                struct sock **psk = v;
                struct sock *sk = *psk;
 
-               seq_printf(seq, "%02X %5u %lu%n",
+               seq_printf(seq, "%02X %5u %lu",
                           (int) (psk - pnres.sk),
                           from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
-                          sock_i_ino(sk), &len);
+                          sock_i_ino(sk));
        }
-       seq_printf(seq, "%*s\n", 63 - len, "");
+       seq_pad(seq, '\n');
        return 0;
 }
 
index 5ea573b..647396b 100644
@@ -79,12 +79,13 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
  */
 static int sctp_objcnt_seq_show(struct seq_file *seq, void *v)
 {
-       int i, len;
+       int i;
 
        i = (int)*(loff_t *)v;
-       seq_printf(seq, "%s: %d%n", sctp_dbg_objcnt[i].label,
-                               atomic_read(sctp_dbg_objcnt[i].counter), &len);
-       seq_printf(seq, "%*s\n", 127 - len, "");
+       seq_setwidth(seq, 127);
+       seq_printf(seq, "%s: %d", sctp_dbg_objcnt[i].label,
+                               atomic_read(sctp_dbg_objcnt[i].counter));
+       seq_pad(seq, '\n');
        return 0;
 }
 
index cfe40ad..2fca916 100644
@@ -64,7 +64,7 @@ static int __init testfunc(void)
 
        /* put values into the fifo */
        for (i = 0; i != 10; i++)
-               kfifo_put(&test, &i);
+               kfifo_put(&test, i);
 
        /* show the number of used elements */
        printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
@@ -85,7 +85,7 @@ static int __init testfunc(void)
        kfifo_skip(&test);
 
        /* put values into the fifo until is full */
-       for (i = 20; kfifo_put(&test, &i); i++)
+       for (i = 20; kfifo_put(&test, i); i++)
                ;
 
        printk(KERN_INFO "queue len: %u\n", kfifo_len(&test));
index 0647379..aa243db 100644
@@ -39,7 +39,7 @@ static int __init example_init(void)
        kfifo_in(&fifo, "test", 4);
 
        for (i = 0; i != 9; i++)
-               kfifo_put(&fifo, &i);
+               kfifo_put(&fifo, i);
 
        /* kick away first byte */
        kfifo_skip(&fifo);
index 6f8e79e..8dc3c2e 100644 (file)
@@ -61,7 +61,7 @@ static int __init testfunc(void)
 
        /* put values into the fifo */
        for (i = 0; i != 10; i++)
-               kfifo_put(&test, &i);
+               kfifo_put(&test, i);
 
        /* show the number of used elements */
        printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
@@ -78,7 +78,7 @@ static int __init testfunc(void)
        kfifo_skip(&test);
 
        /* put values into the fifo until it is full */
-       for (i = 20; kfifo_put(&test, &i); i++)
+       for (i = 20; kfifo_put(&test, i); i++)
                ;
 
        printk(KERN_INFO "queue len: %u\n", kfifo_len(&test));
index 8dcdca2..69f0a14 100644 (file)
@@ -79,9 +79,11 @@ modpost = scripts/mod/modpost                    \
  $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S)      \
  $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w)
 
+MODPOST_OPT=$(subst -i,-n,$(filter -i,$(MAKEFLAGS)))
+
 # We can go over command line length here, so be careful.
 quiet_cmd_modpost = MODPOST $(words $(filter-out vmlinux FORCE, $^)) modules
-      cmd_modpost = $(MODLISTCMD) | sed 's/\.ko$$/.o/' | $(modpost) -s -T -
+      cmd_modpost = $(MODLISTCMD) | sed 's/\.ko$$/.o/' | $(modpost) $(MODPOST_OPT) -s -T -
 
 PHONY += __modpost
 __modpost: $(modules:.ko=.o) FORCE
index bfcea5d..1785576 100644 (file)
@@ -17,6 +17,7 @@
 #include <string.h>
 #include <limits.h>
 #include <stdbool.h>
+#include <errno.h>
 #include "modpost.h"
 #include "../../include/generated/autoconf.h"
 #include "../../include/linux/license.h"
@@ -37,6 +38,8 @@ static int warn_unresolved = 0;
 /* How a symbol is exported */
 static int sec_mismatch_count = 0;
 static int sec_mismatch_verbose = 1;
+/* ignore missing files */
+static int ignore_missing_files;
 
 enum export {
        export_plain,      export_unused,     export_gpl,
@@ -161,7 +164,7 @@ struct symbol {
        unsigned int vmlinux:1;    /* 1 if symbol is defined in vmlinux */
        unsigned int kernel:1;     /* 1 if symbol is from kernel
                                    *  (only for external modules) **/
-       unsigned int preloaded:1;  /* 1 if symbol from Module.symvers */
+       unsigned int preloaded:1;  /* 1 if symbol from Module.symvers, or crc */
        enum export  export;       /* Type of export */
        char name[0];
 };
@@ -329,8 +332,11 @@ static void sym_update_crc(const char *name, struct module *mod,
 {
        struct symbol *s = find_symbol(name);
 
-       if (!s)
+       if (!s) {
                s = new_symbol(name, mod, export);
+               /* Don't complain when we find it later. */
+               s->preloaded = 1;
+       }
        s->crc = crc;
        s->crc_valid = 1;
 }
@@ -407,6 +413,11 @@ static int parse_elf(struct elf_info *info, const char *filename)
 
        hdr = grab_file(filename, &info->size);
        if (!hdr) {
+               if (ignore_missing_files) {
+                       fprintf(stderr, "%s: %s (ignored)\n", filename,
+                               strerror(errno));
+                       return 0;
+               }
                perror(filename);
                exit(1);
        }
@@ -1852,7 +1863,7 @@ static void add_header(struct buffer *b, struct module *mod)
        buf_printf(b, "\n");
        buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
        buf_printf(b, "\n");
-       buf_printf(b, "struct module __this_module\n");
+       buf_printf(b, "__visible struct module __this_module\n");
        buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n");
        buf_printf(b, "\t.name = KBUILD_MODNAME,\n");
        if (mod->has_init)
@@ -2118,7 +2129,7 @@ int main(int argc, char **argv)
        struct ext_sym_list *extsym_iter;
        struct ext_sym_list *extsym_start = NULL;
 
-       while ((opt = getopt(argc, argv, "i:I:e:msST:o:awM:K:")) != -1) {
+       while ((opt = getopt(argc, argv, "i:I:e:mnsST:o:awM:K:")) != -1) {
                switch (opt) {
                case 'i':
                        kernel_read = optarg;
@@ -2138,6 +2149,9 @@ int main(int argc, char **argv)
                case 'm':
                        modversions = 1;
                        break;
+               case 'n':
+                       ignore_missing_files = 1;
+                       break;
                case 'o':
                        dump_write = optarg;
                        break;
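
Together with the MODPOST_OPT assignment in Makefile.modpost above,
this is intended to turn a `make -i' (ignore-errors) build into a
`modpost -n' invocation, so a missing object file is reported to
stderr as "(ignored)" and skipped rather than aborting the whole
modpost run.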
index 9d93f02..5e1c7bc 100644 (file)
@@ -184,11 +184,7 @@ static void snd_malloc_dev_iram(struct snd_dma_buffer *dmab, size_t size)
        /* Assign the pool into private_data field */
        dmab->private_data = pool;
 
-       dmab->area = (void *)gen_pool_alloc(pool, size);
-       if (!dmab->area)
-               return;
-
-       dmab->addr = gen_pool_virt_to_phys(pool, (unsigned long)dmab->area);
+       dmab->area = gen_pool_dma_alloc(pool, size, &dmab->addr);
 }
 
 /**
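
gen_pool_dma_alloc() folds the removed two-step pattern into a single
call: it returns the virtual address (or NULL on failure) and stores
the matching DMA address through its third argument.  A sketch with a
hypothetical wrapper:

    #include <linux/genalloc.h>

    static void *pool_alloc_coherent(struct gen_pool *pool, size_t size,
                                     dma_addr_t *dma)
    {
            /* previously:
             *      va = (void *)gen_pool_alloc(pool, size);
             *      if (va)
             *              *dma = gen_pool_virt_to_phys(pool,
             *                                           (unsigned long)va);
             */
            return gen_pool_dma_alloc(pool, size, dma);
    }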
index 6feee66..57bcd31 100644 (file)
@@ -543,7 +543,7 @@ static int dice_change_rate(struct dice *dice, unsigned int clock_rate)
        __be32 value;
        int err;
 
-       INIT_COMPLETION(dice->clock_accepted);
+       reinit_completion(&dice->clock_accepted);
 
        value = cpu_to_be32(clock_rate | CLOCK_SOURCE_ARX1);
        err = snd_fw_transaction(dice->unit, TCODE_WRITE_QUADLET_REQUEST,
index 2acf987..350ba23 100644 (file)
@@ -74,7 +74,7 @@ static void s3c_ac97_activate(struct snd_ac97 *ac97)
        if (stat == S3C_AC97_GLBSTAT_MAINSTATE_ACTIVE)
                return; /* Return if already active */
 
-       INIT_COMPLETION(s3c_ac97.done);
+       reinit_completion(&s3c_ac97.done);
 
        ac_glbctrl = readl(s3c_ac97.regs + S3C_AC97_GLBCTRL);
        ac_glbctrl = S3C_AC97_GLBCTRL_ACLINKON;
@@ -103,7 +103,7 @@ static unsigned short s3c_ac97_read(struct snd_ac97 *ac97,
 
        s3c_ac97_activate(ac97);
 
-       INIT_COMPLETION(s3c_ac97.done);
+       reinit_completion(&s3c_ac97.done);
 
        ac_codec_cmd = readl(s3c_ac97.regs + S3C_AC97_CODEC_CMD);
        ac_codec_cmd = S3C_AC97_CODEC_CMD_READ | AC_CMD_ADDR(reg);
@@ -140,7 +140,7 @@ static void s3c_ac97_write(struct snd_ac97 *ac97, unsigned short reg,
 
        s3c_ac97_activate(ac97);
 
-       INIT_COMPLETION(s3c_ac97.done);
+       reinit_completion(&s3c_ac97.done);
 
        ac_codec_cmd = readl(s3c_ac97.regs + S3C_AC97_CODEC_CMD);
        ac_codec_cmd = AC_CMD_ADDR(reg) | AC_CMD_DATA(val);
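
Both sound drivers pick up the tree-wide rename of INIT_COMPLETION()
to reinit_completion(), which resets only ->done before a completion
is reused.  A minimal sketch with a hypothetical wait_for_hw():

    #include <linux/completion.h>
    #include <linux/errno.h>
    #include <linux/jiffies.h>

    static DECLARE_COMPLETION(done);

    static int wait_for_hw(void)
    {
            reinit_completion(&done);  /* reset ->done; keep the waitqueue */
            /* ... kick the hardware, which completes &done from its IRQ ... */
            if (!wait_for_completion_timeout(&done, msecs_to_jiffies(100)))
                    return -ETIMEDOUT;
            return 0;
    }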
index da7a195..bdb71a2 100644 (file)
@@ -41,13 +41,14 @@ struct vdev_info {
        struct vhost_memory *mem;
 };
 
-void vq_notify(struct virtqueue *vq)
+bool vq_notify(struct virtqueue *vq)
 {
        struct vq_info *info = vq->priv;
        unsigned long long v = 1;
        int r;
        r = write(info->kick, &v, sizeof v);
        assert(r == sizeof v);
+       return true;
 }
 
 void vq_callback(struct virtqueue *vq)
@@ -171,7 +172,8 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq,
                                                         GFP_ATOMIC);
                                if (likely(r == 0)) {
                                        ++started;
-                                       virtqueue_kick(vq->vq);
+                                       if (unlikely(!virtqueue_kick(vq->vq)))
+                                               r = -1;
                                }
                        } else
                                r = -1;
index d053ea4..14a4f4c 100644 (file)
@@ -22,7 +22,7 @@ static u64 user_addr_offset;
 #define RINGSIZE 256
 #define ALIGN 4096
 
-static void never_notify_host(struct virtqueue *vq)
+static bool never_notify_host(struct virtqueue *vq)
 {
        abort();
 }
@@ -65,17 +65,22 @@ struct guest_virtio_device {
        unsigned long notifies;
 };
 
-static void parallel_notify_host(struct virtqueue *vq)
+static bool parallel_notify_host(struct virtqueue *vq)
 {
+       int rc;
        struct guest_virtio_device *gvdev;
 
        gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev);
-       write(gvdev->to_host_fd, "", 1);
+       rc = write(gvdev->to_host_fd, "", 1);
+       if (rc < 0)
+               return false;
        gvdev->notifies++;
+       return true;
 }
 
-static void no_notify_host(struct virtqueue *vq)
+static bool no_notify_host(struct virtqueue *vq)
 {
+       return true;
 }
 
 #define NUM_XFERS (10000000)
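
Both test harnesses follow the virtio core change that made the
notify hook return bool, so a failed kick can propagate back through
virtqueue_kick().  A sketch of the new contract, where
write_to_backend() stands in for a hypothetical transport write:

    static bool my_notify(struct virtqueue *vq)
    {
            if (write_to_backend(vq) < 0)  /* hypothetical transport I/O */
                    return false;          /* virtqueue_kick() returns false */
            return true;                   /* kick delivered */
    }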
index 779262f..fbe1a48 100644 (file)
@@ -27,3 +27,6 @@ config HAVE_KVM_MSI
 
 config HAVE_KVM_CPU_RELAX_INTERCEPT
        bool
+
+config KVM_VFIO
+       bool
index 8a39dda..8631d9c 100644 (file)
@@ -56,7 +56,6 @@ void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
 
 static void async_pf_execute(struct work_struct *work)
 {
-       struct page *page = NULL;
        struct kvm_async_pf *apf =
                container_of(work, struct kvm_async_pf, work);
        struct mm_struct *mm = apf->mm;
@@ -68,14 +67,12 @@ static void async_pf_execute(struct work_struct *work)
 
        use_mm(mm);
        down_read(&mm->mmap_sem);
-       get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
+       get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL);
        up_read(&mm->mmap_sem);
        unuse_mm(mm);
 
        spin_lock(&vcpu->async_pf.lock);
        list_add_tail(&apf->link, &vcpu->async_pf.done);
-       apf->page = page;
-       apf->done = true;
        spin_unlock(&vcpu->async_pf.lock);
 
        /*
@@ -83,7 +80,7 @@ static void async_pf_execute(struct work_struct *work)
         * this point
         */
 
-       trace_kvm_async_pf_completed(addr, page, gva);
+       trace_kvm_async_pf_completed(addr, gva);
 
        if (waitqueue_active(&vcpu->wq))
                wake_up_interruptible(&vcpu->wq);
@@ -99,9 +96,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
                struct kvm_async_pf *work =
                        list_entry(vcpu->async_pf.queue.next,
                                   typeof(*work), queue);
-               cancel_work_sync(&work->work);
                list_del(&work->queue);
-               if (!work->done) { /* work was canceled */
+               if (cancel_work_sync(&work->work)) {
                        mmdrop(work->mm);
                        kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
                        kmem_cache_free(async_pf_cache, work);
@@ -114,8 +110,6 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
                        list_entry(vcpu->async_pf.done.next,
                                   typeof(*work), link);
                list_del(&work->link);
-               if (!is_error_page(work->page))
-                       kvm_release_page_clean(work->page);
                kmem_cache_free(async_pf_cache, work);
        }
        spin_unlock(&vcpu->async_pf.lock);
@@ -135,14 +129,11 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
                list_del(&work->link);
                spin_unlock(&vcpu->async_pf.lock);
 
-               if (work->page)
-                       kvm_arch_async_page_ready(vcpu, work);
+               kvm_arch_async_page_ready(vcpu, work);
                kvm_arch_async_page_present(vcpu, work);
 
                list_del(&work->queue);
                vcpu->async_pf.queued--;
-               if (!is_error_page(work->page))
-                       kvm_release_page_clean(work->page);
                kmem_cache_free(async_pf_cache, work);
        }
 }
@@ -165,8 +156,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
        if (!work)
                return 0;
 
-       work->page = NULL;
-       work->done = false;
+       work->wakeup_all = false;
        work->vcpu = vcpu;
        work->gva = gva;
        work->addr = gfn_to_hva(vcpu->kvm, gfn);
@@ -206,7 +196,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
        if (!work)
                return -ENOMEM;
 
-       work->page = KVM_ERR_PTR_BAD_PAGE;
+       work->wakeup_all = true;
        INIT_LIST_HEAD(&work->queue); /* for list_del to work */
 
        spin_lock(&vcpu->async_pf.lock);
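
The rework above relies on cancel_work_sync() returning true only
when it cancelled work that had not yet run, which is exactly what
the old work->done flag used to track.  Schematically, with a
hypothetical free_unstarted_work() teardown:

    list_del(&work->queue);
    if (cancel_work_sync(&work->work))  /* true: callback never executed */
            free_unstarted_work(work);  /* drop refs taken at submit time */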
index 72a130b..0df7d4b 100644 (file)
@@ -79,7 +79,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
        flags = IOMMU_READ;
        if (!(slot->flags & KVM_MEM_READONLY))
                flags |= IOMMU_WRITE;
-       if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
+       if (!kvm->arch.iommu_noncoherent)
                flags |= IOMMU_CACHE;
 
 
@@ -103,6 +103,10 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
                while ((gfn << PAGE_SHIFT) & (page_size - 1))
                        page_size >>= 1;
 
+               /* Make sure hva is aligned to the page size we want to map */
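+               /* (e.g. a 2M-aligned gfn whose hva is only 4K-aligned
+                * must fall back to 4K mappings) */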
+               while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
+                       page_size >>= 1;
+
                /*
                 * Pin all pages we are about to map in memory. This is
                 * important because we unmap and unpin in 4kb steps later.
@@ -140,6 +144,9 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
 
+       if (kvm->arch.iommu_noncoherent)
+               kvm_arch_register_noncoherent_dma(kvm);
+
        idx = srcu_read_lock(&kvm->srcu);
        slots = kvm_memslots(kvm);
 
@@ -158,7 +165,8 @@ int kvm_assign_device(struct kvm *kvm,
 {
        struct pci_dev *pdev = NULL;
        struct iommu_domain *domain = kvm->arch.iommu_domain;
-       int r, last_flags;
+       int r;
+       bool noncoherent;
 
        /* check if iommu exists and in use */
        if (!domain)
@@ -174,15 +182,13 @@ int kvm_assign_device(struct kvm *kvm,
                return r;
        }
 
-       last_flags = kvm->arch.iommu_flags;
-       if (iommu_domain_has_cap(kvm->arch.iommu_domain,
-                                IOMMU_CAP_CACHE_COHERENCY))
-               kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+       noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain,
+                                           IOMMU_CAP_CACHE_COHERENCY);
 
        /* Check if need to update IOMMU page table for guest memory */
-       if ((last_flags ^ kvm->arch.iommu_flags) ==
-                       KVM_IOMMU_CACHE_COHERENCY) {
+       if (noncoherent != kvm->arch.iommu_noncoherent) {
                kvm_iommu_unmap_memslots(kvm);
+               kvm->arch.iommu_noncoherent = noncoherent;
                r = kvm_iommu_map_memslots(kvm);
                if (r)
                        goto out_unmap;
@@ -190,11 +196,7 @@ int kvm_assign_device(struct kvm *kvm,
 
        pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
 
-       printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
-               assigned_dev->host_segnr,
-               assigned_dev->host_busnr,
-               PCI_SLOT(assigned_dev->host_devfn),
-               PCI_FUNC(assigned_dev->host_devfn));
+       dev_info(&pdev->dev, "kvm assign device\n");
 
        return 0;
 out_unmap:
@@ -220,11 +222,7 @@ int kvm_deassign_device(struct kvm *kvm,
 
        pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
 
-       printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
-               assigned_dev->host_segnr,
-               assigned_dev->host_busnr,
-               PCI_SLOT(assigned_dev->host_devfn),
-               PCI_FUNC(assigned_dev->host_devfn));
+       dev_info(&pdev->dev, "kvm deassign device\n");
 
        return 0;
 }
@@ -336,6 +334,9 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 
        srcu_read_unlock(&kvm->srcu, idx);
 
+       if (kvm->arch.iommu_noncoherent)
+               kvm_arch_unregister_noncoherent_dma(kvm);
+
        return 0;
 }
 
@@ -350,6 +351,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)
        mutex_lock(&kvm->slots_lock);
        kvm_iommu_unmap_memslots(kvm);
        kvm->arch.iommu_domain = NULL;
+       kvm->arch.iommu_noncoherent = false;
        mutex_unlock(&kvm->slots_lock);
 
        iommu_domain_free(domain);
index 1cf9ccb..662f34c 100644 (file)
@@ -70,7 +70,8 @@ MODULE_LICENSE("GPL");
  *             kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_RAW_SPINLOCK(kvm_lock);
+DEFINE_SPINLOCK(kvm_lock);
+static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
 static cpumask_var_t cpus_hardware_enabled;
@@ -186,6 +187,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
                ++kvm->stat.remote_tlb_flush;
        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
+EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
@@ -490,9 +492,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
        if (r)
                goto out_err;
 
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
 
        return kvm;
 
@@ -540,13 +542,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 /*
  * Free any memory in @free but not in @dont.
  */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free,
                                  struct kvm_memory_slot *dont)
 {
        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                kvm_destroy_dirty_bitmap(free);
 
-       kvm_arch_free_memslot(free, dont);
+       kvm_arch_free_memslot(kvm, free, dont);
 
        free->npages = 0;
 }
@@ -557,7 +559,7 @@ void kvm_free_physmem(struct kvm *kvm)
        struct kvm_memory_slot *memslot;
 
        kvm_for_each_memslot(memslot, slots)
-               kvm_free_physmem_slot(memslot, NULL);
+               kvm_free_physmem_slot(kvm, memslot, NULL);
 
        kfree(kvm->memslots);
 }
@@ -581,9 +583,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
        struct mm_struct *mm = kvm->mm;
 
        kvm_arch_sync_events(kvm);
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_del(&kvm->vm_list);
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++)
                kvm_io_bus_destroy(kvm->buses[i]);
@@ -821,7 +823,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        if (change == KVM_MR_CREATE) {
                new.userspace_addr = mem->userspace_addr;
 
-               if (kvm_arch_create_memslot(&new, npages))
+               if (kvm_arch_create_memslot(kvm, &new, npages))
                        goto out_free;
        }
 
@@ -872,6 +874,19 @@ int __kvm_set_memory_region(struct kvm *kvm,
                        goto out_free;
        }
 
+       /* actual memory is freed via old in kvm_free_physmem_slot below */
+       if (change == KVM_MR_DELETE) {
+               new.dirty_bitmap = NULL;
+               memset(&new.arch, 0, sizeof(new.arch));
+       }
+
+       old_memslots = install_new_memslots(kvm, slots, &new);
+
+       kvm_arch_commit_memory_region(kvm, mem, &old, change);
+
+       kvm_free_physmem_slot(kvm, &old, &new);
+       kfree(old_memslots);
+
        /*
         * IOMMU mapping:  New slots need to be mapped.  Old slots need to be
         * un-mapped and re-mapped if their base changes.  Since base change
@@ -883,29 +898,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
         */
        if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
                r = kvm_iommu_map_pages(kvm, &new);
-               if (r)
-                       goto out_slots;
-       }
-
-       /* actual memory is freed via old in kvm_free_physmem_slot below */
-       if (change == KVM_MR_DELETE) {
-               new.dirty_bitmap = NULL;
-               memset(&new.arch, 0, sizeof(new.arch));
+               return r;
        }
 
-       old_memslots = install_new_memslots(kvm, slots, &new);
-
-       kvm_arch_commit_memory_region(kvm, mem, &old, change);
-
-       kvm_free_physmem_slot(&old, &new);
-       kfree(old_memslots);
-
        return 0;
 
 out_slots:
        kfree(slots);
 out_free:
-       kvm_free_physmem_slot(&new, &old);
+       kvm_free_physmem_slot(kvm, &new, &old);
 out:
        return r;
 }
@@ -964,6 +965,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
 out:
        return r;
 }
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 
 bool kvm_largepages_enabled(void)
 {
@@ -1654,6 +1656,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
        memslot = gfn_to_memslot(kvm, gfn);
        mark_page_dirty_in_slot(kvm, memslot, gfn);
 }
+EXPORT_SYMBOL_GPL(mark_page_dirty);
 
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
@@ -1679,6 +1682,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 
        finish_wait(&vcpu->wq, &wait);
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_block);
 
 #ifndef CONFIG_S390
 /*
@@ -2270,6 +2274,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        case KVM_DEV_TYPE_XICS:
                ops = &kvm_xics_ops;
                break;
+#endif
+#ifdef CONFIG_KVM_VFIO
+       case KVM_DEV_TYPE_VFIO:
+               ops = &kvm_vfio_ops;
+               break;
 #endif
        default:
                return -ENODEV;
@@ -2519,44 +2528,12 @@ out:
 }
 #endif
 
-static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       struct page *page[1];
-       unsigned long addr;
-       int npages;
-       gfn_t gfn = vmf->pgoff;
-       struct kvm *kvm = vma->vm_file->private_data;
-
-       addr = gfn_to_hva(kvm, gfn);
-       if (kvm_is_error_hva(addr))
-               return VM_FAULT_SIGBUS;
-
-       npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
-                               NULL);
-       if (unlikely(npages != 1))
-               return VM_FAULT_SIGBUS;
-
-       vmf->page = page[0];
-       return 0;
-}
-
-static const struct vm_operations_struct kvm_vm_vm_ops = {
-       .fault = kvm_vm_fault,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
-       vma->vm_ops = &kvm_vm_vm_ops;
-       return 0;
-}
-
 static struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
-       .mmap           = kvm_vm_mmap,
        .llseek         = noop_llseek,
 };
 
@@ -2683,11 +2660,12 @@ static void hardware_enable_nolock(void *junk)
        }
 }
 
-static void hardware_enable(void *junk)
+static void hardware_enable(void)
 {
-       raw_spin_lock(&kvm_lock);
-       hardware_enable_nolock(junk);
-       raw_spin_unlock(&kvm_lock);
+       raw_spin_lock(&kvm_count_lock);
+       if (kvm_usage_count)
+               hardware_enable_nolock(NULL);
+       raw_spin_unlock(&kvm_count_lock);
 }
 
 static void hardware_disable_nolock(void *junk)
@@ -2700,11 +2678,12 @@ static void hardware_disable_nolock(void *junk)
        kvm_arch_hardware_disable(NULL);
 }
 
-static void hardware_disable(void *junk)
+static void hardware_disable(void)
 {
-       raw_spin_lock(&kvm_lock);
-       hardware_disable_nolock(junk);
-       raw_spin_unlock(&kvm_lock);
+       raw_spin_lock(&kvm_count_lock);
+       if (kvm_usage_count)
+               hardware_disable_nolock(NULL);
+       raw_spin_unlock(&kvm_count_lock);
 }
 
 static void hardware_disable_all_nolock(void)
@@ -2718,16 +2697,16 @@ static void hardware_disable_all_nolock(void)
 
 static void hardware_disable_all(void)
 {
-       raw_spin_lock(&kvm_lock);
+       raw_spin_lock(&kvm_count_lock);
        hardware_disable_all_nolock();
-       raw_spin_unlock(&kvm_lock);
+       raw_spin_unlock(&kvm_count_lock);
 }
 
 static int hardware_enable_all(void)
 {
        int r = 0;
 
-       raw_spin_lock(&kvm_lock);
+       raw_spin_lock(&kvm_count_lock);
 
        kvm_usage_count++;
        if (kvm_usage_count == 1) {
@@ -2740,7 +2719,7 @@ static int hardware_enable_all(void)
                }
        }
 
-       raw_spin_unlock(&kvm_lock);
+       raw_spin_unlock(&kvm_count_lock);
 
        return r;
 }
@@ -2750,20 +2729,17 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 {
        int cpu = (long)v;
 
-       if (!kvm_usage_count)
-               return NOTIFY_OK;
-
        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
-               hardware_disable(NULL);
+               hardware_disable();
                break;
        case CPU_STARTING:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
-               hardware_enable(NULL);
+               hardware_enable();
                break;
        }
        return NOTIFY_OK;
@@ -3056,10 +3032,10 @@ static int vm_stat_get(void *_offset, u64 *val)
        struct kvm *kvm;
 
        *val = 0;
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                *val += *(u32 *)((void *)kvm + offset);
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
        return 0;
 }
 
@@ -3073,12 +3049,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
        int i;
 
        *val = 0;
-       raw_spin_lock(&kvm_lock);
+       spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                kvm_for_each_vcpu(i, vcpu, kvm)
                        *val += *(u32 *)((void *)vcpu + offset);
 
-       raw_spin_unlock(&kvm_lock);
+       spin_unlock(&kvm_lock);
        return 0;
 }
 
@@ -3133,7 +3109,7 @@ static int kvm_suspend(void)
 static void kvm_resume(void)
 {
        if (kvm_usage_count) {
-               WARN_ON(raw_spin_is_locked(&kvm_lock));
+               WARN_ON(raw_spin_is_locked(&kvm_count_lock));
                hardware_enable_nolock(NULL);
        }
 }
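
Taken together, these kvm_main.c hunks split the old raw kvm_lock in
two; a sketch of the resulting rules, inferred from the hunks above:

    /*
     * kvm_lock is now an ordinary spinlock guarding vm_list; the new
     * raw kvm_count_lock guards kvm_usage_count and hardware
     * enable/disable, and stays raw because it is taken from CPU
     * hotplug callbacks.
     */
    spin_lock(&kvm_lock);
    list_for_each_entry(kvm, &vm_list, vm_list)
            visit(kvm);                 /* hypothetical per-VM work */
    spin_unlock(&kvm_lock);

    raw_spin_lock(&kvm_count_lock);
    if (kvm_usage_count)
            hardware_enable_nolock(NULL);
    raw_spin_unlock(&kvm_count_lock);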
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
new file mode 100644 (file)
index 0000000..ca4260e
--- /dev/null
+++ b/virt/kvm/vfio.c
@@ -0,0 +1,264 @@
+/*
+ * VFIO-KVM bridge pseudo device
+ *
+ * Copyright (C) 2013 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+struct kvm_vfio_group {
+       struct list_head node;
+       struct vfio_group *vfio_group;
+};
+
+struct kvm_vfio {
+       struct list_head group_list;
+       struct mutex lock;
+       bool noncoherent;
+};
+
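+/*
+ * Resolve the vfio hooks at runtime with symbol_get() so that kvm
+ * does not take a hard module dependency on vfio; if vfio is not
+ * loaded, the lookup fails and we return an error instead.
+ */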
+static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep)
+{
+       struct vfio_group *vfio_group;
+       struct vfio_group *(*fn)(struct file *);
+
+       fn = symbol_get(vfio_group_get_external_user);
+       if (!fn)
+               return ERR_PTR(-EINVAL);
+
+       vfio_group = fn(filep);
+
+       symbol_put(vfio_group_get_external_user);
+
+       return vfio_group;
+}
+
+static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
+{
+       void (*fn)(struct vfio_group *);
+
+       fn = symbol_get(vfio_group_put_external_user);
+       if (!fn)
+               return;
+
+       fn(vfio_group);
+
+       symbol_put(vfio_group_put_external_user);
+}
+
+/*
+ * Groups can use the same or different IOMMU domains.  If the same then
+ * adding a new group may change the coherency of groups we've previously
+ * been told about.  We don't want to care about any of that so we retest
+ * each group and bail as soon as we find one that's noncoherent.  This
+ * means we only ever [un]register_noncoherent_dma once for the whole device.
+ */
+static void kvm_vfio_update_coherency(struct kvm_device *dev)
+{
+       struct kvm_vfio *kv = dev->private;
+       bool noncoherent = false;
+       struct kvm_vfio_group *kvg;
+
+       mutex_lock(&kv->lock);
+
+       list_for_each_entry(kvg, &kv->group_list, node) {
+               /*
+                * TODO: We need an interface to check the coherency of
+                * the IOMMU domain this group is using.  For now, assume
+                * it's always noncoherent.
+                */
+               noncoherent = true;
+               break;
+       }
+
+       if (noncoherent != kv->noncoherent) {
+               kv->noncoherent = noncoherent;
+
+               if (kv->noncoherent)
+                       kvm_arch_register_noncoherent_dma(dev->kvm);
+               else
+                       kvm_arch_unregister_noncoherent_dma(dev->kvm);
+       }
+
+       mutex_unlock(&kv->lock);
+}
+
+static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
+{
+       struct kvm_vfio *kv = dev->private;
+       struct vfio_group *vfio_group;
+       struct kvm_vfio_group *kvg;
+       void __user *argp = (void __user *)arg;
+       struct fd f;
+       int32_t fd;
+       int ret;
+
+       switch (attr) {
+       case KVM_DEV_VFIO_GROUP_ADD:
+               if (get_user(fd, (int32_t __user *)argp))
+                       return -EFAULT;
+
+               f = fdget(fd);
+               if (!f.file)
+                       return -EBADF;
+
+               vfio_group = kvm_vfio_group_get_external_user(f.file);
+               fdput(f);
+
+               if (IS_ERR(vfio_group))
+                       return PTR_ERR(vfio_group);
+
+               mutex_lock(&kv->lock);
+
+               list_for_each_entry(kvg, &kv->group_list, node) {
+                       if (kvg->vfio_group == vfio_group) {
+                               mutex_unlock(&kv->lock);
+                               kvm_vfio_group_put_external_user(vfio_group);
+                               return -EEXIST;
+                       }
+               }
+
+               kvg = kzalloc(sizeof(*kvg), GFP_KERNEL);
+               if (!kvg) {
+                       mutex_unlock(&kv->lock);
+                       kvm_vfio_group_put_external_user(vfio_group);
+                       return -ENOMEM;
+               }
+
+               list_add_tail(&kvg->node, &kv->group_list);
+               kvg->vfio_group = vfio_group;
+
+               mutex_unlock(&kv->lock);
+
+               kvm_vfio_update_coherency(dev);
+
+               return 0;
+
+       case KVM_DEV_VFIO_GROUP_DEL:
+               if (get_user(fd, (int32_t __user *)argp))
+                       return -EFAULT;
+
+               f = fdget(fd);
+               if (!f.file)
+                       return -EBADF;
+
+               vfio_group = kvm_vfio_group_get_external_user(f.file);
+               fdput(f);
+
+               if (IS_ERR(vfio_group))
+                       return PTR_ERR(vfio_group);
+
+               ret = -ENOENT;
+
+               mutex_lock(&kv->lock);
+
+               list_for_each_entry(kvg, &kv->group_list, node) {
+                       if (kvg->vfio_group != vfio_group)
+                               continue;
+
+                       list_del(&kvg->node);
+                       kvm_vfio_group_put_external_user(kvg->vfio_group);
+                       kfree(kvg);
+                       ret = 0;
+                       break;
+               }
+
+               mutex_unlock(&kv->lock);
+
+               kvm_vfio_group_put_external_user(vfio_group);
+
+               kvm_vfio_update_coherency(dev);
+
+               return ret;
+       }
+
+       return -ENXIO;
+}
+
+static int kvm_vfio_set_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_VFIO_GROUP:
+               return kvm_vfio_set_group(dev, attr->attr, attr->addr);
+       }
+
+       return -ENXIO;
+}
+
+static int kvm_vfio_has_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_VFIO_GROUP:
+               switch (attr->attr) {
+               case KVM_DEV_VFIO_GROUP_ADD:
+               case KVM_DEV_VFIO_GROUP_DEL:
+                       return 0;
+               }
+
+               break;
+       }
+
+       return -ENXIO;
+}
+
+static void kvm_vfio_destroy(struct kvm_device *dev)
+{
+       struct kvm_vfio *kv = dev->private;
+       struct kvm_vfio_group *kvg, *tmp;
+
+       list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+               kvm_vfio_group_put_external_user(kvg->vfio_group);
+               list_del(&kvg->node);
+               kfree(kvg);
+       }
+
+       kvm_vfio_update_coherency(dev);
+
+       kfree(kv);
+       kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */
+}
+
+static int kvm_vfio_create(struct kvm_device *dev, u32 type)
+{
+       struct kvm_device *tmp;
+       struct kvm_vfio *kv;
+
+       /* Only one VFIO "device" per VM */
+       list_for_each_entry(tmp, &dev->kvm->devices, vm_node)
+               if (tmp->ops == &kvm_vfio_ops)
+                       return -EBUSY;
+
+       kv = kzalloc(sizeof(*kv), GFP_KERNEL);
+       if (!kv)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&kv->group_list);
+       mutex_init(&kv->lock);
+
+       dev->private = kv;
+
+       return 0;
+}
+
+struct kvm_device_ops kvm_vfio_ops = {
+       .name = "kvm-vfio",
+       .create = kvm_vfio_create,
+       .destroy = kvm_vfio_destroy,
+       .set_attr = kvm_vfio_set_attr,
+       .has_attr = kvm_vfio_has_attr,
+};
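
For completeness, a hedged userspace sketch (not part of this merge)
of how the new device is driven: create the kvm-vfio device on the VM
fd, then hand it a VFIO group fd via KVM_SET_DEVICE_ATTR.  vm_fd and
group_fd are assumed to be already open; error handling is minimal.

    #include <linux/kvm.h>
    #include <stdint.h>
    #include <sys/ioctl.h>

    static int kvm_vfio_add_group(int vm_fd, int32_t group_fd)
    {
            struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_VFIO_GROUP,
                    .attr  = KVM_DEV_VFIO_GROUP_ADD,
                    .addr  = (uintptr_t)&group_fd,  /* pointer to the fd */
            };

            if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                    return -1;                      /* cd.fd is set on success */
            return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
    }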